diff --git a/.circleci/config.yml b/.circleci/config.yml index cdb800aa222..bba65c8aa20 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,7 +1,7 @@ version: 2.1 orbs: - win: circleci/windows@2.4.0 + win: circleci/windows@5.0.0 commands: install-cmake-on-macos: @@ -45,6 +45,38 @@ commands: echo "export LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4" >> $BASH_ENV echo "export ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd" >> $BASH_ENV + windows-build-steps: + steps: + - checkout + - run: + name: "Install thirdparty dependencies" + command: | + echo "Installing CMake..." + choco install cmake --installargs 'ADD_CMAKE_TO_PATH=System' -y + mkdir $Env:THIRDPARTY_HOME + cd $Env:THIRDPARTY_HOME + echo "Building Snappy dependency..." + curl https://github.com/google/snappy/archive/refs/tags/1.1.8.zip -O snappy-1.1.8.zip + unzip -q snappy-1.1.8.zip + cd snappy-1.1.8 + mkdir build + cd build + & $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" .. + msbuild.exe Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + - run: + name: "Build RocksDB" + command: | + mkdir build + cd build + & $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 .. + cd .. + echo "Building with VS version: $Env:CMAKE_GENERATOR" + msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 + - run: + name: "Test RocksDB" + shell: powershell.exe + command: | + build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 pre-steps-macos: steps: - pre-steps @@ -154,11 +186,6 @@ commands: true executors: - windows-2xlarge: - machine: - image: 'windows-server-2019-vs2019:stable' - resource_class: windows.2xlarge - shell: bash.exe linux-docker: docker: # The image configuration is build_tools/ubuntu20_image/Dockerfile @@ -518,73 +545,35 @@ jobs: no_output_timeout: 100m - post-steps - build-windows: - executor: windows-2xlarge - parameters: - extra_cmake_opt: - default: "" - type: string - vs_year: - default: "2019" - type: string - cmake_generator: - default: "Visual Studio 16 2019" - type: string + build-windows-vs2022: + executor: + name: win/server-2022 + size: 2xlarge + environment: + THIRDPARTY_HOME: C:/Users/circleci/thirdparty + CMAKE_HOME: C:/Program Files/CMake + CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe + SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8 + SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build + SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib + CMAKE_GENERATOR: Visual Studio 17 2022 + steps: + - windows-build-steps + + build-windows-vs2019: + executor: + name: win/server-2019 + size: 2xlarge environment: THIRDPARTY_HOME: C:/Users/circleci/thirdparty - CMAKE_HOME: C:/Users/circleci/thirdparty/cmake-3.16.4-win64-x64 - CMAKE_BIN: C:/Users/circleci/thirdparty/cmake-3.16.4-win64-x64/bin/cmake.exe - SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.7 - SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.7;C:/Users/circleci/thirdparty/snappy-1.1.7/build - SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.7/build/Debug/snappy.lib - VS_YEAR: <> - CMAKE_GENERATOR: <> + CMAKE_HOME: C:/Program Files/CMake + CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe + 
SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8 + SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build + SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib + CMAKE_GENERATOR: Visual Studio 16 2019 steps: - - checkout - - run: - name: "Setup VS" - command: | - if [[ "${VS_YEAR}" == "2019" ]]; then - echo "VS2019 already present." - elif [[ "${VS_YEAR}" == "2017" ]]; then - echo "Installing VS2017..." - powershell .circleci/vs2017_install.ps1 - elif [[ "${VS_YEAR}" == "2015" ]]; then - echo "Installing VS2015..." - powershell .circleci/vs2015_install.ps1 - fi - - store_artifacts: - path: \Users\circleci\AppData\Local\Temp\vslogs.zip - - run: - name: "Install thirdparty dependencies" - command: | - mkdir ${THIRDPARTY_HOME} - cd ${THIRDPARTY_HOME} - echo "Installing CMake..." - curl --fail --silent --show-error --output cmake-3.16.4-win64-x64.zip --location https://github.com/Kitware/CMake/releases/download/v3.16.4/cmake-3.16.4-win64-x64.zip - unzip -q cmake-3.16.4-win64-x64.zip - echo "Building Snappy dependency..." - curl --fail --silent --show-error --output snappy-1.1.7.zip --location https://github.com/google/snappy/archive/1.1.7.zip - unzip -q snappy-1.1.7.zip - cd snappy-1.1.7 - mkdir build - cd build - ${CMAKE_BIN} -G "${CMAKE_GENERATOR}" .. - msbuild.exe Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 - - run: - name: "Build RocksDB" - command: | - mkdir build - cd build - ${CMAKE_BIN} -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 << parameters.extra_cmake_opt >> .. - cd .. - echo "Building with VS version: ${CMAKE_GENERATOR}" - msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 - - run: - name: "Test RocksDB" - shell: powershell.exe - command: | - build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 + - windows-build-steps build-linux-java: executor: linux-docker @@ -859,15 +848,8 @@ workflows: - build-linux-mini-crashtest jobs-windows: jobs: - - build-windows: - name: "build-windows-vs2019" - - build-windows: - name: "build-windows-vs2019-cxx20" - extra_cmake_opt: -DCMAKE_CXX_STANDARD=20 - - build-windows: - name: "build-windows-vs2017" - vs_year: "2017" - cmake_generator: "Visual Studio 15 Win64" + - build-windows-vs2022 + - build-windows-vs2019 - build-cmake-mingw jobs-java: jobs: diff --git a/CMakeLists.txt b/CMakeLists.txt index 6743c285b3b..228c7677e4a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,11 +253,21 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") endif(HAS_S390X_MARCH_NATIVE) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64") + CHECK_C_COMPILER_FLAG("-march=loongarch64" HAS_LOONGARCH64) + if(HAS_LOONGARCH64) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=loongarch64 -mtune=loongarch64") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=loongarch64 -mtune=loongarch64") + endif(HAS_LOONGARCH64) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64") + option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF) option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF) if(PORTABLE) + 
add_definitions(-DROCKSDB_PORTABLE) + # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h # is available, it is available by default. if(FORCE_SSE42 AND NOT MSVC) @@ -281,6 +291,9 @@ if(PORTABLE) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") endif() + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=loongarch64") + endif() endif() else() if(MSVC) @@ -300,8 +313,7 @@ if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() -if (NOT PORTABLE OR FORCE_SSE42) - CHECK_CXX_SOURCE_COMPILES(" +CHECK_CXX_SOURCE_COMPILES(" #include #include #include @@ -313,12 +325,11 @@ int main() { auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) - if(HAVE_SSE42) - add_definitions(-DHAVE_SSE42) - add_definitions(-DHAVE_PCLMUL) - elseif(FORCE_SSE42) - message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") - endif() +if(HAVE_SSE42) + add_definitions(-DHAVE_SSE42) + add_definitions(-DHAVE_PCLMUL) +elseif(FORCE_SSE42) + message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") endif() # Check if -latomic is required or not @@ -657,12 +668,13 @@ set(SOURCES cache/cache.cc cache/cache_entry_roles.cc cache/cache_key.cc + cache/cache_helpers.cc cache/cache_reservation_manager.cc cache/charged_cache.cc cache/clock_cache.cc cache/compressed_secondary_cache.cc - cache/fast_lru_cache.cc cache/lru_cache.cc + cache/secondary_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc db/blob/blob_contents.cc @@ -804,6 +816,7 @@ set(SOURCES options/options.cc options/options_helper.cc options/options_parser.cc + port/mmap.cc port/stack_trace.cc table/adaptive/adaptive_table_factory.cc table/block_based/binary_search_index_reader.cc @@ -813,6 +826,7 @@ set(SOURCES table/block_based/block_based_table_iterator.cc table/block_based/block_based_table_reader.cc table/block_based/block_builder.cc + table/block_based/block_cache.cc table/block_based/block_prefetcher.cc table/block_based/block_prefix_index.cc table/block_based/data_block_hash_index.cc @@ -974,7 +988,6 @@ set(SOURCES cloud/cloud_manifest.cc cloud/cloud_scheduler.cc cloud/cloud_storage_provider.cc - cloud/cloud_file_cache.cc cloud/cloud_file_deletion_scheduler.cc db/db_impl/db_impl_remote_compaction.cc db/db_impl/replication_codec.cc) @@ -1004,6 +1017,12 @@ if ( ROCKSDB_PLUGINS ) plugin/${plugin}/${src} PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}") endforeach() + foreach (test ${${plugin}_TESTS}) + list(APPEND PLUGIN_TESTS plugin/${plugin}/${test}) + set_source_files_properties( + plugin/${plugin}/${test} + PROPERTIES COMPILE_FLAGS "${${plugin}_COMPILE_FLAGS}") + endforeach() foreach (path ${${plugin}_INCLUDE_PATHS}) include_directories(${path}) endforeach() @@ -1501,6 +1520,7 @@ if(WITH_TESTS) utilities/ttl/ttl_test.cc utilities/util_merge_operators_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc + ${PLUGIN_TESTS} ) endif() @@ -1604,9 +1624,9 @@ endif() option(WITH_TRACE_TOOLS "build with trace tools" ON) if(WITH_TRACE_TOOLS) - add_executable(block_cache_trace_analyzer_tool${ARTIFACT_SUFFIX} + add_executable(block_cache_trace_analyzer${ARTIFACT_SUFFIX} tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc) - target_link_libraries(block_cache_trace_analyzer_tool${ARTIFACT_SUFFIX} + target_link_libraries(block_cache_trace_analyzer${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS}) add_executable(trace_analyzer${ARTIFACT_SUFFIX} diff --git 
a/HISTORY.md b/HISTORY.md
index df33b7b39b2..237c25d17ef 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,20 +1,67 @@
 # Rocksdb Change Log
-## 7.8.3 (11/29/2022)
-* Revert an internal change in 7.8.0 associated with some memory usage churn.
+## 7.10.2 (02/10/2023)
+### Bug Fixes
+* Fixed a bug in DB open/recovery from a compressed WAL, caused by incorrect handling of certain record fragments with the same offset within a WAL block.
+
+## 7.10.1 (02/01/2023)
+### Bug Fixes
+* Fixed a data race on `ColumnFamilyData::flush_reason` caused by concurrent flushes.
+* Fixed `DisableManualCompaction()` and `CompactRangeOptions::canceled` to cancel compactions even when they are waiting on conflicting compactions to finish (see the sketch below).
+* Fixed a bug in which a successful `GetMergeOperands()` could transiently return `Status::MergeInProgress()`.
+* Return the correct error (`Status::NotSupported()`) to the MultiGet caller when `ReadOptions::async_io` is true and IO uring is not enabled. Previously, `Status::Corruption()` was returned when the actual failure was lack of async IO support.
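To make the `DisableManualCompaction()` / `CompactRangeOptions::canceled` fix above concrete, here is a minimal sketch of the cancellation flag, assuming a DB and a flag owned by the caller (the names `db` and `canceled` are illustrative, not part of the change):

```cpp
#include <atomic>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Hedged sketch: a manual compaction can be canceled from another thread by
// flipping the atomic flag that CompactRangeOptions::canceled points at.
void CancellableCompact(rocksdb::DB* db, std::atomic<bool>* canceled) {
  rocksdb::CompactRangeOptions cro;
  cro.canceled = canceled;  // set *canceled = true elsewhere to cancel
  rocksdb::Status s =
      db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
  if (!s.ok()) {
    // With the 7.10.1 fix, cancellation takes effect even while the
    // compaction is still waiting on conflicting compactions; a canceled
    // manual compaction is expected to surface as a non-OK (incomplete)
    // status here, though the exact status code is an assumption of this
    // sketch.
  }
}
```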
-## 7.8.2 (11/27/2022)
+## 7.10.0 (01/23/2023)
 ### Behavior changes
 * Make best-efforts recovery verify SST unique ID before Version construction (#10962)
-* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
-* Tiered Storage: fixed excessive keys written to penultimate level in non-debug builds.
+* Introduce `epoch_number` and sort L0 files by `epoch_number` instead of `largest_seqno`. `epoch_number` represents the order in which a file was flushed or ingested/imported. Compaction output files are assigned the minimum `epoch_number` among their input files. For L0, a larger `epoch_number` indicates a newer L0 file.
 ### Bug Fixes
-* Fixed a regression in scan for async_io. During seek, valid buffers were getting cleared causing a regression.
-* Fixed a performance regression in iterator where range tombstones after `iterate_upper_bound` is processed.
+* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed.
+* Fixed a memory leak in MultiGet with the async_io read option, caused by IO errors during table file open.
+* Fixed a bug that multi-level FIFO compaction deletes one file in non-L0 even when `CompactionOptionsFIFO::max_table_files_size` is not exceeded, present since #10348 or 7.8.0.
+* Fixed a bug caused by `DB::SyncWAL()` affecting `track_and_verify_wals_in_manifest`. Without the fix, applications may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#10892).
+* Fixed a BackupEngine bug in which RestoreDBFromLatestBackup would fail if the latest backup was deleted and there is another valid backup available.
+* Fix L0 file misorder corruption caused by ingesting files whose seqnos overlap with memtable entries', through introducing `epoch_number`. Before the fix, `force_consistency_checks=true` may catch the corruption before it is exposed to readers, in which case writes returning `Status::Corruption` would be expected. Also replaces the previous incomplete fix (#5958) for the same corruption with this new and more complete fix.
+* Fixed a bug in LockWAL() leading to re-locking the mutex (#11020).
+* Fixed a heap use-after-free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
+* Fixed a heap use-after-free in async scan prefetching when dictionary compression is enabled, in which case a sync read of the compression dictionary gets mixed with async prefetching.
+* Fixed a data race bug in which `CompactRange()` with `change_level=true` acts on a range overlapping an ongoing file ingestion for level compaction. This will either result in overlapping file ranges corruption at a certain level, caught by `force_consistency_checks=true`, or potentially two identical keys both with seqno 0 in two different levels (i.e., new data ends up in a lower/older level). The latter will be caught by an assertion in debug builds but will go silently and result in reads returning wrong results in release builds. This fix is general, so it also replaces previous fixes to a similar problem for `CompactFiles()` (#4665), general `CompactRange()` and auto compaction (commits 5c64fb6 and 87dfc1d).
+* Fixed a bug in compaction output cutting where small output files were produced because TTL file-cutting states were not being updated (#11075).
+
+### New Features
+* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction (see the sketch below).
+* Add BackupEngine feature to exclude files from backup that are known to be backed up elsewhere, using `CreateBackupOptions::exclude_files_callback`. To restore the DB, the excluded files must be provided in alternative backup directories using `RestoreOptions::alternate_dirs`.
+
+### Public API Changes
+* Substantial changes have been made to the Cache class to support internal development goals. Direct use of Cache class members is discouraged and further breaking modifications are expected in the future. SecondaryCache has some related changes and implementations will need to be updated. (Unlike Cache, SecondaryCache is still intended to support user implementations, and disruptive changes will be avoided.) (#10975)
+* Add `MergeOperationOutput::op_failure_scope` for merge operator users to control the blast radius of merge operator failures. Existing merge operator users do not need to make any change to preserve the old behavior.
+
+### Performance Improvements
+* Updated xxHash source code, which should improve kXXH3 checksum speed, at least on ARM (#11098).
+* Improved CPU efficiency of DB reads, from block cache access improvements (#10975).
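The SstPartitionerFactory entry above pairs naturally with a manual compaction; below is a hedged sketch using the fixed-prefix partitioner bundled with RocksDB (the path handling and the 4-byte prefix length are assumptions of this sketch):

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_partitioner.h"

// Open with a partitioner, then run a full-range manual compaction to
// (re-)establish SST partition points; files straddling a partition boundary
// are now selected even if no entries lie in the requested range.
void RepartitionAll(const std::string& path) {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.sst_partitioner_factory =
      rocksdb::NewSstPartitionerFixedPrefixFactory(/*prefix_len=*/4);

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
  assert(s.ok());

  rocksdb::CompactRangeOptions cro;
  s = db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
  assert(s.ok());
  delete db;
}
```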
+
+## 7.9.0 (11/21/2022)
+### Performance Improvements
+* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
 
-## 7.8.1 (11/2/2022)
 ### Bug Fixes
 * Fix memory corruption error in scans if async_io is enabled. Memory corruption happened if there is IOError while reading the data leading to empty buffer and other buffer already in progress of async read goes again for reading.
+* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
+* Fixed an issue where the `READ_NUM_MERGE_OPERANDS` ticker was not updated when the base key-value or tombstone was read from an SST file.
+* Fixed a memory safety bug when using a SecondaryCache with `block_cache_compressed`. `block_cache_compressed` no longer attempts to use SecondaryCache features.
+* Fixed a regression in scan for async_io. During seek, valid buffers were getting cleared causing a regression.
+* Tiered Storage: fixed excessive keys written to penultimate level in non-debug builds.
+
+### New Features
+* Add basic support for user-defined timestamp to Merge (#10819).
+* Add stats for ReadAsync time spent and async read errors.
+* Basic support for the wide-column data model is now available (a usage sketch follows below). Wide-column entities can be stored using the `PutEntity` API, and retrieved using `GetEntity` and the new `columns` API of iterator. For compatibility, the classic APIs `Get` and `MultiGet`, as well as iterator's `value` API, return the value of the anonymous default column of wide-column entities; also, `GetEntity` and iterator's `columns` return any plain key-values in the form of an entity which only has the anonymous default column. `Merge` (and `GetMergeOperands`) currently also apply to the default column; any other columns of entities are unaffected by `Merge` operations. Note that some features like compaction filters, transactions, user-defined timestamps, and the SST file writer do not yet support wide-column entities; also, there is currently no `MultiGet`-like API to retrieve multiple entities at once. We plan to gradually close the above gaps and also implement new features like column-level operations (e.g. updating or querying only certain columns of an entity).
+* Marked HyperClockCache as a production-ready alternative to LRUCache for the block cache. HyperClockCache greatly improves hot-path CPU efficiency under high parallel load or high contention, with some documented caveats and limitations. As much as 4.5x higher ops/sec vs. LRUCache has been seen in db_bench under high parallel load.
+* Add periodic diagnostics to info_log (LOG file) for the HyperClockCache block cache if performance is degraded by a bad `estimated_entry_charge` option.
+
+### Public API Changes
+* Marked `block_cache_compressed` as a deprecated feature. Use SecondaryCache instead.
+* Added a `SecondaryCache::InsertSaved()` API, with a default implementation depending on `Insert()`. Some implementations might need to add a custom implementation of `InsertSaved()`. (Details in API comments.)
 
 ## 7.8.0 (10/22/2022)
 ### New Features
@@ -49,6 +96,8 @@
 ### Behavior Changes
 * Sanitize min_write_buffer_number_to_merge to 1 if atomic flush is enabled to prevent unexpected data loss when WAL is disabled in a multi-column-family setting (#10773).
+* With the periodic stat dumper waking up every `options.stats_dump_period_sec` seconds, it won't dump stats for a CF that had no changes in the period, unless 7 periods have been skipped.
+* Only the periodic stats dumper triggered by `options.stats_dump_period_sec` will update the stats interval. Dumps triggered by `DB::GetProperty()` will not update the stats interval and will report based on the interval since the last periodic stats dump.
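Since the 7.9.0 notes above introduce the wide-column data model, a minimal round-trip sketch of `PutEntity`/`GetEntity` follows (the key and column names are illustrative assumptions, not part of the release notes):

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

// Store an entity with two named columns, then read it back.
void WideColumnRoundTrip(rocksdb::DB* db) {
  rocksdb::ColumnFamilyHandle* cf = db->DefaultColumnFamily();
  rocksdb::WideColumns columns{{"name", "alice"}, {"score", "42"}};
  rocksdb::Status s =
      db->PutEntity(rocksdb::WriteOptions(), cf, "user1", columns);
  assert(s.ok());

  rocksdb::PinnableWideColumns result;
  s = db->GetEntity(rocksdb::ReadOptions(), cf, "user1", &result);
  assert(s.ok());
  // Per the notes above, a plain Get() of "user1" would return only the
  // anonymous default column, which is empty here since none was written.
  for (const rocksdb::WideColumn& column : result.columns()) {
    (void)column;  // column.name() / column.value() are Slices
  }
}
```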
### Public API changes * Make kXXH3 checksum the new default, because it is faster on common hardware, especially with kCRC32c affected by a performance bug in some versions of clang (https://github.com/facebook/rocksdb/issues/9891). DBs written with this new setting can be read by RocksDB 6.27 and newer. diff --git a/Makefile b/Makefile index 16ef6004763..c88793238c2 100644 --- a/Makefile +++ b/Makefile @@ -266,6 +266,7 @@ ROCKSDB_PLUGIN_EXTERNS = $(foreach p, $(ROCKSDB_PLUGIN_W_FUNCS), int $($(p)_FUNC ROCKSDB_PLUGIN_BUILTINS = $(foreach p, $(ROCKSDB_PLUGIN_W_FUNCS), {\"$(p)\"\, $($(p)_FUNC)}\,) ROCKSDB_PLUGIN_LDFLAGS = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS)) ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES = $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_PKGCONFIG_REQUIRES)) +ROCKSDB_PLUGIN_TESTS = $(foreach p, $(ROCKSDB_PLUGINS), $(foreach test, $($(p)_TESTS), plugin/$(p)/$(test))) CXXFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_CXXFLAGS)) PLATFORM_LDFLAGS += $(ROCKSDB_PLUGIN_LDFLAGS) @@ -648,10 +649,12 @@ STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) -ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) +ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) $(ROCKSDB_PLUGIN_TESTS) +PLUGIN_TESTS = $(patsubst %.cc, %, $(notdir $(ROCKSDB_PLUGIN_TESTS))) TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES))) TESTS += $(patsubst %.c, %, $(notdir $(TEST_MAIN_SOURCES_C))) +TESTS += $(PLUGIN_TESTS) # `make check-headers` to very that each header file includes its own # dependencies @@ -703,6 +706,7 @@ NON_PARALLEL_TEST = \ env_test \ deletefile_test \ db_bloom_filter_test \ + $(PLUGIN_TESTS) \ PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS)) @@ -1358,6 +1362,14 @@ db_sanity_test: $(OBJ_DIR)/tools/db_sanity_test.o $(LIBRARY) db_repl_stress: $(OBJ_DIR)/tools/db_repl_stress.o $(LIBRARY) $(AM_LINK) +define MakeTestRule +$(notdir $(1:%.cc=%)): $(1:%.cc=$$(OBJ_DIR)/%.o) $$(TEST_LIBRARY) $$(LIBRARY) + $$(AM_LINK) +endef + +# For each PLUGIN test, create a rule to generate the test executable +$(foreach test, $(ROCKSDB_PLUGIN_TESTS), $(eval $(call MakeTestRule, $(test)))) + arena_test: $(OBJ_DIR)/memory/arena_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -2079,7 +2091,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) else ifeq ($(PLATFORM), OS_OPENBSD) - ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE))) + ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE))) ARCH := 64 else ARCH := 32 @@ -2100,7 +2112,7 @@ ifneq ($(origin JNI_LIBC), undefined) endif ifeq (,$(ROCKSDBJNILIB)) -ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE))) +ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE))) ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so else ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so @@ -2113,8 +2125,8 @@ ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar SHA256_CMD = sha256sum -ZLIB_VER ?= 1.2.12 -ZLIB_SHA256 ?= 
91844808532e5ce316b3c010929493c0244f3d37593afd6de04f71821d5136d9 +ZLIB_VER ?= 1.2.13 +ZLIB_SHA256 ?= b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30 ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.8 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 diff --git a/TARGETS b/TARGETS index afdcbaa34f1..93bdf75dd95 100644 --- a/TARGETS +++ b/TARGETS @@ -11,20 +11,20 @@ load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrap cpp_library_wrapper(name="rocksdb_lib", srcs=[ "cache/cache.cc", "cache/cache_entry_roles.cc", + "cache/cache_helpers.cc", "cache/cache_key.cc", "cache/cache_reservation_manager.cc", "cache/charged_cache.cc", "cache/clock_cache.cc", "cache/compressed_secondary_cache.cc", - "cache/fast_lru_cache.cc", "cache/lru_cache.cc", + "cache/secondary_cache.cc", "cache/sharded_cache.cc", "cloud/aws/aws_file_system.cc", "cloud/aws/aws_kafka.cc", "cloud/aws/aws_kinesis.cc", "cloud/aws/aws_retry.cc", "cloud/aws/aws_s3.cc", - "cloud/cloud_file_cache.cc", "cloud/cloud_file_deletion_scheduler.cc", "cloud/cloud_file_system.cc", "cloud/cloud_file_system_impl.cc", @@ -181,6 +181,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", + "port/mmap.cc", "port/port_posix.cc", "port/stack_trace.cc", "port/win/env_default.cc", @@ -197,6 +198,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "table/block_based/block_based_table_iterator.cc", "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", + "table/block_based/block_cache.cc", "table/block_based/block_prefetcher.cc", "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", @@ -368,20 +370,20 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "cache/cache.cc", "cache/cache_entry_roles.cc", + "cache/cache_helpers.cc", "cache/cache_key.cc", "cache/cache_reservation_manager.cc", "cache/charged_cache.cc", "cache/clock_cache.cc", "cache/compressed_secondary_cache.cc", - "cache/fast_lru_cache.cc", "cache/lru_cache.cc", + "cache/secondary_cache.cc", "cache/sharded_cache.cc", "cloud/aws/aws_file_system.cc", "cloud/aws/aws_kafka.cc", "cloud/aws/aws_kinesis.cc", "cloud/aws/aws_retry.cc", "cloud/aws/aws_s3.cc", - "cloud/cloud_file_cache.cc", "cloud/cloud_file_deletion_scheduler.cc", "cloud/cloud_file_system.cc", "cloud/cloud_file_system_impl.cc", @@ -538,6 +540,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", + "port/mmap.cc", "port/port_posix.cc", "port/stack_trace.cc", "port/win/env_default.cc", @@ -554,6 +557,7 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "table/block_based/block_based_table_iterator.cc", "table/block_based/block_based_table_reader.cc", "table/block_based/block_builder.cc", + "table/block_based/block_cache.cc", "table/block_based/block_prefetcher.cc", "table/block_based/block_prefix_index.cc", "table/block_based/data_block_footer.cc", diff --git a/USERS.md b/USERS.md index be42b9b0cbf..f31e2678522 100644 --- a/USERS.md +++ b/USERS.md @@ -79,6 +79,9 @@ quasardb uses a heavily tuned RocksDB as its persistence layer. ## TiKV [TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer. 
+## Apache Spark +[Spark Structured Streaming](https://docs.databricks.com/structured-streaming/rocksdb-state-store.html) uses RocksDB as the local state store. + ## Apache Flink [Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine. diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 4accf7ba045..1dfbfe3c7ff 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -13,7 +13,6 @@ #include #include -#include "cache/fast_lru_cache.h" #include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" #include "port/port.h" @@ -227,7 +226,7 @@ struct KeyGen { } }; -char* createValue(Random64& rnd) { +Cache::ObjectPtr createValue(Random64& rnd) { char* rv = new char[FLAGS_value_bytes]; // Fill with some filler data, and take some CPU time for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { @@ -237,28 +236,33 @@ char* createValue(Random64& rnd) { } // Callbacks for secondary cache -size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; } +size_t SizeFn(Cache::ObjectPtr /*obj*/) { return FLAGS_value_bytes; } -Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) { - memcpy(out, obj, size); +Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/, + size_t length, char* out) { + memcpy(out, from_obj, length); return Status::OK(); } -// Different deleters to simulate using deleter to gather -// stats on the code origin and kind of cache entries. -void deleter1(const Slice& /*key*/, void* value) { - delete[] static_cast(value); -} -void deleter2(const Slice& /*key*/, void* value) { - delete[] static_cast(value); -} -void deleter3(const Slice& /*key*/, void* value) { +Status CreateFn(const Slice& data, Cache::CreateContext* /*context*/, + MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj, + size_t* out_charge) { + *out_obj = new char[data.size()]; + memcpy(*out_obj, data.data(), data.size()); + *out_charge = data.size(); + return Status::OK(); +}; + +void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) { delete[] static_cast(value); } -Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1); -Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2); -Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3); +Cache::CacheItemHelper helper1(CacheEntryRole::kDataBlock, DeleteFn, SizeFn, + SaveToFn, CreateFn); +Cache::CacheItemHelper helper2(CacheEntryRole::kIndexBlock, DeleteFn, SizeFn, + SaveToFn, CreateFn); +Cache::CacheItemHelper helper3(CacheEntryRole::kFilterBlock, DeleteFn, SizeFn, + SaveToFn, CreateFn); } // namespace class CacheBench { @@ -297,10 +301,6 @@ class CacheBench { cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits) .MakeSharedCache(); - } else if (FLAGS_cache_type == "fast_lru_cache") { - cache_ = NewFastLRUCache( - FLAGS_cache_size, FLAGS_value_bytes, FLAGS_num_shard_bits, - false /*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false /* strict_capacity_limit */, @@ -441,7 +441,7 @@ class CacheBench { uint64_t total_entry_count = 0; uint64_t table_occupancy = 0; uint64_t table_size = 0; - std::set deleters; + std::set helpers; StopWatchNano timer(clock); for (;;) { @@ -466,7 +466,7 @@ class CacheBench { << BytesToHumanString(static_cast( 1.0 * total_charge / total_entry_count)) << "\n" - << "Unique deleters: " << deleters.size() << "\n"; + << "Unique helpers: " << 
helpers.size() << "\n"; *stats_report = ostr.str(); return; } @@ -482,14 +482,14 @@ class CacheBench { total_key_size = 0; total_charge = 0; total_entry_count = 0; - deleters.clear(); - auto fn = [&](const Slice& key, void* /*value*/, size_t charge, - Cache::DeleterFn deleter) { + helpers.clear(); + auto fn = [&](const Slice& key, Cache::ObjectPtr /*value*/, size_t charge, + const Cache::CacheItemHelper* helper) { total_key_size += key.size(); total_charge += charge; ++total_entry_count; - // Something slightly more expensive as in (future) stats by category - deleters.insert(deleter); + // Something slightly more expensive as in stats by category + helpers.insert(helper); }; timer.Start(); Cache::ApplyToAllEntriesOptions opts; @@ -538,14 +538,6 @@ class CacheBench { for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); uint64_t random_op = thread->rnd.Next(); - Cache::CreateCallback create_cb = [](const void* buf, size_t size, - void** out_obj, - size_t* charge) -> Status { - *out_obj = reinterpret_cast(new char[size]); - memcpy(*out_obj, buf, size); - *charge = size; - return Status::OK(); - }; timer.Start(); @@ -555,8 +547,8 @@ class CacheBench { handle = nullptr; } // do lookup - handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, - true); + handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, + Cache::Priority::LOW, true); if (handle) { if (!FLAGS_lean) { // do something with the data @@ -584,8 +576,8 @@ class CacheBench { handle = nullptr; } // do lookup - handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, - true); + handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, + Cache::Priority::LOW, true); if (handle) { if (!FLAGS_lean) { // do something with the data diff --git a/cache/cache_entry_roles.cc b/cache/cache_entry_roles.cc index b27349554d3..f83ada23134 100644 --- a/cache/cache_entry_roles.cc +++ b/cache/cache_entry_roles.cc @@ -101,34 +101,4 @@ std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) { return GetPrefixedCacheEntryRoleName(kPrefix, role); } -namespace { - -struct Registry { - std::mutex mutex; - UnorderedMap role_map; - void Register(Cache::DeleterFn fn, CacheEntryRole role) { - std::lock_guard lock(mutex); - role_map[fn] = role; - } - UnorderedMap Copy() { - std::lock_guard lock(mutex); - return role_map; - } -}; - -Registry& GetRegistry() { - STATIC_AVOID_DESTRUCTION(Registry, registry); - return registry; -} - -} // namespace - -void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) { - GetRegistry().Register(fn, role); -} - -UnorderedMap CopyCacheDeleterRoleMap() { - return GetRegistry().Copy(); -} - } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_entry_roles.h b/cache/cache_entry_roles.h index 5a49fdfd4b3..78bec792f2f 100644 --- a/cache/cache_entry_roles.h +++ b/cache/cache_entry_roles.h @@ -7,11 +7,8 @@ #include #include -#include -#include #include "rocksdb/cache.h" -#include "util/hash_containers.h" namespace ROCKSDB_NAMESPACE { @@ -20,84 +17,4 @@ extern std::array extern std::array kCacheEntryRoleToHyphenString; -// To associate cache entries with their role, we use a hack on the -// existing Cache interface. Because the deleter of an entry can authenticate -// the code origin of an entry, we can elaborate the choice of deleter to -// also encode role information, without inferring false role information -// from entries not choosing to encode a role. 
-// -// The rest of this file is for handling mappings between deleters and -// roles. - -// To infer a role from a deleter, the deleter must be registered. This -// can be done "manually" with this function. This function is thread-safe, -// and the registration mappings go into private but static storage. (Note -// that DeleterFn is a function pointer, not std::function. Registrations -// should not be too many.) -void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role); - -// Gets a copy of the registered deleter -> role mappings. This is the only -// function for reading the mappings made with RegisterCacheDeleterRole. -// Why only this interface for reading? -// * This function has to be thread safe, which could incur substantial -// overhead. We should not pay this overhead for every deleter look-up. -// * This is suitable for preparing for batch operations, like with -// CacheEntryStatsCollector. -// * The number of mappings should be sufficiently small (dozens). -UnorderedMap CopyCacheDeleterRoleMap(); - -// ************************************************************** // -// An automatic registration infrastructure. This enables code -// to simply ask for a deleter associated with a particular type -// and role, and registration is automatic. In a sense, this is -// a small dependency injection infrastructure, because linking -// in new deleter instantiations is essentially sufficient for -// making stats collection (using CopyCacheDeleterRoleMap) aware -// of them. - -namespace cache_entry_roles_detail { - -template -struct RegisteredDeleter { - RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); } - - // These have global linkage to help ensure compiler optimizations do not - // break uniqueness for each - static void Delete(const Slice& /* key */, void* value) { - // Supports T == Something[], unlike delete operator - std::default_delete()( - static_cast::type*>(value)); - } -}; - -template -struct RegisteredNoopDeleter { - RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); } - - static void Delete(const Slice& /* key */, void* /* value */) { - // Here was `assert(value == nullptr);` but we can also put pointers - // to static data in Cache, for testing at least. - } -}; - -} // namespace cache_entry_roles_detail - -// Get an automatically registered deleter for value type T and role R. -// Based on C++ semantics, registration is invoked exactly once in a -// thread-safe way on first call to this function, for each . -template -Cache::DeleterFn GetCacheEntryDeleterForRole() { - static cache_entry_roles_detail::RegisteredDeleter reg; - return reg.Delete; -} - -// Get an automatically registered no-op deleter (value should be nullptr) -// and associated with role R. This is used for Cache "reservation" entries -// such as for WriteBufferManager. 
-template -Cache::DeleterFn GetNoopDeleterForRole() { - static cache_entry_roles_detail::RegisteredNoopDeleter reg; - return reg.Delete; -} - } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h index 63b12735b9b..054304086d1 100644 --- a/cache/cache_entry_stats.h +++ b/cache/cache_entry_stats.h @@ -10,8 +10,8 @@ #include #include -#include "cache/cache_helpers.h" #include "cache/cache_key.h" +#include "cache/typed_cache.h" #include "port/lang.h" #include "rocksdb/cache.h" #include "rocksdb/status.h" @@ -111,11 +111,14 @@ class CacheEntryStatsCollector { // Gets or creates a shared instance of CacheEntryStatsCollector in the // cache itself, and saves into `ptr`. This shared_ptr will hold the // entry in cache until all refs are destroyed. - static Status GetShared(Cache *cache, SystemClock *clock, + static Status GetShared(Cache *raw_cache, SystemClock *clock, std::shared_ptr *ptr) { - const Slice &cache_key = GetCacheKey(); + assert(raw_cache); + BasicTypedCacheInterface + cache{raw_cache}; - Cache::Handle *h = cache->Lookup(cache_key); + const Slice &cache_key = GetCacheKey(); + auto h = cache.Lookup(cache_key); if (h == nullptr) { // Not yet in cache, but Cache doesn't provide a built-in way to // avoid racing insert. So we double-check under a shared mutex, @@ -123,15 +126,15 @@ class CacheEntryStatsCollector { STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex); std::lock_guard lock(static_mutex); - h = cache->Lookup(cache_key); + h = cache.Lookup(cache_key); if (h == nullptr) { - auto new_ptr = new CacheEntryStatsCollector(cache, clock); + auto new_ptr = new CacheEntryStatsCollector(cache.get(), clock); // TODO: non-zero charge causes some tests that count block cache // usage to go flaky. Fix the problem somehow so we can use an // accurate charge. size_t charge = 0; - Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h, - Cache::Priority::HIGH); + Status s = + cache.Insert(cache_key, new_ptr, charge, &h, Cache::Priority::HIGH); if (!s.ok()) { assert(h == nullptr); delete new_ptr; @@ -140,11 +143,11 @@ class CacheEntryStatsCollector { } } // If we reach here, shared entry is in cache with handle `h`. - assert(cache->GetDeleter(h) == Deleter); + assert(cache.get()->GetCacheItemHelper(h) == &cache.kBasicHelper); // Build an aliasing shared_ptr that keeps `ptr` in cache while there // are references. - *ptr = MakeSharedCacheHandleGuard(cache, h); + *ptr = cache.SharedGuard(h); return Status::OK(); } @@ -157,10 +160,6 @@ class CacheEntryStatsCollector { cache_(cache), clock_(clock) {} - static void Deleter(const Slice &, void *value) { - delete static_cast(value); - } - static const Slice &GetCacheKey() { // For each template instantiation static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime(); diff --git a/cache/cache_helpers.cc b/cache/cache_helpers.cc new file mode 100644 index 00000000000..22597bf6daf --- /dev/null +++ b/cache/cache_helpers.cc @@ -0,0 +1,40 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "cache/cache_helpers.h" + +namespace ROCKSDB_NAMESPACE { + +void ReleaseCacheHandleCleanup(void* arg1, void* arg2) { + Cache* const cache = static_cast(arg1); + assert(cache); + + Cache::Handle* const cache_handle = static_cast(arg2); + assert(cache_handle); + + cache->Release(cache_handle); +} + +Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved, + Cache::CreateContext* create_context, + const Cache::CacheItemHelper* helper, + Cache::Priority priority, size_t* out_charge) { + assert(helper); + assert(helper->create_cb); + Cache::ObjectPtr value; + size_t charge; + Status st = helper->create_cb(saved, create_context, + cache->memory_allocator(), &value, &charge); + if (st.ok()) { + st = + cache->Insert(key, value, helper, charge, /*handle*/ nullptr, priority); + if (out_charge) { + *out_charge = charge; + } + } + return st; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_helpers.h b/cache/cache_helpers.h index 7ea2365b88b..eb4559dfe98 100644 --- a/cache/cache_helpers.h +++ b/cache/cache_helpers.h @@ -17,22 +17,17 @@ template T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) { assert(cache); assert(handle); - return static_cast(cache->Value(handle)); } -// Simple generic deleter for Cache (to be used with Cache::Insert). -template -void DeleteCacheEntry(const Slice& /* key */, void* value) { - delete static_cast(value); -} - // Turns a T* into a Slice so it can be used as a key with Cache. template -Slice GetSlice(const T* t) { +Slice GetSliceForKey(const T* t) { return Slice(reinterpret_cast(t), sizeof(T)); } +void ReleaseCacheHandleCleanup(void* arg1, void* arg2); + // Generic resource management object for cache handles that releases the handle // when destroyed. Has unique ownership of the handle, so copying it is not // allowed, while moving it transfers ownership. @@ -88,7 +83,7 @@ class CacheHandleGuard { if (cleanable) { if (handle_ != nullptr) { assert(cache_); - cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_); + cleanable->RegisterCleanup(&ReleaseCacheHandleCleanup, cache_, handle_); } } ResetFields(); @@ -115,16 +110,6 @@ class CacheHandleGuard { value_ = nullptr; } - static void ReleaseCacheHandle(void* arg1, void* arg2) { - Cache* const cache = static_cast(arg1); - assert(cache); - - Cache::Handle* const cache_handle = static_cast(arg2); - assert(cache_handle); - - cache->Release(cache_handle); - } - private: Cache* cache_ = nullptr; Cache::Handle* handle_ = nullptr; @@ -139,7 +124,16 @@ template std::shared_ptr MakeSharedCacheHandleGuard(Cache* cache, Cache::Handle* handle) { auto wrapper = std::make_shared>(cache, handle); - return std::shared_ptr(wrapper, static_cast(cache->Value(handle))); + return std::shared_ptr(wrapper, GetFromCacheHandle(cache, handle)); } +// Given the persistable data (saved) for a block cache entry, parse that +// into a cache entry object and insert it into the given cache. The charge +// of the new entry can be returned to the caller through `out_charge`. 
+Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved, + Cache::CreateContext* create_context, + const Cache::CacheItemHelper* helper, + Cache::Priority priority = Cache::Priority::LOW, + size_t* out_charge = nullptr); + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_reservation_manager.cc b/cache/cache_reservation_manager.cc index 53dee5d7903..b43bfddc693 100644 --- a/cache/cache_reservation_manager.cc +++ b/cache/cache_reservation_manager.cc @@ -13,7 +13,6 @@ #include #include -#include "cache/cache_entry_roles.h" #include "rocksdb/cache.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -41,17 +40,17 @@ CacheReservationManagerImpl< template CacheReservationManagerImpl::CacheReservationManagerImpl( std::shared_ptr cache, bool delayed_decrease) - : delayed_decrease_(delayed_decrease), + : cache_(cache), + delayed_decrease_(delayed_decrease), cache_allocated_size_(0), memory_used_(0) { assert(cache != nullptr); - cache_ = cache; } template CacheReservationManagerImpl::~CacheReservationManagerImpl() { for (auto* handle : dummy_handles_) { - cache_->Release(handle, true); + cache_.ReleaseAndEraseIfLastRef(handle); } } @@ -115,8 +114,7 @@ Status CacheReservationManagerImpl::IncreaseCacheReservation( Status return_status = Status::OK(); while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) { Cache::Handle* handle = nullptr; - return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry, - GetNoopDeleterForRole(), &handle); + return_status = cache_.Insert(GetNextCacheKey(), kSizeDummyEntry, &handle); if (return_status != Status::OK()) { return return_status; @@ -141,7 +139,7 @@ Status CacheReservationManagerImpl::DecreaseCacheReservation( cache_allocated_size_.load(std::memory_order_relaxed)) { assert(!dummy_handles_.empty()); auto* handle = dummy_handles_.back(); - cache_->Release(handle, true); + cache_.ReleaseAndEraseIfLastRef(handle); dummy_handles_.pop_back(); cache_allocated_size_ -= kSizeDummyEntry; } @@ -169,8 +167,9 @@ Slice CacheReservationManagerImpl::GetNextCacheKey() { } template -Cache::DeleterFn CacheReservationManagerImpl::TEST_GetNoopDeleterForRole() { - return GetNoopDeleterForRole(); +const Cache::CacheItemHelper* +CacheReservationManagerImpl::TEST_GetCacheItemHelperForRole() { + return &CacheInterface::kHelper; } template class CacheReservationManagerImpl< diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h index 147aaa915d7..08bf59b0066 100644 --- a/cache/cache_reservation_manager.h +++ b/cache/cache_reservation_manager.h @@ -18,7 +18,7 @@ #include "cache/cache_entry_roles.h" #include "cache/cache_key.h" -#include "rocksdb/cache.h" +#include "cache/typed_cache.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "util/coding.h" @@ -197,10 +197,10 @@ class CacheReservationManagerImpl static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; } - // For testing only - it is to help ensure the NoopDeleterForRole + // For testing only - it is to help ensure the CacheItemHelperForRole // accessed from CacheReservationManagerImpl and the one accessed from the // test are from the same translation units - static Cache::DeleterFn TEST_GetNoopDeleterForRole(); + static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole(); private: static constexpr std::size_t kSizeDummyEntry = 256 * 1024; @@ -211,7 +211,8 @@ class CacheReservationManagerImpl Status IncreaseCacheReservation(std::size_t new_mem_used); Status 
DecreaseCacheReservation(std::size_t new_mem_used); - std::shared_ptr cache_; + using CacheInterface = PlaceholderSharedCacheInterface; + CacheInterface cache_; bool delayed_decrease_; std::atomic cache_allocated_size_; std::size_t memory_used_; diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 75c28c2b8b6..32335f3d2b7 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -15,15 +15,15 @@ #include #include -#include "cache/fast_lru_cache.h" #include "cache/lru_cache.h" +#include "cache/typed_cache.h" #include "port/stack_trace.h" #include "test_util/testharness.h" #include "util/coding.h" #include "util/string_util.h" -// FastLRUCache and HyperClockCache only support 16-byte keys, so some of -// the tests originally wrote for LRUCache do not work on the other caches. +// HyperClockCache only supports 16-byte keys, so some of the tests +// originally written for LRUCache do not work on the other caches. // Those tests were adapted to use 16-byte keys. We kept the original ones. // TODO: Remove the original tests if they ever become unused. @@ -56,27 +56,34 @@ int DecodeKey32Bits(const Slice& k) { return DecodeFixed32(k.data()); } -void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } +Cache::ObjectPtr EncodeValue(uintptr_t v) { + return reinterpret_cast(v); +} int DecodeValue(void* v) { return static_cast(reinterpret_cast(v)); } -void DumbDeleter(const Slice& /*key*/, void* /*value*/) {} +const Cache::CacheItemHelper kDumbHelper{ + CacheEntryRole::kMisc, + [](Cache::ObjectPtr /*value*/, MemoryAllocator* /*alloc*/) {}}; -void EraseDeleter1(const Slice& /*key*/, void* value) { - Cache* cache = reinterpret_cast(value); - cache->Erase("foo"); -} +const Cache::CacheItemHelper kEraseOnDeleteHelper1{ + CacheEntryRole::kMisc, + [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) { + Cache* cache = static_cast(value); + cache->Erase("foo"); + }}; -void EraseDeleter2(const Slice& /*key*/, void* value) { - Cache* cache = reinterpret_cast(value); - cache->Erase(EncodeKey16Bytes(1234)); -} +const Cache::CacheItemHelper kEraseOnDeleteHelper2{ + CacheEntryRole::kMisc, + [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) { + Cache* cache = static_cast(value); + cache->Erase(EncodeKey16Bytes(1234)); + }}; const std::string kLRU = "lru"; const std::string kHyperClock = "hyper_clock"; -const std::string kFast = "fast"; } // anonymous namespace @@ -85,14 +92,11 @@ class CacheTest : public testing::TestWithParam { static CacheTest* current_; static std::string type_; - static void Deleter(const Slice& key, void* v) { - if (type_ == kFast || type_ == kHyperClock) { - current_->deleted_keys_.push_back(DecodeKey16Bytes(key)); - } else { - current_->deleted_keys_.push_back(DecodeKey32Bits(key)); - } + static void Deleter(Cache::ObjectPtr v, MemoryAllocator*) { current_->deleted_values_.push_back(DecodeValue(v)); } + static constexpr Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc, + &Deleter}; static const int kCacheSize = 1000; static const int kNumShardBits = 4; @@ -100,7 +104,6 @@ class CacheTest : public testing::TestWithParam { static const int kCacheSize2 = 100; static const int kNumShardBits2 = 2; - std::vector deleted_keys_; std::vector deleted_values_; std::shared_ptr cache_; std::shared_ptr cache2_; @@ -126,11 +129,6 @@ class CacheTest : public testing::TestWithParam { capacity, estimated_value_size_ /*estimated_value_size*/) .MakeSharedCache(); } - if (type == kFast) { - return NewFastLRUCache( - capacity, estimated_value_size_, -1 /*num_shard_bits*/, - false 
/*strict_capacity_limit*/, kDefaultCacheMetadataChargePolicy); - } return nullptr; } @@ -153,11 +151,6 @@ class CacheTest : public testing::TestWithParam { nullptr /*allocator*/, charge_policy) .MakeSharedCache(); } - if (type == kFast) { - return NewFastLRUCache(capacity, 1 /*estimated_value_size*/, - num_shard_bits, strict_capacity_limit, - charge_policy); - } return nullptr; } @@ -167,7 +160,7 @@ class CacheTest : public testing::TestWithParam { // LRUCache doesn't, so the encoding depends on the cache type. std::string EncodeKey(int k) { auto type = GetParam(); - if (type == kFast || type == kHyperClock) { + if (type == kHyperClock) { return EncodeKey16Bytes(k); } else { return EncodeKey32Bits(k); @@ -176,7 +169,7 @@ class CacheTest : public testing::TestWithParam { int DecodeKey(const Slice& k) { auto type = GetParam(); - if (type == kFast || type == kHyperClock) { + if (type == kHyperClock) { return DecodeKey16Bytes(k); } else { return DecodeKey32Bits(k); @@ -194,37 +187,29 @@ class CacheTest : public testing::TestWithParam { void Insert(std::shared_ptr cache, int key, int value, int charge = 1) { - EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter)); + EXPECT_OK( + cache->Insert(EncodeKey(key), EncodeValue(value), &kHelper, charge)); } void Erase(std::shared_ptr cache, int key) { cache->Erase(EncodeKey(key)); } - int Lookup(int key) { - return Lookup(cache_, key); - } + int Lookup(int key) { return Lookup(cache_, key); } void Insert(int key, int value, int charge = 1) { Insert(cache_, key, value, charge); } - void Erase(int key) { - Erase(cache_, key); - } + void Erase(int key) { Erase(cache_, key); } - int Lookup2(int key) { - return Lookup(cache2_, key); - } + int Lookup2(int key) { return Lookup(cache2_, key); } void Insert2(int key, int value, int charge = 1) { Insert(cache2_, key, value, charge); } - void Erase2(int key) { - Erase(cache2_, key); - } + void Erase2(int key) { Erase(cache2_, key); } }; CacheTest* CacheTest::current_; @@ -256,10 +241,8 @@ TEST_P(CacheTest, UsageTest) { key = EncodeKey(i); } auto kv_size = key.size() + 5; - ASSERT_OK(cache->Insert(key, reinterpret_cast(value), kv_size, - DumbDeleter)); - ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), - kv_size, DumbDeleter)); + ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size)); + ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); if (type == kHyperClock) { @@ -282,10 +265,8 @@ TEST_P(CacheTest, UsageTest) { } else { key = EncodeKey(static_cast(1000 + i)); } - ASSERT_OK(cache->Insert(key, reinterpret_cast(value), key.size() + 5, - DumbDeleter)); - ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), - key.size() + 5, DumbDeleter)); + ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5)); + ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5)); } // the usage should be close to the capacity @@ -340,11 +321,9 @@ TEST_P(CacheTest, PinnedUsageTest) { auto kv_size = key.size() + 5; Cache::Handle* handle; Cache::Handle* handle_in_precise_cache; - ASSERT_OK(cache->Insert(key, reinterpret_cast(value), kv_size, - DumbDeleter, &handle)); + ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size, &handle)); assert(handle); - ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), - kv_size, DumbDeleter, + ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size, &handle_in_precise_cache)); assert(handle_in_precise_cache); 
pinned_usage += kv_size; @@ -385,10 +364,8 @@ TEST_P(CacheTest, PinnedUsageTest) { } else { key = EncodeKey(static_cast(1000 + i)); } - ASSERT_OK(cache->Insert(key, reinterpret_cast(value), key.size() + 5, - DumbDeleter)); - ASSERT_OK(precise_cache->Insert(key, reinterpret_cast(value), - key.size() + 5, DumbDeleter)); + ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5)); + ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5)); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage()); @@ -418,13 +395,13 @@ TEST_P(CacheTest, HitAndMiss) { Insert(100, 101); ASSERT_EQ(101, Lookup(100)); - ASSERT_EQ(-1, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); + ASSERT_EQ(-1, Lookup(200)); + ASSERT_EQ(-1, Lookup(300)); Insert(200, 201); ASSERT_EQ(101, Lookup(100)); ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); + ASSERT_EQ(-1, Lookup(300)); Insert(100, 102); if (GetParam() == kHyperClock) { @@ -434,10 +411,9 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(102, Lookup(100)); } ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(-1, Lookup(300)); + ASSERT_EQ(-1, Lookup(300)); - ASSERT_EQ(1U, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(1U, deleted_values_.size()); if (GetParam() == kHyperClock) { ASSERT_EQ(102, deleted_values_[0]); } else { @@ -458,21 +434,20 @@ TEST_P(CacheTest, InsertSameKey) { TEST_P(CacheTest, Erase) { Erase(200); - ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(0U, deleted_values_.size()); Insert(100, 101); Insert(200, 201); Erase(100); - ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(1U, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(1U, deleted_values_.size()); ASSERT_EQ(101, deleted_values_[0]); Erase(100); - ASSERT_EQ(-1, Lookup(100)); + ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(201, Lookup(200)); - ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(1U, deleted_values_.size()); } TEST_P(CacheTest, EntriesArePinned) { @@ -489,23 +464,21 @@ TEST_P(CacheTest, EntriesArePinned) { Insert(100, 102); Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); - ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(0U, deleted_values_.size()); ASSERT_EQ(2U, cache_->GetUsage()); cache_->Release(h1); - ASSERT_EQ(1U, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[0]); + ASSERT_EQ(1U, deleted_values_.size()); ASSERT_EQ(101, deleted_values_[0]); ASSERT_EQ(1U, cache_->GetUsage()); Erase(100); ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(1U, deleted_values_.size()); ASSERT_EQ(1U, cache_->GetUsage()); cache_->Release(h2); - ASSERT_EQ(2U, deleted_keys_.size()); - ASSERT_EQ(100, deleted_keys_[1]); + ASSERT_EQ(2U, deleted_values_.size()); ASSERT_EQ(102, deleted_values_[1]); ASSERT_EQ(0U, cache_->GetUsage()); } @@ -515,7 +488,7 @@ TEST_P(CacheTest, EvictionPolicy) { Insert(200, 201); // Frequently used entry must be kept around for (int i = 0; i < 2 * kCacheSize; i++) { - Insert(1000+i, 2000+i); + Insert(1000 + i, 2000 + i); ASSERT_EQ(101, Lookup(100)); } ASSERT_EQ(101, Lookup(100)); @@ -608,9 +581,9 @@ TEST_P(CacheTest, EvictEmptyCache) { // Insert item large than capacity to trigger eviction on empty cache. 
auto cache = NewCache(1, 0, false); if (type == kLRU) { - ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter)); + ASSERT_OK(cache->Insert("foo", nullptr, &kDumbHelper, 10)); } else { - ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter)); + ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, &kDumbHelper, 10)); } } @@ -621,19 +594,19 @@ TEST_P(CacheTest, EraseFromDeleter) { // the cache at that point. std::shared_ptr cache = NewCache(10, 0, false); std::string foo, bar; - Cache::DeleterFn erase_deleter; + const Cache::CacheItemHelper* erase_helper; if (type == kLRU) { foo = "foo"; bar = "bar"; - erase_deleter = EraseDeleter1; + erase_helper = &kEraseOnDeleteHelper1; } else { foo = EncodeKey(1234); bar = EncodeKey(5678); - erase_deleter = EraseDeleter2; + erase_helper = &kEraseOnDeleteHelper2; } - ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter)); - ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter)); + ASSERT_OK(cache->Insert(foo, nullptr, &kDumbHelper, 1)); + ASSERT_OK(cache->Insert(bar, cache.get(), erase_helper, 1)); cache->Erase(bar); ASSERT_EQ(nullptr, cache->Lookup(foo)); @@ -686,7 +659,7 @@ TEST_P(CacheTest, HeavyEntries) { ASSERT_EQ(1000 + i, r); } } - ASSERT_LE(cached_weight, kCacheSize + kCacheSize/10); + ASSERT_LE(cached_weight, kCacheSize + kCacheSize / 10); } TEST_P(CacheTest, NewId) { @@ -695,53 +668,54 @@ TEST_P(CacheTest, NewId) { ASSERT_NE(a, b); } -class Value { - public: - explicit Value(int v) : v_(v) {} - - int v_; -}; - -namespace { -void deleter(const Slice& /*key*/, void* value) { - delete static_cast(value); -} -} // namespace - TEST_P(CacheTest, ReleaseAndErase) { std::shared_ptr cache = NewCache(5, 0, false); Cache::Handle* handle; - Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1, - &CacheTest::Deleter, &handle); + Status s = + cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle); ASSERT_TRUE(s.ok()); ASSERT_EQ(5U, cache->GetCapacity()); ASSERT_EQ(1U, cache->GetUsage()); - ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(0U, deleted_values_.size()); auto erased = cache->Release(handle, true); ASSERT_TRUE(erased); // This tests that deleter has been called - ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(1U, deleted_values_.size()); } TEST_P(CacheTest, ReleaseWithoutErase) { std::shared_ptr cache = NewCache(5, 0, false); Cache::Handle* handle; - Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1, - &CacheTest::Deleter, &handle); + Status s = + cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle); ASSERT_TRUE(s.ok()); ASSERT_EQ(5U, cache->GetCapacity()); ASSERT_EQ(1U, cache->GetUsage()); - ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(0U, deleted_values_.size()); auto erased = cache->Release(handle); ASSERT_FALSE(erased); // This tests that deleter is not called. When cache has free capacity it is // not expected to immediately erase the released items. 
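
EraseFromDeleter keeps its shape under the new API, and the trick it relies on is worth spelling out: since a delete callback gets no cache pointer, the test stores the cache itself as the cached value for `bar`, so destroying `bar` can re-entrantly erase `foo`. A sketch of that pattern under the same assumptions as the sketches above (the patch's `kEraseOnDeleteHelper1` presumably encapsulates the equivalent):

```cpp
// The cached "value" under key "bar" is the Cache itself; destroying that
// entry re-entrantly erases "foo". Exercises deleter re-entrancy without
// the deleter needing a key or cache argument.
void EraseFooOnDelete(Cache::ObjectPtr value, MemoryAllocator* /*allocator*/) {
  auto* cache = static_cast<Cache*>(value);
  cache->Erase("foo");  // re-entering the shard from a deleter must be safe
}

const Cache::CacheItemHelper kEraseFooHelper{CacheEntryRole::kMisc,
                                             &EraseFooOnDelete};

void EraseFromDeleterDemo(const std::shared_ptr<Cache>& cache) {
  Status s = cache->Insert("foo", nullptr, &kStringHelper, 1);
  assert(s.ok());
  s = cache->Insert("bar", cache.get(), &kEraseFooHelper, 1);
  assert(s.ok());
  cache->Erase("bar");                      // triggers EraseFooOnDelete
  assert(cache->Lookup("foo") == nullptr);  // "foo" went with it
}
```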
- ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(0U, deleted_values_.size()); } +namespace { +class Value { + public: + explicit Value(int v) : v_(v) {} + + int v_; + + static constexpr auto kCacheEntryRole = CacheEntryRole::kMisc; +}; + +using SharedCache = BasicTypedSharedCacheInterface; +using TypedHandle = SharedCache::TypedHandle; +} // namespace + TEST_P(CacheTest, SetCapacity) { auto type = GetParam(); - if (type == kFast || type == kHyperClock) { + if (type == kHyperClock) { ROCKSDB_GTEST_BYPASS( "FastLRUCache and HyperClockCache don't support arbitrary capacity " "adjustments."); @@ -751,19 +725,19 @@ TEST_P(CacheTest, SetCapacity) { // lets create a cache with capacity 5, // then, insert 5 elements, then increase capacity // to 10, returned capacity should be 10, usage=5 - std::shared_ptr cache = NewCache(5, 0, false); - std::vector handles(10); + SharedCache cache{NewCache(5, 0, false)}; + std::vector handles(10); // Insert 5 entries, but not releasing. for (int i = 0; i < 5; i++) { std::string key = EncodeKey(i + 1); - Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]); ASSERT_TRUE(s.ok()); } - ASSERT_EQ(5U, cache->GetCapacity()); - ASSERT_EQ(5U, cache->GetUsage()); - cache->SetCapacity(10); - ASSERT_EQ(10U, cache->GetCapacity()); - ASSERT_EQ(5U, cache->GetUsage()); + ASSERT_EQ(5U, cache.get()->GetCapacity()); + ASSERT_EQ(5U, cache.get()->GetUsage()); + cache.get()->SetCapacity(10); + ASSERT_EQ(10U, cache.get()->GetCapacity()); + ASSERT_EQ(5U, cache.get()->GetUsage()); // test2: decrease capacity // insert 5 more elements to cache, then release 5, @@ -771,85 +745,77 @@ TEST_P(CacheTest, SetCapacity) { // and usage should be 7 for (int i = 5; i < 10; i++) { std::string key = EncodeKey(i + 1); - Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]); ASSERT_TRUE(s.ok()); } - ASSERT_EQ(10U, cache->GetCapacity()); - ASSERT_EQ(10U, cache->GetUsage()); + ASSERT_EQ(10U, cache.get()->GetCapacity()); + ASSERT_EQ(10U, cache.get()->GetUsage()); for (int i = 0; i < 5; i++) { - cache->Release(handles[i]); + cache.Release(handles[i]); } - ASSERT_EQ(10U, cache->GetCapacity()); - ASSERT_EQ(10U, cache->GetUsage()); - cache->SetCapacity(7); - ASSERT_EQ(7, cache->GetCapacity()); - ASSERT_EQ(7, cache->GetUsage()); + ASSERT_EQ(10U, cache.get()->GetCapacity()); + ASSERT_EQ(10U, cache.get()->GetUsage()); + cache.get()->SetCapacity(7); + ASSERT_EQ(7, cache.get()->GetCapacity()); + ASSERT_EQ(7, cache.get()->GetUsage()); // release remaining 5 to keep valgrind happy for (int i = 5; i < 10; i++) { - cache->Release(handles[i]); + cache.Release(handles[i]); } // Make sure this doesn't crash or upset ASAN/valgrind - cache->DisownData(); + cache.get()->DisownData(); } TEST_P(LRUCacheTest, SetStrictCapacityLimit) { - auto type = GetParam(); - if (type == kFast) { - ROCKSDB_GTEST_BYPASS( - "FastLRUCache only supports a limited number of " - "inserts beyond " - "capacity."); - return; - } // test1: set the flag to false. Insert more keys than capacity. See if they // all go through. 
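
The anonymous namespace above is where the tests switch to the typed-cache wrapper; the template argument stripped from this rendering is `Value`, i.e. `BasicTypedSharedCacheInterface<Value>`. The wrapper derives a `CacheItemHelper` from `Value::kCacheEntryRole` and deletes entries by casting back to `Value*`, which is why the rewritten tests drop the explicit `&deleter` argument. A sketch under those assumptions (the `cache/typed_cache.h` header path is an assumption as well):

```cpp
#include "cache/typed_cache.h"

// Restoring the stripped template arguments from the hunk above:
//   using SharedCache = BasicTypedSharedCacheInterface<Value>;
//   using TypedHandle = SharedCache::TypedHandle;
using SharedCache = ROCKSDB_NAMESPACE::BasicTypedSharedCacheInterface<Value>;
using TypedHandle = SharedCache::TypedHandle;

void TypedInsertDemo() {
  SharedCache cache{ROCKSDB_NAMESPACE::NewLRUCache(/*capacity=*/5)};
  TypedHandle* h = nullptr;
  // No helper or deleter argument, and no void* casts at the call site:
  // the wrapper supplies both from Value::kCacheEntryRole.
  Status s = cache.Insert("k1", new Value(42), /*charge=*/1, &h);
  assert(s.ok() && h != nullptr);
  cache.Release(h);
}
```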
- std::shared_ptr cache = NewCache(5, 0, false); - std::vector handles(10); + SharedCache cache{NewCache(5, 0, false)}; + std::vector handles(10); Status s; for (int i = 0; i < 10; i++) { std::string key = EncodeKey(i + 1); - s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + s = cache.Insert(key, new Value(i + 1), 1, &handles[i]); ASSERT_OK(s); ASSERT_NE(nullptr, handles[i]); } - ASSERT_EQ(10, cache->GetUsage()); + ASSERT_EQ(10, cache.get()->GetUsage()); // test2: set the flag to true. Insert and check if it fails. std::string extra_key = EncodeKey(100); Value* extra_value = new Value(0); - cache->SetStrictCapacityLimit(true); - Cache::Handle* handle; - s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); + cache.get()->SetStrictCapacityLimit(true); + TypedHandle* handle; + s = cache.Insert(extra_key, extra_value, 1, &handle); ASSERT_TRUE(s.IsMemoryLimit()); ASSERT_EQ(nullptr, handle); - ASSERT_EQ(10, cache->GetUsage()); + ASSERT_EQ(10, cache.get()->GetUsage()); for (int i = 0; i < 10; i++) { - cache->Release(handles[i]); + cache.Release(handles[i]); } // test3: init with flag being true. - std::shared_ptr cache2 = NewCache(5, 0, true); + SharedCache cache2{NewCache(5, 0, true)}; for (int i = 0; i < 5; i++) { std::string key = EncodeKey(i + 1); - s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + s = cache2.Insert(key, new Value(i + 1), 1, &handles[i]); ASSERT_OK(s); ASSERT_NE(nullptr, handles[i]); } - s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle); + s = cache2.Insert(extra_key, extra_value, 1, &handle); ASSERT_TRUE(s.IsMemoryLimit()); ASSERT_EQ(nullptr, handle); // test insert without handle - s = cache2->Insert(extra_key, extra_value, 1, &deleter); + s = cache2.Insert(extra_key, extra_value, 1); // AS if the key have been inserted into cache but get evicted immediately. ASSERT_OK(s); - ASSERT_EQ(5, cache2->GetUsage()); - ASSERT_EQ(nullptr, cache2->Lookup(extra_key)); + ASSERT_EQ(5, cache2.get()->GetUsage()); + ASSERT_EQ(nullptr, cache2.Lookup(extra_key)); for (int i = 0; i < 5; i++) { - cache2->Release(handles[i]); + cache2.Release(handles[i]); } } @@ -857,55 +823,54 @@ TEST_P(CacheTest, OverCapacity) { size_t n = 10; // a LRUCache with n entries and one shard only - std::shared_ptr cache = NewCache(n, 0, false); - - std::vector handles(n+1); + SharedCache cache{NewCache(n, 0, false)}; + std::vector handles(n + 1); // Insert n+1 entries, but not releasing. for (int i = 0; i < static_cast(n + 1); i++) { std::string key = EncodeKey(i + 1); - Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]); ASSERT_TRUE(s.ok()); } // Guess what's in the cache now? for (int i = 0; i < static_cast(n + 1); i++) { std::string key = EncodeKey(i + 1); - auto h = cache->Lookup(key); + auto h = cache.Lookup(key); ASSERT_TRUE(h != nullptr); - if (h) cache->Release(h); + if (h) cache.Release(h); } // the cache is over capacity since nothing could be evicted - ASSERT_EQ(n + 1U, cache->GetUsage()); + ASSERT_EQ(n + 1U, cache.get()->GetUsage()); for (int i = 0; i < static_cast(n + 1); i++) { - cache->Release(handles[i]); + cache.Release(handles[i]); } if (GetParam() == kHyperClock) { // Make sure eviction is triggered. 
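
The strict-capacity contract that SetStrictCapacityLimit pins down is easy to misread, so a condensed restatement (hypothetical function, same typed wrapper as above): with the limit on and every resident entry pinned, `Insert` with a handle fails with `Status::MemoryLimit` and leaves the handle null, while `Insert` without a handle reports OK and behaves as if the entry had been inserted and immediately evicted.

```cpp
// Precondition (as in the test): the cache is full and every resident
// entry is pinned by an outstanding handle.
void StrictLimitDemo(SharedCache& cache) {
  cache.get()->SetStrictCapacityLimit(true);

  TypedHandle* h = nullptr;
  Status s = cache.Insert("extra", new Value(0), /*charge=*/1, &h);
  assert(s.IsMemoryLimit());  // refused: nothing is evictable
  assert(h == nullptr);

  s = cache.Insert("extra", new Value(0), /*charge=*/1);  // no handle
  assert(s.ok());  // accepted, as if inserted and immediately evicted...
  assert(cache.Lookup("extra") == nullptr);  // ...so it is not resident
}
```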
- ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0])); + ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, &handles[0])); // cache is under capacity now since elements were released - ASSERT_GE(n, cache->GetUsage()); + ASSERT_GE(n, cache.get()->GetUsage()); // clean up - cache->Release(handles[0]); + cache.Release(handles[0]); } else { // LRUCache checks for over-capacity in Release. // cache is exactly at capacity now with minimal eviction - ASSERT_EQ(n, cache->GetUsage()); + ASSERT_EQ(n, cache.get()->GetUsage()); // element 0 is evicted and the rest is there // This is consistent with the LRU policy since the element 0 // was released first for (int i = 0; i < static_cast(n + 1); i++) { std::string key = EncodeKey(i + 1); - auto h = cache->Lookup(key); + auto h = cache.Lookup(key); if (h) { ASSERT_NE(static_cast(i), 0U); - cache->Release(h); + cache.Release(h); } else { ASSERT_EQ(static_cast(i), 0U); } @@ -913,40 +878,15 @@ TEST_P(CacheTest, OverCapacity) { } } -namespace { -std::vector> legacy_callback_state; -void legacy_callback(void* value, size_t charge) { - legacy_callback_state.push_back( - {DecodeValue(value), static_cast(charge)}); -} -}; - -TEST_P(CacheTest, ApplyToAllCacheEntriesTest) { - std::vector> inserted; - legacy_callback_state.clear(); - - for (int i = 0; i < 10; ++i) { - Insert(i, i * 2, i + 1); - inserted.push_back({i * 2, i + 1}); - } - cache_->ApplyToAllCacheEntries(legacy_callback, true); - - std::sort(inserted.begin(), inserted.end()); - std::sort(legacy_callback_state.begin(), legacy_callback_state.end()); - ASSERT_EQ(inserted.size(), legacy_callback_state.size()); - for (int i = 0; i < static_cast(inserted.size()); ++i) { - EXPECT_EQ(inserted[i], legacy_callback_state[i]); - } -} - TEST_P(CacheTest, ApplyToAllEntriesTest) { std::vector callback_state; - const auto callback = [&](const Slice& key, void* value, size_t charge, - Cache::DeleterFn deleter) { + const auto callback = [&](const Slice& key, Cache::ObjectPtr value, + size_t charge, + const Cache::CacheItemHelper* helper) { callback_state.push_back(std::to_string(DecodeKey(key)) + "," + std::to_string(DecodeValue(value)) + "," + std::to_string(charge)); - assert(deleter == &CacheTest::Deleter); + assert(helper == &CacheTest::kHelper); }; std::vector inserted; @@ -985,8 +925,8 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { // For callback int special_count = 0; - const auto callback = [&](const Slice&, void*, size_t charge, - Cache::DeleterFn) { + const auto callback = [&](const Slice&, Cache::ObjectPtr, size_t charge, + const Cache::CacheItemHelper*) { if (charge == static_cast(kSpecialCharge)) { ++special_count; } @@ -1048,14 +988,13 @@ TEST_P(CacheTest, GetChargeAndDeleter) { Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); ASSERT_EQ(1, cache_->GetCharge(h1)); - ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1)); + ASSERT_EQ(&CacheTest::kHelper, cache_->GetCacheItemHelper(h1)); cache_->Release(h1); } INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, - testing::Values(kLRU, kHyperClock, kFast)); -INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, - testing::Values(kLRU, kFast)); + testing::Values(kLRU, kHyperClock)); +INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU)); } // namespace ROCKSDB_NAMESPACE diff --git a/cache/charged_cache.cc b/cache/charged_cache.cc index a9ff969b81e..3c32fc9611a 100644 --- a/cache/charged_cache.cc +++ b/cache/charged_cache.cc @@ -17,25 +17,10 @@ 
ChargedCache::ChargedCache(std::shared_ptr cache, CacheReservationManagerImpl>( block_cache))) {} -Status ChargedCache::Insert(const Slice& key, void* value, size_t charge, - DeleterFn deleter, Handle** handle, - Priority priority) { - Status s = cache_->Insert(key, value, charge, deleter, handle, priority); - if (s.ok()) { - // Insert may cause the cache entry eviction if the cache is full. So we - // directly call the reservation manager to update the total memory used - // in the cache. - assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) - .PermitUncheckedError(); - } - return s; -} - -Status ChargedCache::Insert(const Slice& key, void* value, +Status ChargedCache::Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, size_t charge, Handle** handle, Priority priority) { - Status s = cache_->Insert(key, value, helper, charge, handle, priority); + Status s = cache_->Insert(key, obj, helper, charge, handle, priority); if (s.ok()) { // Insert may cause the cache entry eviction if the cache is full. So we // directly call the reservation manager to update the total memory used @@ -47,22 +32,21 @@ Status ChargedCache::Insert(const Slice& key, void* value, return s; } -Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) { - return cache_->Lookup(key, stats); -} - Cache::Handle* ChargedCache::Lookup(const Slice& key, const CacheItemHelper* helper, - const CreateCallback& create_cb, + CreateContext* create_context, Priority priority, bool wait, Statistics* stats) { - auto handle = cache_->Lookup(key, helper, create_cb, priority, wait, stats); + auto handle = + cache_->Lookup(key, helper, create_context, priority, wait, stats); // Lookup may promote the KV pair from the secondary cache to the primary // cache. So we directly call the reservation manager to update the total // memory used in the cache. 
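
The ChargedCache hunk above syncs the reservation manager to the delegate's absolute usage rather than adding `charge`, because an insert into a full cache may evict arbitrary entries, so the net delta is unknowable at the wrapper layer. A standalone sketch of that pattern (hypothetical free function; `CacheReservationManager::UpdateCacheReservation(size_t)` is the real interface):

```cpp
#include "cache/cache_reservation_manager.h"
#include "rocksdb/cache.h"

using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::CacheReservationManager;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

// Hypothetical free-function version of the wrapper's Insert path.
Status InsertAndSyncReservation(Cache& cache, CacheReservationManager& mgr,
                                const Slice& key, Cache::ObjectPtr obj,
                                const Cache::CacheItemHelper* helper,
                                size_t charge) {
  Status s = cache.Insert(key, obj, helper, charge);
  if (s.ok()) {
    // Sync to absolute usage: this captures both the new entry and any
    // entries the insert had to evict, which a "+= charge" could not.
    s = mgr.UpdateCacheReservation(cache.GetUsage());
  }
  return s;
}
```

The hunk resumes below with the companion change on the read path: Lookup's resync is narrowed to `helper && helper->create_cb`, since only a lookup that can instantiate an entry (promotion from a secondary cache) can change primary-cache usage.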
- assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) - .PermitUncheckedError(); + if (helper && helper->create_cb) { + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + .PermitUncheckedError(); + } return handle; } diff --git a/cache/charged_cache.h b/cache/charged_cache.h index 1739e408893..4bbb6675962 100644 --- a/cache/charged_cache.h +++ b/cache/charged_cache.h @@ -23,16 +23,14 @@ class ChargedCache : public Cache { std::shared_ptr block_cache); ~ChargedCache() override = default; - Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, - Handle** handle, Priority priority) override; - Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, + Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, size_t charge, Handle** handle = nullptr, Priority priority = Priority::LOW) override; - Cache::Handle* Lookup(const Slice& key, Statistics* stats) override; Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper, - const CreateCallback& create_cb, Priority priority, - bool wait, Statistics* stats = nullptr) override; + CreateContext* create_context, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr) override; bool Release(Cache::Handle* handle, bool useful, bool erase_if_last_ref = false) override; @@ -56,7 +54,9 @@ class ChargedCache : public Cache { return cache_->HasStrictCapacityLimit(); } - void* Value(Cache::Handle* handle) override { return cache_->Value(handle); } + ObjectPtr Value(Cache::Handle* handle) override { + return cache_->Value(handle); + } bool IsReady(Cache::Handle* handle) override { return cache_->IsReady(handle); @@ -84,22 +84,17 @@ class ChargedCache : public Cache { return cache_->GetCharge(handle); } - Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override { - return cache_->GetDeleter(handle); + const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { + return cache_->GetCacheItemHelper(handle); } void ApplyToAllEntries( - const std::function& callback, + const std::function& callback, const Cache::ApplyToAllEntriesOptions& opts) override { cache_->ApplyToAllEntries(callback, opts); } - void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge), - bool thread_safe) override { - cache_->ApplyToAllCacheEntries(callback, thread_safe); - } - std::string GetPrintableOptions() const override { return cache_->GetPrintableOptions(); } diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 27793de9cc4..9476dba7a8d 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -11,8 +11,10 @@ #include #include +#include #include "cache/cache_key.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/lang.h" @@ -22,40 +24,128 @@ namespace ROCKSDB_NAMESPACE { -namespace hyper_clock_cache { +namespace clock_cache { +namespace { inline uint64_t GetRefcount(uint64_t meta) { return ((meta >> ClockHandle::kAcquireCounterShift) - (meta >> ClockHandle::kReleaseCounterShift)) & ClockHandle::kCounterMask; } -void ClockHandleBasicData::FreeData() const { - if (deleter) { - UniqueId64x2 unhashed; - (*deleter)(ClockCacheShard::ReverseHash(hashed_key, &unhashed), value); +inline uint64_t GetInitialCountdown(Cache::Priority priority) { + // Set initial clock data from priority + // TODO: configuration parameters for priority handling and clock cycle + // count? 
+ switch (priority) { + case Cache::Priority::HIGH: + return ClockHandle::kHighCountdown; + default: + assert(false); + FALLTHROUGH_INTENDED; + case Cache::Priority::LOW: + return ClockHandle::kLowCountdown; + case Cache::Priority::BOTTOM: + return ClockHandle::kBottomCountdown; } } -static_assert(sizeof(ClockHandle) == 64U, - "Expecting size / alignment with common cache line size"); +inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) { + // NOTE: in theory there's more room for parallelism if we copy the handle + // data and delay actions like this until after marking the entry as empty, + // but performance tests only show a regression by copying the few words + // of data. + h.FreeData(allocator); -ClockHandleTable::ClockHandleTable(int hash_bits, bool initial_charge_metadata) - : length_bits_(hash_bits), +#ifndef NDEBUG + // Mark slot as empty, with assertion + uint64_t meta = h.meta.exchange(0, std::memory_order_release); + assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.store(0, std::memory_order_release); +#endif +} + +inline bool ClockUpdate(ClockHandle& h) { + uint64_t meta = h.meta.load(std::memory_order_relaxed); + + uint64_t acquire_count = + (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask; + uint64_t release_count = + (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; + // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count, + // release_count, (unsigned)(meta >> ClockHandle::kStateShift)); + if (acquire_count != release_count) { + // Only clock update entries with no outstanding refs + return false; + } + if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) { + // Only clock update Shareable entries + return false; + } + if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && + acquire_count > 0) { + // Decrement clock + uint64_t new_count = + std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1); + // Compare-exchange in the decremented clock info, but + // not aggressively + uint64_t new_meta = + (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | + (new_count << ClockHandle::kReleaseCounterShift) | + (new_count << ClockHandle::kAcquireCounterShift); + h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed); + return false; + } + // Otherwise, remove entry (either unreferenced invisible or + // unreferenced and expired visible). + if (h.meta.compare_exchange_strong( + meta, + uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, + std::memory_order_acquire)) { + // Took ownership. + return true; + } else { + // Compare-exchange failing probably + // indicates the entry was used, so skip it in that case. 
+ return false; + } +} + +} // namespace + +void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const { + if (helper->del_cb) { + helper->del_cb(value, allocator); + } +} + +HyperClockTable::HyperClockTable( + size_t capacity, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, const Opts& opts) + : length_bits_(CalcHashBits(capacity, opts.estimated_value_size, + metadata_charge_policy)), length_bits_mask_((size_t{1} << length_bits_) - 1), occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * kStrictLoadFactor)), - array_(new ClockHandle[size_t{1} << length_bits_]) { - if (initial_charge_metadata) { - usage_ += size_t{GetTableSize()} * sizeof(ClockHandle); + array_(new HandleImpl[size_t{1} << length_bits_]), + allocator_(allocator) { + if (metadata_charge_policy == + CacheMetadataChargePolicy::kFullChargeCacheMetadata) { + usage_ += size_t{GetTableSize()} * sizeof(HandleImpl); } + + static_assert(sizeof(HandleImpl) == 64U, + "Expecting size / alignment with common cache line size"); } -ClockHandleTable::~ClockHandleTable() { +HyperClockTable::~HyperClockTable() { // Assumes there are no references or active operations on any slot/element // in the table. for (size_t i = 0; i < GetTableSize(); i++) { - ClockHandle& h = array_[i]; + HandleImpl& h = array_[i]; switch (h.meta >> ClockHandle::kStateShift) { case ClockHandle::kStateEmpty: // noop @@ -63,11 +153,10 @@ ClockHandleTable::~ClockHandleTable() { case ClockHandle::kStateInvisible: // rare but possible case ClockHandle::kStateVisible: assert(GetRefcount(h.meta) == 0); - h.FreeData(); + h.FreeData(allocator_); #ifndef NDEBUG Rollback(h.hashed_key, &h); - usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); - occupancy_.fetch_sub(1U, std::memory_order_relaxed); + ReclaimEntryUsage(h.GetTotalCharge()); #endif break; // otherwise @@ -84,7 +173,7 @@ ClockHandleTable::~ClockHandleTable() { #endif assert(usage_.load() == 0 || - usage_.load() == size_t{GetTableSize()} * sizeof(ClockHandle)); + usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl)); assert(occupancy_ == 0); } @@ -161,9 +250,141 @@ inline void CorrectNearOverflow(uint64_t old_meta, } } -Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, - ClockHandle** handle, Cache::Priority priority, - size_t capacity, bool strict_capacity_limit) { +inline Status HyperClockTable::ChargeUsageMaybeEvictStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { + if (total_charge > capacity) { + return Status::MemoryLimit( + "Cache entry too large for a single cache shard: " + + std::to_string(total_charge) + " > " + std::to_string(capacity)); + } + // Grab any available capacity, and free up any more required. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t new_usage; + if (LIKELY(old_usage != capacity)) { + do { + new_usage = std::min(capacity, old_usage + total_charge); + } while (!usage_.compare_exchange_weak(old_usage, new_usage, + std::memory_order_relaxed)); + } else { + new_usage = old_usage; + } + // How much do we need to evict then? + size_t need_evict_charge = old_usage + total_charge - new_usage; + size_t request_evict_charge = need_evict_charge; + if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { + // Require at least 1 eviction. 
+ request_evict_charge = 1; + } + if (request_evict_charge > 0) { + size_t evicted_charge = 0; + size_t evicted_count = 0; + Evict(request_evict_charge, &evicted_charge, &evicted_count); + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + if (LIKELY(evicted_charge > need_evict_charge)) { + assert(evicted_count > 0); + // Evicted more than enough + usage_.fetch_sub(evicted_charge - need_evict_charge, + std::memory_order_relaxed); + } else if (evicted_charge < need_evict_charge || + (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { + // Roll back to old usage minus evicted + usage_.fetch_sub(evicted_charge + (new_usage - old_usage), + std::memory_order_relaxed); + if (evicted_charge < need_evict_charge) { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "capacity limit."); + } else { + return Status::MemoryLimit( + "Insert failed because unable to evict entries to stay within " + "table occupancy limit."); + } + } + // If we needed to evict something and we are proceeding, we must have + // evicted something. + assert(evicted_count > 0); + } + return Status::OK(); +} + +inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { + // For simplicity, we consider that either the cache can accept the insert + // with no evictions, or we must evict enough to make (at least) enough + // space. It could lead to unnecessary failures or excessive evictions in + // some extreme cases, but allows a fast, simple protocol. If we allow a + // race to get us over capacity, then we might never get back to capacity + // limit if the sizes of entries allow each insertion to evict the minimum + // charge. Thus, we should evict some extra if it's not a signifcant + // portion of the shard capacity. This can have the side benefit of + // involving fewer threads in eviction. + size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t need_evict_charge; + // NOTE: if total_charge > old_usage, there isn't yet enough to evict + // `total_charge` amount. Even if we only try to evict `old_usage` amount, + // there's likely something referenced and we would eat CPU looking for + // enough to evict. + if (old_usage + total_charge <= capacity || total_charge > old_usage) { + // Good enough for me (might run over with a race) + need_evict_charge = 0; + } else { + // Try to evict enough space, and maybe some extra + need_evict_charge = total_charge; + if (old_usage > capacity) { + // Not too much to avoid thundering herd while avoiding strict + // synchronization, such as the compare_exchange used with strict + // capacity limit. 
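
Before the hunk continues into the eviction calls, the usage-grabbing arithmetic the two ChargeUsageMaybeEvict* paths establish above deserves a worked example. A self-contained restatement with the numbers spelled out (hypothetical helper, mirroring the logic above, not part of the patch):

```cpp
#include <algorithm>
#include <cstddef>

// Standalone restatement of the strict-limit charge math.
constexpr size_t NeedEvictCharge(size_t old_usage, size_t total_charge,
                                 size_t capacity) {
  // Grab whatever capacity is free, as the compare-exchange loop does...
  size_t new_usage = std::min(capacity, old_usage + total_charge);
  // ...then whatever could not be grabbed must be evicted.
  return old_usage + total_charge - new_usage;
}

// capacity 100, usage 95, incoming charge 10: 5 units free, evict 5 more.
static_assert(NeedEvictCharge(95, 10, 100) == 5, "must evict 5 to fit");
// capacity 100, usage 50, incoming charge 10: fits outright, evict nothing.
static_assert(NeedEvictCharge(50, 10, 100) == 0, "fits without eviction");
```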
+ need_evict_charge += std::min(capacity / 1024, total_charge) + 1; + } + } + if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) { + // Special case: require at least 1 eviction if we only have to + // deal with occupancy + need_evict_charge = 1; + } + size_t evicted_charge = 0; + size_t evicted_count = 0; + if (need_evict_charge > 0) { + Evict(need_evict_charge, &evicted_charge, &evicted_count); + // Deal with potential occupancy deficit + if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) { + assert(evicted_charge == 0); + // Can't meet occupancy requirement + return false; + } else { + // Update occupancy for evictions + occupancy_.fetch_sub(evicted_count, std::memory_order_release); + } + } + // Track new usage even if we weren't able to evict enough + usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed); + // No underflow + assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + // Success + return true; +} + +inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert( + const ClockHandleBasicData& proto) { + // Heap allocated separate from table + HandleImpl* h = new HandleImpl(); + ClockHandleBasicData* h_alias = h; + *h_alias = proto; + h->SetDetached(); + // Single reference (detached entries only created if returning a refed + // Handle back to user) + uint64_t meta = uint64_t{ClockHandle::kStateInvisible} + << ClockHandle::kStateShift; + meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; + h->meta.store(meta, std::memory_order_release); + // Keep track of how much of usage is detached + detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed); + return h; +} + +Status HyperClockTable::Insert(const ClockHandleBasicData& proto, + HandleImpl** handle, Cache::Priority priority, + size_t capacity, bool strict_capacity_limit) { // Do we have the available occupancy? Optimistically assume we do // and deal with it if we don't. size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); @@ -176,124 +397,31 @@ Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, // Usage/capacity handling is somewhat different depending on // strict_capacity_limit, but mostly pessimistic. bool use_detached_insert = false; - const size_t total_charge = proto.total_charge; + const size_t total_charge = proto.GetTotalCharge(); if (strict_capacity_limit) { - if (total_charge > capacity) { - assert(!use_detached_insert); + Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity, + need_evict_for_occupancy); + if (!s.ok()) { revert_occupancy_fn(); - return Status::MemoryLimit( - "Cache entry too large for a single cache shard: " + - std::to_string(total_charge) + " > " + std::to_string(capacity)); - } - // Grab any available capacity, and free up any more required. - size_t old_usage = usage_.load(std::memory_order_relaxed); - size_t new_usage; - if (LIKELY(old_usage != capacity)) { - do { - new_usage = std::min(capacity, old_usage + total_charge); - } while (!usage_.compare_exchange_weak(old_usage, new_usage, - std::memory_order_relaxed)); - } else { - new_usage = old_usage; - } - // How much do we need to evict then? - size_t need_evict_charge = old_usage + total_charge - new_usage; - size_t request_evict_charge = need_evict_charge; - if (UNLIKELY(need_evict_for_occupancy) && request_evict_charge == 0) { - // Require at least 1 eviction. 
- request_evict_charge = 1; - } - if (request_evict_charge > 0) { - size_t evicted_charge = 0; - size_t evicted_count = 0; - Evict(request_evict_charge, &evicted_charge, &evicted_count); - occupancy_.fetch_sub(evicted_count, std::memory_order_release); - if (LIKELY(evicted_charge > need_evict_charge)) { - assert(evicted_count > 0); - // Evicted more than enough - usage_.fetch_sub(evicted_charge - need_evict_charge, - std::memory_order_relaxed); - } else if (evicted_charge < need_evict_charge || - (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { - // Roll back to old usage minus evicted - usage_.fetch_sub(evicted_charge + (new_usage - old_usage), - std::memory_order_relaxed); - assert(!use_detached_insert); - revert_occupancy_fn(); - if (evicted_charge < need_evict_charge) { - return Status::MemoryLimit( - "Insert failed because unable to evict entries to stay within " - "capacity limit."); - } else { - return Status::MemoryLimit( - "Insert failed because unable to evict entries to stay within " - "table occupancy limit."); - } - } - // If we needed to evict something and we are proceeding, we must have - // evicted something. - assert(evicted_count > 0); + return s; } } else { // Case strict_capacity_limit == false - - // For simplicity, we consider that either the cache can accept the insert - // with no evictions, or we must evict enough to make (at least) enough - // space. It could lead to unnecessary failures or excessive evictions in - // some extreme cases, but allows a fast, simple protocol. If we allow a - // race to get us over capacity, then we might never get back to capacity - // limit if the sizes of entries allow each insertion to evict the minimum - // charge. Thus, we should evict some extra if it's not a signifcant - // portion of the shard capacity. This can have the side benefit of - // involving fewer threads in eviction. - size_t old_usage = usage_.load(std::memory_order_relaxed); - size_t need_evict_charge; - // NOTE: if total_charge > old_usage, there isn't yet enough to evict - // `total_charge` amount. Even if we only try to evict `old_usage` amount, - // there's likely something referenced and we would eat CPU looking for - // enough to evict. - if (old_usage + total_charge <= capacity || total_charge > old_usage) { - // Good enough for me (might run over with a race) - need_evict_charge = 0; - } else { - // Try to evict enough space, and maybe some extra - need_evict_charge = total_charge; - if (old_usage > capacity) { - // Not too much to avoid thundering herd while avoiding strict - // synchronization - need_evict_charge += std::min(capacity / 1024, total_charge) + 1; - } - } - if (UNLIKELY(need_evict_for_occupancy) && need_evict_charge == 0) { - // Special case: require at least 1 eviction if we only have to - // deal with occupancy - need_evict_charge = 1; - } - size_t evicted_charge = 0; - size_t evicted_count = 0; - if (need_evict_charge > 0) { - Evict(need_evict_charge, &evicted_charge, &evicted_count); - // Deal with potential occupancy deficit - if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) { - assert(evicted_charge == 0); - revert_occupancy_fn(); - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry - // inserted into cache and evicted immediately. 
- proto.FreeData(); - return Status::OK(); - } else { - use_detached_insert = true; - } + bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity, + need_evict_for_occupancy); + if (!success) { + revert_occupancy_fn(); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry + // inserted into cache and evicted immediately. + proto.FreeData(allocator_); + return Status::OK(); } else { - // Update occupancy for evictions - occupancy_.fetch_sub(evicted_count, std::memory_order_release); + // Need to track usage of fallback detached insert + usage_.fetch_add(total_charge, std::memory_order_relaxed); + use_detached_insert = true; } } - // Track new usage even if we weren't able to evict enough - usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed); - // No underflow - assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); } auto revert_usage_fn = [&]() { usage_.fetch_sub(total_charge, std::memory_order_relaxed); @@ -310,30 +438,13 @@ Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, // * Have to insert into a suboptimal location (more probes) so that the // old entry can be kept around as well. - // Set initial clock data from priority - // TODO: configuration parameters for priority handling and clock cycle - // count? - uint64_t initial_countdown; - switch (priority) { - case Cache::Priority::HIGH: - initial_countdown = ClockHandle::kHighCountdown; - break; - default: - assert(false); - FALLTHROUGH_INTENDED; - case Cache::Priority::LOW: - initial_countdown = ClockHandle::kLowCountdown; - break; - case Cache::Priority::BOTTOM: - initial_countdown = ClockHandle::kBottomCountdown; - break; - } + uint64_t initial_countdown = GetInitialCountdown(priority); assert(initial_countdown > 0); size_t probe = 0; - ClockHandle* e = FindSlot( + HandleImpl* e = FindSlot( proto.hashed_key, - [&](ClockHandle* h) { + [&](HandleImpl* h) { // Optimistically transition the slot from "empty" to // "under construction" (no effect on other states) uint64_t old_meta = @@ -414,8 +525,8 @@ Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, (void)old_meta; return false; }, - [&](ClockHandle* /*h*/) { return false; }, - [&](ClockHandle* h) { + [&](HandleImpl* /*h*/) { return false; }, + [&](HandleImpl* h) { h->displacements.fetch_add(1, std::memory_order_relaxed); }, probe); @@ -444,7 +555,7 @@ Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, if (handle == nullptr) { revert_usage_fn(); // As if unrefed entry immdiately evicted - proto.FreeData(); + proto.FreeData(allocator_); return Status::OK(); } } @@ -452,20 +563,8 @@ Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, // Run detached insert assert(use_detached_insert); - ClockHandle* h = new ClockHandle(); - ClockHandleBasicData* h_alias = h; - *h_alias = proto; - h->detached = true; - // Single reference (detached entries only created if returning a refed - // Handle back to user) - uint64_t meta = uint64_t{ClockHandle::kStateInvisible} - << ClockHandle::kStateShift; - meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; - h->meta.store(meta, std::memory_order_release); - // Keep track of usage - detached_usage_.fetch_add(total_charge, std::memory_order_relaxed); + *handle = DetachedInsert(proto); - *handle = h; // The OkOverwritten status is used to count "redundant" insertions into // block cache. 
This implementation doesn't strictly check for redundant // insertions, but we instead are probably interested in how many insertions @@ -474,11 +573,12 @@ Status ClockHandleTable::Insert(const ClockHandleBasicData& proto, return Status::OkOverwritten(); } -ClockHandle* ClockHandleTable::Lookup(const UniqueId64x2& hashed_key) { +HyperClockTable::HandleImpl* HyperClockTable::Lookup( + const UniqueId64x2& hashed_key) { size_t probe = 0; - ClockHandle* e = FindSlot( + HandleImpl* e = FindSlot( hashed_key, - [&](ClockHandle* h) { + [&](HandleImpl* h) { // Mostly branch-free version (similar performance) /* uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, @@ -532,16 +632,16 @@ ClockHandle* ClockHandleTable::Lookup(const UniqueId64x2& hashed_key) { (void)old_meta; return false; }, - [&](ClockHandle* h) { + [&](HandleImpl* h) { return h->displacements.load(std::memory_order_relaxed) == 0; }, - [&](ClockHandle* /*h*/) {}, probe); + [&](HandleImpl* /*h*/) {}, probe); return e; } -bool ClockHandleTable::Release(ClockHandle* h, bool useful, - bool erase_if_last_ref) { +bool HyperClockTable::Release(HandleImpl* h, bool useful, + bool erase_if_last_ref) { // In contrast with LRUCache's Release, this function won't delete the handle // when the cache is above capacity and the reference is the last one. Space // is only freed up by EvictFromClock (called by Insert when space is needed) @@ -595,29 +695,18 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful, uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, std::memory_order_acquire)); // Took ownership - // TODO? Delay freeing? - h->FreeData(); - size_t total_charge = h->total_charge; - if (UNLIKELY(h->detached)) { + size_t total_charge = h->GetTotalCharge(); + if (UNLIKELY(h->IsDetached())) { + h->FreeData(allocator_); // Delete detached handle delete h; detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); + usage_.fetch_sub(total_charge, std::memory_order_relaxed); } else { - UniqueId64x2 hashed_key = h->hashed_key; -#ifndef NDEBUG - // Mark slot as empty, with assertion - old_meta = h->meta.exchange(0, std::memory_order_release); - assert(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateConstruction); -#else - // Mark slot as empty - h->meta.store(0, std::memory_order_release); -#endif - occupancy_.fetch_sub(1U, std::memory_order_release); - Rollback(hashed_key, h); + Rollback(h->hashed_key, h); + FreeDataMarkEmpty(*h, allocator_); + ReclaimEntryUsage(total_charge); } - usage_.fetch_sub(total_charge, std::memory_order_relaxed); - assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); return true; } else { // Correct for possible (but rare) overflow @@ -626,7 +715,7 @@ bool ClockHandleTable::Release(ClockHandle* h, bool useful, } } -void ClockHandleTable::Ref(ClockHandle& h) { +void HyperClockTable::Ref(HandleImpl& h) { // Increment acquire counter uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, std::memory_order_acquire); @@ -638,7 +727,7 @@ void ClockHandleTable::Ref(ClockHandle& h) { (void)old_meta; } -void ClockHandleTable::TEST_RefN(ClockHandle& h, size_t n) { +void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) { // Increment acquire counter uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement, std::memory_order_acquire); @@ -648,7 +737,7 @@ void ClockHandleTable::TEST_RefN(ClockHandle& h, size_t n) { (void)old_meta; } -void ClockHandleTable::TEST_ReleaseN(ClockHandle* h, size_t n) { +void 
HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { if (n > 0) { // Split into n - 1 and 1 steps. uint64_t old_meta = h->meta.fetch_add( @@ -661,11 +750,11 @@ void ClockHandleTable::TEST_ReleaseN(ClockHandle* h, size_t n) { } } -void ClockHandleTable::Erase(const UniqueId64x2& hashed_key) { +void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { size_t probe = 0; (void)FindSlot( hashed_key, - [&](ClockHandle* h) { + [&](HandleImpl* h) { // Could be multiple entries in rare cases. Erase them all. // Optimistically increment acquire counter uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, @@ -699,20 +788,11 @@ void ClockHandleTable::Erase(const UniqueId64x2& hashed_key) { std::memory_order_acq_rel)) { // Took ownership assert(hashed_key == h->hashed_key); - // TODO? Delay freeing? - h->FreeData(); - usage_.fetch_sub(h->total_charge, std::memory_order_relaxed); - assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); -#ifndef NDEBUG - // Mark slot as empty, with assertion - old_meta = h->meta.exchange(0, std::memory_order_release); - assert(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateConstruction); -#else - // Mark slot as empty - h->meta.store(0, std::memory_order_release); -#endif - occupancy_.fetch_sub(1U, std::memory_order_release); + size_t total_charge = h->GetTotalCharge(); + FreeDataMarkEmpty(*h, allocator_); + ReclaimEntryUsage(total_charge); + // We already have a copy of hashed_key in this case, so OK to + // delay Rollback until after releasing the entry Rollback(hashed_key, h); break; } @@ -735,14 +815,14 @@ void ClockHandleTable::Erase(const UniqueId64x2& hashed_key) { } return false; }, - [&](ClockHandle* h) { + [&](HandleImpl* h) { return h->displacements.load(std::memory_order_relaxed) == 0; }, - [&](ClockHandle* /*h*/) {}, probe); + [&](HandleImpl* /*h*/) {}, probe); } -void ClockHandleTable::ConstApplyToEntriesRange( - std::function func, size_t index_begin, +void HyperClockTable::ConstApplyToEntriesRange( + std::function func, size_t index_begin, size_t index_end, bool apply_if_will_be_deleted) const { uint64_t check_state_mask = ClockHandle::kStateShareableBit; if (!apply_if_will_be_deleted) { @@ -750,7 +830,7 @@ void ClockHandleTable::ConstApplyToEntriesRange( } for (size_t i = index_begin; i < index_end; i++) { - ClockHandle& h = array_[i]; + HandleImpl& h = array_[i]; // Note: to avoid using compare_exchange, we have to be extra careful. 
uint64_t old_meta = h.meta.load(std::memory_order_relaxed); @@ -782,9 +862,9 @@ void ClockHandleTable::ConstApplyToEntriesRange( } } -void ClockHandleTable::EraseUnRefEntries() { +void HyperClockTable::EraseUnRefEntries() { for (size_t i = 0; i <= this->length_bits_mask_; i++) { - ClockHandle& h = array_[i]; + HandleImpl& h = array_[i]; uint64_t old_meta = h.meta.load(std::memory_order_relaxed); if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} @@ -795,28 +875,18 @@ void ClockHandleTable::EraseUnRefEntries() { << ClockHandle::kStateShift, std::memory_order_acquire)) { // Took ownership - UniqueId64x2 hashed_key = h.hashed_key; - h.FreeData(); - usage_.fetch_sub(h.total_charge, std::memory_order_relaxed); -#ifndef NDEBUG - // Mark slot as empty, with assertion - old_meta = h.meta.exchange(0, std::memory_order_release); - assert(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateConstruction); -#else - // Mark slot as empty - h.meta.store(0, std::memory_order_release); -#endif - occupancy_.fetch_sub(1U, std::memory_order_release); - Rollback(hashed_key, &h); + size_t total_charge = h.GetTotalCharge(); + Rollback(h.hashed_key, &h); + FreeDataMarkEmpty(h, allocator_); + ReclaimEntryUsage(total_charge); } } } -ClockHandle* ClockHandleTable::FindSlot( - const UniqueId64x2& hashed_key, std::function match_fn, - std::function abort_fn, - std::function update_fn, size_t& probe) { +inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot( + const UniqueId64x2& hashed_key, std::function match_fn, + std::function abort_fn, + std::function update_fn, size_t& probe) { // NOTE: upper 32 bits of hashed_key[0] is used for sharding // // We use double-hashing probing. Every probe in the sequence is a @@ -832,7 +902,7 @@ ClockHandle* ClockHandleTable::FindSlot( size_t increment = static_cast(hashed_key[0]) | 1U; size_t current = ModTableSize(base + probe * increment); while (probe <= length_bits_mask_) { - ClockHandle* h = &array_[current]; + HandleImpl* h = &array_[current]; if (match_fn(h)) { probe++; return h; @@ -848,18 +918,29 @@ ClockHandle* ClockHandleTable::FindSlot( return nullptr; } -void ClockHandleTable::Rollback(const UniqueId64x2& hashed_key, - const ClockHandle* h) { +inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key, + const HandleImpl* h) { size_t current = ModTableSize(hashed_key[1]); size_t increment = static_cast(hashed_key[0]) | 1U; - for (size_t i = 0; &array_[current] != h; i++) { + while (&array_[current] != h) { array_[current].displacements.fetch_sub(1, std::memory_order_relaxed); current = ModTableSize(current + increment); } } -void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, - size_t* freed_count) { +inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) { + auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release); + (void)old_occupancy; + // No underflow + assert(old_occupancy > 0); + auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed); + (void)old_usage; + // No underflow + assert(old_usage >= total_charge); +} + +inline void HyperClockTable::Evict(size_t requested_charge, + size_t* freed_charge, size_t* freed_count) { // precondition assert(requested_charge > 0); @@ -880,64 +961,13 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, for (;;) { for (size_t i = 0; i < step_size; i++) { - ClockHandle& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; - uint64_t meta = h.meta.load(std::memory_order_relaxed); - - uint64_t 
acquire_count = (meta >> ClockHandle::kAcquireCounterShift) & - ClockHandle::kCounterMask; - uint64_t release_count = (meta >> ClockHandle::kReleaseCounterShift) & - ClockHandle::kCounterMask; - if (acquire_count != release_count) { - // Only clock update entries with no outstanding refs - continue; - } - if (!((meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit)) { - // Only clock update Shareable entries - continue; - } - if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && - acquire_count > 0) { - // Decrement clock - uint64_t new_count = std::min(acquire_count - 1, - uint64_t{ClockHandle::kMaxCountdown} - 1); - // Compare-exchange in the decremented clock info, but - // not aggressively - uint64_t new_meta = - (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | - (new_count << ClockHandle::kReleaseCounterShift) | - (new_count << ClockHandle::kAcquireCounterShift); - h.meta.compare_exchange_strong(meta, new_meta, - std::memory_order_relaxed); - continue; - } - // Otherwise, remove entry (either unreferenced invisible or - // unreferenced and expired visible). Compare-exchange failing probably - // indicates the entry was used, so skip it in that case. - if (h.meta.compare_exchange_strong( - meta, - uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift, - std::memory_order_acquire)) { - // Took ownership. - // Save info about h to minimize dependences between atomic updates - // (e.g. fully relaxed Rollback after h released by marking empty) - const UniqueId64x2 h_hashed_key = h.hashed_key; - size_t h_total_charge = h.total_charge; - // TODO? Delay freeing? - h.FreeData(); -#ifndef NDEBUG - // Mark slot as empty, with assertion - meta = h.meta.exchange(0, std::memory_order_release); - assert(meta >> ClockHandle::kStateShift == - ClockHandle::kStateConstruction); -#else - // Mark slot as empty - h.meta.store(0, std::memory_order_release); -#endif + HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))]; + bool evicting = ClockUpdate(h); + if (evicting) { + Rollback(h.hashed_key, &h); + *freed_charge += h.GetTotalCharge(); *freed_count += 1; - *freed_charge += h_total_charge; - Rollback(h_hashed_key, &h); + FreeDataMarkEmpty(h, allocator_); } } @@ -955,25 +985,30 @@ void ClockHandleTable::Evict(size_t requested_charge, size_t* freed_charge, } } -ClockCacheShard::ClockCacheShard( - size_t capacity, size_t estimated_value_size, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy) +template +ClockCacheShard::ClockCacheShard( + size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, const typename Table::Opts& opts) : CacheShardBase(metadata_charge_policy), - table_( - CalcHashBits(capacity, estimated_value_size, metadata_charge_policy), - /*initial_charge_metadata*/ metadata_charge_policy == - kFullChargeCacheMetadata), + table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator, + opts), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit) { // Initial charge metadata should not exceed capacity - assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(ClockHandle)); + assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl)); } -void ClockCacheShard::EraseUnRefEntries() { table_.EraseUnRefEntries(); } +template +void ClockCacheShard
<Table>::EraseUnRefEntries() { + table_.EraseUnRefEntries(); +} -void ClockCacheShard::ApplyToSomeEntries( - const std::function<void(const Slice& key, void* value, size_t charge, Cache::DeleterFn deleter)>& callback, +template <class Table> +void ClockCacheShard<Table>
::ApplyToSomeEntries( + const std::function& callback, size_t average_entries_per_lock, size_t* state) { // The state is essentially going to be the starting hash, which works // nicely even if we resize between calls because we use upper-most @@ -997,20 +1032,20 @@ void ClockCacheShard::ApplyToSomeEntries( } table_.ConstApplyToEntriesRange( - [callback](const ClockHandle& h) { + [callback](const HandleImpl& h) { UniqueId64x2 unhashed; - callback(ReverseHash(h.hashed_key, &unhashed), h.value, h.total_charge, - h.deleter); + callback(ReverseHash(h.hashed_key, &unhashed), h.value, + h.GetTotalCharge(), h.helper); }, index_begin, index_end, false); } -int ClockCacheShard::CalcHashBits( +int HyperClockTable::CalcHashBits( size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { double average_slot_charge = estimated_value_size * kLoadFactor; if (metadata_charge_policy == kFullChargeCacheMetadata) { - average_slot_charge += sizeof(ClockHandle); + average_slot_charge += sizeof(HandleImpl); } assert(average_slot_charge > 0.0); uint64_t num_slots = @@ -1020,28 +1055,34 @@ int ClockCacheShard::CalcHashBits( if (metadata_charge_policy == kFullChargeCacheMetadata) { // For very small estimated value sizes, it's possible to overshoot while (hash_bits > 0 && - uint64_t{sizeof(ClockHandle)} << hash_bits > capacity) { + uint64_t{sizeof(HandleImpl)} << hash_bits > capacity) { hash_bits--; } } return hash_bits; } -void ClockCacheShard::SetCapacity(size_t capacity) { +template +void ClockCacheShard
<Table>::SetCapacity(size_t capacity) { capacity_.store(capacity, std::memory_order_relaxed); // next Insert will take care of any necessary evictions } -void ClockCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { +template <class Table> +void ClockCacheShard
<Table>::SetStrictCapacityLimit( + bool strict_capacity_limit) { strict_capacity_limit_.store(strict_capacity_limit, std::memory_order_relaxed); // next Insert will take care of any necessary evictions } -Status ClockCacheShard::Insert(const Slice& key, const UniqueId64x2& hashed_key, - void* value, size_t charge, - Cache::DeleterFn deleter, ClockHandle** handle, - Cache::Priority priority) { +template <class Table> +Status ClockCacheShard<Table>
::Insert(const Slice& key, + const UniqueId64x2& hashed_key, + Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, + size_t charge, HandleImpl** handle, + Cache::Priority priority) { if (UNLIKELY(key.size() != kCacheKeySize)) { return Status::NotSupported("ClockCache only supports key size " + std::to_string(kCacheKeySize) + "B"); @@ -1049,24 +1090,25 @@ Status ClockCacheShard::Insert(const Slice& key, const UniqueId64x2& hashed_key, ClockHandleBasicData proto; proto.hashed_key = hashed_key; proto.value = value; - proto.deleter = deleter; + proto.helper = helper; proto.total_charge = charge; - Status s = - table_.Insert(proto, reinterpret_cast(handle), priority, - capacity_.load(std::memory_order_relaxed), - strict_capacity_limit_.load(std::memory_order_relaxed)); + Status s = table_.Insert( + proto, handle, priority, capacity_.load(std::memory_order_relaxed), + strict_capacity_limit_.load(std::memory_order_relaxed)); return s; } -ClockHandle* ClockCacheShard::Lookup(const Slice& key, - const UniqueId64x2& hashed_key) { +template +typename ClockCacheShard
<Table>::HandleImpl* ClockCacheShard
<Table>::Lookup( const Slice& key, const UniqueId64x2& hashed_key) { if (UNLIKELY(key.size() != kCacheKeySize)) { return nullptr; } return table_.Lookup(hashed_key); } -bool ClockCacheShard::Ref(ClockHandle* h) { +template <class Table> +bool ClockCacheShard
<Table>::Ref(HandleImpl* h) { if (h == nullptr) { return false; } @@ -1074,36 +1116,57 @@ bool ClockCacheShard::Ref(ClockHandle* h) { return true; } -bool ClockCacheShard::Release(ClockHandle* handle, bool useful, - bool erase_if_last_ref) { +template <class Table> +bool ClockCacheShard
<Table>::Release(HandleImpl* handle, bool useful, + bool erase_if_last_ref) { if (handle == nullptr) { return false; } return table_.Release(handle, useful, erase_if_last_ref); } -void ClockCacheShard::TEST_RefN(ClockHandle* h, size_t n) { +template <class Table> +void ClockCacheShard
<Table>::TEST_RefN(HandleImpl* h, size_t n) { table_.TEST_RefN(*h, n); } -void ClockCacheShard::TEST_ReleaseN(ClockHandle* h, size_t n) { +template <class Table> +void ClockCacheShard
<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) { table_.TEST_ReleaseN(h, n); } -bool ClockCacheShard::Release(ClockHandle* handle, bool erase_if_last_ref) { +template <class Table> +bool ClockCacheShard
<Table>::Release(HandleImpl* handle, + bool erase_if_last_ref) { return Release(handle, /*useful=*/true, erase_if_last_ref); } -void ClockCacheShard::Erase(const Slice& key, const UniqueId64x2& hashed_key) { +template <class Table> +void ClockCacheShard
<Table>::Erase(const Slice& key, + const UniqueId64x2& hashed_key) { if (UNLIKELY(key.size() != kCacheKeySize)) { return; } table_.Erase(hashed_key); } -size_t ClockCacheShard::GetUsage() const { return table_.GetUsage(); } +template <class Table> +size_t ClockCacheShard
<Table>::GetUsage() const { + return table_.GetUsage(); +} -size_t ClockCacheShard::GetPinnedUsage() const { +template <class Table> +size_t ClockCacheShard
<Table>::GetDetachedUsage() const { + return table_.GetDetachedUsage(); +} + +template <class Table> +size_t ClockCacheShard
<Table>::GetCapacity() const { + return capacity_; +} + +template <class Table> +size_t ClockCacheShard<Table>
::GetPinnedUsage() const { // Computes the pinned usage by scanning the whole hash table. This // is slow, but avoids keeping an exact counter on the clock usage, // i.e., the number of not externally referenced elements. @@ -1114,15 +1177,15 @@ size_t ClockCacheShard::GetPinnedUsage() const { const bool charge_metadata = metadata_charge_policy_ == kFullChargeCacheMetadata; table_.ConstApplyToEntriesRange( - [&table_pinned_usage, charge_metadata](const ClockHandle& h) { + [&table_pinned_usage, charge_metadata](const HandleImpl& h) { uint64_t meta = h.meta.load(std::memory_order_relaxed); uint64_t refcount = GetRefcount(meta); // Holding one ref for ConstApplyToEntriesRange assert(refcount > 0); if (refcount > 1) { - table_pinned_usage += h.total_charge; + table_pinned_usage += h.GetTotalCharge(); if (charge_metadata) { - table_pinned_usage += sizeof(ClockHandle); + table_pinned_usage += sizeof(HandleImpl); } } }, @@ -1131,14 +1194,24 @@ size_t ClockCacheShard::GetPinnedUsage() const { return table_pinned_usage + table_.GetDetachedUsage(); } -size_t ClockCacheShard::GetOccupancyCount() const { +template +size_t ClockCacheShard
::GetOccupancyCount() const { return table_.GetOccupancy(); } -size_t ClockCacheShard::GetTableAddressCount() const { +template +size_t ClockCacheShard
::GetOccupancyLimit() const { + return table_.GetOccupancyLimit(); +} + +template +size_t ClockCacheShard
::GetTableAddressCount() const { return table_.GetTableSize(); } +// Explicit instantiation +template class ClockCacheShard; + HyperClockCache::HyperClockCache( size_t capacity, size_t estimated_value_size, int num_shard_bits, bool strict_capacity_limit, @@ -1151,26 +1224,159 @@ HyperClockCache::HyperClockCache( // TODO: should not need to go through two levels of pointer indirection to // get to table entries size_t per_shard = GetPerShardCapacity(); - InitShards([=](ClockCacheShard* cs) { - new (cs) ClockCacheShard(per_shard, estimated_value_size, - strict_capacity_limit, metadata_charge_policy); + MemoryAllocator* alloc = this->memory_allocator(); + InitShards([=](Shard* cs) { + HyperClockTable::Opts opts; + opts.estimated_value_size = estimated_value_size; + new (cs) Shard(per_shard, strict_capacity_limit, metadata_charge_policy, + alloc, opts); }); } -void* HyperClockCache::Value(Handle* handle) { - return reinterpret_cast(handle)->value; +Cache::ObjectPtr HyperClockCache::Value(Handle* handle) { + return reinterpret_cast(handle)->value; } size_t HyperClockCache::GetCharge(Handle* handle) const { - return reinterpret_cast(handle)->total_charge; + return reinterpret_cast(handle)->GetTotalCharge(); +} + +const Cache::CacheItemHelper* HyperClockCache::GetCacheItemHelper( + Handle* handle) const { + auto h = reinterpret_cast(handle); + return h->helper; } -Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const { - auto h = reinterpret_cast(handle); - return h->deleter; +namespace { + +// For each cache shard, estimate what the table load factor would be if +// cache filled to capacity with average entries. This is considered +// indicative of a potential problem if the shard is essentially operating +// "at limit", which we define as high actual usage (>80% of capacity) +// or actual occupancy very close to limit (>95% of limit). +// Also, for each shard compute the recommended estimated_entry_charge, +// and keep the minimum one for use as overall recommendation. +void AddShardEvaluation(const HyperClockCache::Shard& shard, + std::vector& predicted_load_factors, + size_t& min_recommendation) { + size_t usage = shard.GetUsage() - shard.GetDetachedUsage(); + size_t capacity = shard.GetCapacity(); + double usage_ratio = 1.0 * usage / capacity; + + size_t occupancy = shard.GetOccupancyCount(); + size_t occ_limit = shard.GetOccupancyLimit(); + double occ_ratio = 1.0 * occupancy / occ_limit; + if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) { + // Skip as described above + return; + } + + // If filled to capacity, what would the occupancy ratio be? + double ratio = occ_ratio / usage_ratio; + // Given max load factor, what that load factor be? 
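Aside (not part of the diff): the explicit instantiation above is needed because the templated shard's member functions are defined in the .cc file rather than the header. A minimal standalone sketch of the same pattern, with placeholder names (`ShardSketch`, `TableA`):

```cpp
#include <cstddef>

// Shard templated on a table "policy" class, forwarding to it, as the
// ClockCacheShard<Table> definitions above do.
template <class Table>
class ShardSketch {
 public:
  size_t GetUsage() const { return table_.GetUsage(); }

 private:
  Table table_;
};

struct TableA {
  size_t GetUsage() const { return 0; }
};

// If the member definitions lived in a .cc file, this line would emit them
// for TableA; without it, users of ShardSketch<TableA> would fail to link.
template class ShardSketch<TableA>;
```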
 HyperClockCache::HyperClockCache(
     size_t capacity, size_t estimated_value_size, int num_shard_bits,
     bool strict_capacity_limit,
@@ -1151,26 +1224,159 @@ HyperClockCache::HyperClockCache(
   // TODO: should not need to go through two levels of pointer indirection to
   // get to table entries
   size_t per_shard = GetPerShardCapacity();
-  InitShards([=](ClockCacheShard* cs) {
-    new (cs) ClockCacheShard(per_shard, estimated_value_size,
-                             strict_capacity_limit, metadata_charge_policy);
+  MemoryAllocator* alloc = this->memory_allocator();
+  InitShards([=](Shard* cs) {
+    HyperClockTable::Opts opts;
+    opts.estimated_value_size = estimated_value_size;
+    new (cs) Shard(per_shard, strict_capacity_limit, metadata_charge_policy,
+                   alloc, opts);
   });
 }
 
-void* HyperClockCache::Value(Handle* handle) {
-  return reinterpret_cast<ClockHandle*>(handle)->value;
+Cache::ObjectPtr HyperClockCache::Value(Handle* handle) {
+  return reinterpret_cast<const HandleImpl*>(handle)->value;
 }
 
 size_t HyperClockCache::GetCharge(Handle* handle) const {
-  return reinterpret_cast<ClockHandle*>(handle)->total_charge;
+  return reinterpret_cast<const HandleImpl*>(handle)->GetTotalCharge();
+}
+
+const Cache::CacheItemHelper* HyperClockCache::GetCacheItemHelper(
+    Handle* handle) const {
+  auto h = reinterpret_cast<const HandleImpl*>(handle);
+  return h->helper;
 }
 
-Cache::DeleterFn HyperClockCache::GetDeleter(Handle* handle) const {
-  auto h = reinterpret_cast<ClockHandle*>(handle);
-  return h->deleter;
+namespace {
+
+// For each cache shard, estimate what the table load factor would be if
+// cache filled to capacity with average entries. This is considered
+// indicative of a potential problem if the shard is essentially operating
+// "at limit", which we define as high actual usage (>80% of capacity)
+// or actual occupancy very close to limit (>95% of limit).
+// Also, for each shard compute the recommended estimated_entry_charge,
+// and keep the minimum one for use as overall recommendation.
+void AddShardEvaluation(const HyperClockCache::Shard& shard,
+                        std::vector<double>& predicted_load_factors,
+                        size_t& min_recommendation) {
+  size_t usage = shard.GetUsage() - shard.GetDetachedUsage();
+  size_t capacity = shard.GetCapacity();
+  double usage_ratio = 1.0 * usage / capacity;
+
+  size_t occupancy = shard.GetOccupancyCount();
+  size_t occ_limit = shard.GetOccupancyLimit();
+  double occ_ratio = 1.0 * occupancy / occ_limit;
+  if (usage == 0 || occupancy == 0 || (usage_ratio < 0.8 && occ_ratio < 0.95)) {
+    // Skip as described above
+    return;
+  }
+
+  // If filled to capacity, what would the occupancy ratio be?
+  double ratio = occ_ratio / usage_ratio;
+  // Given max load factor, what would that load factor be?
+  double lf = ratio * kStrictLoadFactor;
+  predicted_load_factors.push_back(lf);
+
+  // Update min_recommendation also
+  size_t recommendation = usage / occupancy;
+  min_recommendation = std::min(min_recommendation, recommendation);
+}
+
+}  // namespace
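Aside (not part of the diff): a worked example of the evaluation above, with made-up shard numbers, showing how the predicted load factor and the recommended `estimated_entry_charge` fall out of the arithmetic:

```cpp
// A hypothetical shard at ~88% usage (past the 80% threshold) and 60%
// occupancy. All values are illustrative, not measured.
#include <cstddef>
#include <cstdio>

int main() {
  size_t usage = 943718400;      // ~900 MiB in use
  size_t capacity = 1073741824;  // 1 GiB shard capacity
  size_t occupancy = 30000;
  size_t occ_limit = 50000;
  double usage_ratio = 1.0 * usage / capacity;     // ~0.879
  double occ_ratio = 1.0 * occupancy / occ_limit;  // 0.6
  // Occupancy ratio if the shard were filled to capacity, scaled by
  // kStrictLoadFactor (0.84 per clock_cache.h above).
  double predicted_lf = (occ_ratio / usage_ratio) * 0.84;  // ~0.57
  // Recommended charge is simply the observed average entry size.
  std::printf("predicted lf=%.2f, recommend charge=%zu\n", predicted_lf,
              usage / occupancy);  // -> ~0.57 and 31457 bytes
}
```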
Higher " + "estimated_entry_charge (about %.1fx) would likely improve " + "performance. Recommend estimated_entry_charge=%zu", + this, kMidSpecLoadFactor / average_load_factor, min_recommendation); + } + } } -} // namespace hyper_clock_cache +} // namespace clock_cache // DEPRECATED (see public API) std::shared_ptr NewClockCache( @@ -1193,7 +1399,7 @@ std::shared_ptr HyperClockCacheOptions::MakeSharedCache() const { constexpr size_t min_shard_size = 32U * 1024U * 1024U; my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); } - return std::make_shared( + return std::make_shared( capacity, estimated_entry_charge, my_num_shard_bits, strict_capacity_limit, metadata_charge_policy, memory_allocator); } diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 53a9de5f0a2..01185849b6d 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -27,12 +27,13 @@ namespace ROCKSDB_NAMESPACE { -namespace hyper_clock_cache { +namespace clock_cache { // Forward declaration of friend class. class ClockCacheTest; -// HyperClockCache is an experimental alternative to LRUCache. +// HyperClockCache is an alternative to LRUCache specifically tailored for +// use as BlockBasedTableOptions::block_cache // // Benefits // -------- @@ -304,23 +305,29 @@ constexpr double kLoadFactor = 0.7; constexpr double kStrictLoadFactor = 0.84; struct ClockHandleBasicData { - void* value = nullptr; - Cache::DeleterFn deleter = nullptr; + Cache::ObjectPtr value = nullptr; + const Cache::CacheItemHelper* helper = nullptr; // A lossless, reversible hash of the fixed-size (16 byte) cache key. This // eliminates the need to store a hash separately. UniqueId64x2 hashed_key = kNullUniqueId64x2; size_t total_charge = 0; + // For total_charge_and_flags + // "Detached" means the handle is allocated separately from hash table. + static constexpr uint64_t kFlagDetached = uint64_t{1} << 63; + // Extract just the total charge + static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1; + + inline size_t GetTotalCharge() const { return total_charge; } + // Calls deleter (if non-null) on cache key and value - void FreeData() const; + void FreeData(MemoryAllocator* allocator) const; // Required by concept HandleImpl const UniqueId64x2& GetHash() const { return hashed_key; } }; -// Target size to be exactly a common cache line size (see static_assert in -// clock_cache.cc) -struct ALIGN_AS(64U) ClockHandle : public ClockHandleBasicData { +struct ClockHandle : public ClockHandleBasicData { // Constants for handling the atomic `meta` word, which tracks most of the // state of the handle. The meta word looks like this: // low bits high bits @@ -372,32 +379,54 @@ struct ALIGN_AS(64U) ClockHandle : public ClockHandleBasicData { // See above std::atomic meta{}; - // The number of elements that hash to this slot or a lower one, but wind - // up in this slot or a higher one. - std::atomic displacements{}; - // True iff the handle is allocated separately from hash table. 
 
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index 53a9de5f0a2..01185849b6d 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -27,12 +27,13 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-namespace hyper_clock_cache {
+namespace clock_cache {
 
 // Forward declaration of friend class.
 class ClockCacheTest;
 
-// HyperClockCache is an experimental alternative to LRUCache.
+// HyperClockCache is an alternative to LRUCache specifically tailored for
+// use as BlockBasedTableOptions::block_cache
 //
 // Benefits
 // --------
@@ -304,23 +305,29 @@ constexpr double kLoadFactor = 0.7;
 constexpr double kStrictLoadFactor = 0.84;
 
 struct ClockHandleBasicData {
-  void* value = nullptr;
-  Cache::DeleterFn deleter = nullptr;
+  Cache::ObjectPtr value = nullptr;
+  const Cache::CacheItemHelper* helper = nullptr;
   // A lossless, reversible hash of the fixed-size (16 byte) cache key. This
   // eliminates the need to store a hash separately.
   UniqueId64x2 hashed_key = kNullUniqueId64x2;
   size_t total_charge = 0;
 
+  // For total_charge_and_flags
+  // "Detached" means the handle is allocated separately from hash table.
+  static constexpr uint64_t kFlagDetached = uint64_t{1} << 63;
+  // Extract just the total charge
+  static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1;
+
+  inline size_t GetTotalCharge() const { return total_charge; }
+
   // Calls deleter (if non-null) on cache key and value
-  void FreeData() const;
+  void FreeData(MemoryAllocator* allocator) const;
 
   // Required by concept HandleImpl
   const UniqueId64x2& GetHash() const { return hashed_key; }
 };
 
-// Target size to be exactly a common cache line size (see static_assert in
-// clock_cache.cc)
-struct ALIGN_AS(64U) ClockHandle : public ClockHandleBasicData {
+struct ClockHandle : public ClockHandleBasicData {
   // Constants for handling the atomic `meta` word, which tracks most of the
   // state of the handle. The meta word looks like this:
   // low bits                                                       high bits
@@ -372,32 +379,54 @@ struct ALIGN_AS(64U) ClockHandle : public ClockHandleBasicData {
 
   // See above
   std::atomic<uint64_t> meta{};
 
-  // The number of elements that hash to this slot or a lower one, but wind
-  // up in this slot or a higher one.
-  std::atomic<uint32_t> displacements{};
-
-  // True iff the handle is allocated separately from hash table.
-  bool detached = false;
+  // Anticipating use for SecondaryCache support
+  void* reserved_for_future_use = nullptr;
 };  // struct ClockHandle
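Aside (not part of the diff): the handle state lives in the single atomic `meta` word described by the comment above. The exact bit-layout constants are elided by the hunk, so the following is an illustrative sketch only, assuming acquire/release counters packed into the low bits:

```cpp
#include <cstdint>

namespace sketch {
// Assumed layout for illustration: acquire count in the low bits, release
// count in the next bits. Not the exact constants from clock_cache.h.
constexpr uint64_t kCounterNumBits = 30;
constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;

// Outstanding external references = acquires - releases (mod 2^30), which
// is what GetRefcount(meta) computes in GetPinnedUsage() above.
inline uint64_t GetRefcount(uint64_t meta) {
  uint64_t acquires = meta & kCounterMask;
  uint64_t releases = (meta >> kCounterNumBits) & kCounterMask;
  return (acquires - releases) & kCounterMask;
}
}  // namespace sketch
```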
 
-class ClockHandleTable {
+class HyperClockTable {
  public:
-  explicit ClockHandleTable(int hash_bits, bool initial_charge_metadata);
-  ~ClockHandleTable();
+  // Target size to be exactly a common cache line size (see static_assert in
+  // clock_cache.cc)
+  struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
+    // The number of elements that hash to this slot or a lower one, but wind
+    // up in this slot or a higher one.
+    std::atomic<uint32_t> displacements{};
+
+    // Whether this is a "detached" handle that is independently allocated
+    // with `new` (so must be deleted with `delete`).
+    // TODO: ideally this would be packed into some other data field, such
+    // as upper bits of total_charge, but that incurs a measurable performance
+    // regression.
+    bool detached = false;
+
+    inline bool IsDetached() const { return detached; }
+
+    inline void SetDetached() { detached = true; }
+  };  // struct HandleImpl
+
+  struct Opts {
+    size_t estimated_value_size;
+  };
+
+  HyperClockTable(size_t capacity, bool strict_capacity_limit,
+                  CacheMetadataChargePolicy metadata_charge_policy,
+                  MemoryAllocator* allocator, const Opts& opts);
+  ~HyperClockTable();
 
-  Status Insert(const ClockHandleBasicData& proto, ClockHandle** handle,
+  Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
                 Cache::Priority priority, size_t capacity,
                 bool strict_capacity_limit);
 
-  ClockHandle* Lookup(const UniqueId64x2& hashed_key);
+  HandleImpl* Lookup(const UniqueId64x2& hashed_key);
 
-  bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref);
+  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
 
-  void Ref(ClockHandle& handle);
+  void Ref(HandleImpl& handle);
 
   void Erase(const UniqueId64x2& hashed_key);
 
-  void ConstApplyToEntriesRange(std::function<void(const ClockHandle& h)> func,
+  void ConstApplyToEntriesRange(std::function<void(const HandleImpl& h)> func,
                                 size_t index_begin, size_t index_end,
                                 bool apply_if_will_be_deleted) const;
 
@@ -407,12 +436,12 @@ class ClockHandleTable {
 
   int GetLengthBits() const { return length_bits_; }
 
-  size_t GetOccupancyLimit() const { return occupancy_limit_; }
-
   size_t GetOccupancy() const {
     return occupancy_.load(std::memory_order_relaxed);
   }
 
+  size_t GetOccupancyLimit() const { return occupancy_limit_; }
+
   size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
 
   size_t GetDetachedUsage() const {
@@ -420,8 +449,8 @@ class ClockHandleTable {
   }
 
   // Acquire/release N references
-  void TEST_RefN(ClockHandle& handle, size_t n);
-  void TEST_ReleaseN(ClockHandle* handle, size_t n);
+  void TEST_RefN(HandleImpl& handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 
  private:  // functions
   // Returns x mod 2^{length_bits_}.
@@ -432,8 +461,8 @@ class ClockHandleTable {
   // Runs the clock eviction algorithm trying to reclaim at least
   // requested_charge. Returns how much is evicted, which could be less
   // if it appears impossible to evict the requested amount without blocking.
-  void Evict(size_t requested_charge, size_t* freed_charge,
-             size_t* freed_count);
+  inline void Evict(size_t requested_charge, size_t* freed_charge,
+                    size_t* freed_count);
 
   // Returns the first slot in the probe sequence, starting from the given
   // probe number, with a handle e such that match(e) is true. At every
@@ -446,15 +475,56 @@ class ClockHandleTable {
   // value of probe is one more than the last non-aborting probe during the
   // call. This is so that the variable can be used to keep track of
   // progress across consecutive calls to FindSlot.
-  inline ClockHandle* FindSlot(const UniqueId64x2& hashed_key,
-                               std::function<bool(ClockHandle*)> match,
-                               std::function<bool(ClockHandle*)> stop,
-                               std::function<void(ClockHandle*)> update,
-                               size_t& probe);
+  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
+                              std::function<bool(HandleImpl*)> match,
+                              std::function<bool(HandleImpl*)> stop,
+                              std::function<void(HandleImpl*)> update,
+                              size_t& probe);
 
   // Re-decrement all displacements in probe path starting from beginning
   // until (not including) the given handle
-  void Rollback(const UniqueId64x2& hashed_key, const ClockHandle* h);
+  inline void Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h);
+
+  // Subtracts `total_charge` from `usage_` and 1 from `occupancy_`.
+  // Ideally this comes after releasing the entry itself so that we
+  // actually have the available occupancy/usage that is claimed.
+  // However, that means total_charge has to be saved from the handle
+  // before releasing it so that it can be provided to this function.
+  inline void ReclaimEntryUsage(size_t total_charge);
+
+  // Helper for updating `usage_` for new entry with given `total_charge`
+  // and evicting if needed under strict_capacity_limit=true rules. This
+  // means the operation might fail with Status::MemoryLimit. If
+  // `need_evict_for_occupancy`, then eviction of at least one entry is
+  // required, and the operation should fail if not possible.
+  // NOTE: Otherwise, occupancy_ is not managed in this function
+  inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
+                                            size_t capacity,
+                                            bool need_evict_for_occupancy);
+
+  // Helper for updating `usage_` for new entry with given `total_charge`
+  // and evicting if needed under strict_capacity_limit=false rules. This
+  // means that updating `usage_` always succeeds even if forced to exceed
+  // capacity. If `need_evict_for_occupancy`, then eviction of at least one
+  // entry is required, and the operation should return false if such eviction
+  // is not possible. `usage_` is not updated in that case. Otherwise, returns
+  // true, indicating success.
+  // NOTE: occupancy_ is not managed in this function
+  inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
+                                             size_t capacity,
+                                             bool need_evict_for_occupancy);
+
+  // Creates a "detached" handle for returning from an Insert operation that
+  // cannot be completed by actually inserting into the table.
+  // Updates `detached_usage_` but not `usage_` nor `occupancy_`.
+  inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto);
+
+  MemoryAllocator* GetAllocator() const { return allocator_; }
+
+  // Returns the number of bits used to hash an element in the hash
+  // table.
+  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
+                          CacheMetadataChargePolicy metadata_charge_policy);
 
  private:  // data
   // Number of hash bits used for table index.
@@ -468,7 +538,10 @@ class ClockHandleTable {
   const size_t occupancy_limit_;
 
   // Array of slots comprising the hash table.
-  const std::unique_ptr<ClockHandle[]> array_;
+  const std::unique_ptr<HandleImpl[]> array_;
+
+  // From Cache, for deleter
+  MemoryAllocator* const allocator_;
 
   // We partition the following members into different cache lines
   // to avoid false sharing among Lookup, Release, Erase and Insert
@@ -487,17 +560,18 @@ class ClockHandleTable {
 
   // Part of usage by detached entries (not in table)
   std::atomic<size_t> detached_usage_{};
-};  // class ClockHandleTable
+};  // class HyperClockTable
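Aside (not part of the diff): a condensed, non-atomic sketch of the strict-capacity rule that `ChargeUsageMaybeEvictStrict()` documents above. Names and the `evict` callback are placeholders; the real code operates on atomics and calls the clock `Evict()`:

```cpp
#include <cstddef>
#include <functional>

enum class ChargeResult { kOk, kMemoryLimit };

// evict(need) tries to free at least `need` bytes and returns how much it
// actually freed, which may be less.
ChargeResult ChargeStrictSketch(size_t& usage, size_t total_charge,
                                size_t capacity,
                                bool need_evict_for_occupancy,
                                const std::function<size_t(size_t)>& evict) {
  size_t need =
      usage + total_charge > capacity ? usage + total_charge - capacity : 0;
  if (need > 0 || need_evict_for_occupancy) {
    size_t freed = evict(need);
    usage -= freed;
    if (freed < need || (need_evict_for_occupancy && freed == 0)) {
      // Cannot satisfy the charge limit (or the occupancy requirement):
      // fail without charging the new entry.
      return ChargeResult::kMemoryLimit;
    }
  }
  usage += total_charge;
  return ChargeResult::kOk;
}
```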
 
 // A single shard of sharded cache.
+template <class Table>
 class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
  public:
-  ClockCacheShard(size_t capacity, size_t estimated_value_size,
-                  bool strict_capacity_limit,
-                  CacheMetadataChargePolicy metadata_charge_policy);
+  ClockCacheShard(size_t capacity, bool strict_capacity_limit,
+                  CacheMetadataChargePolicy metadata_charge_policy,
+                  MemoryAllocator* allocator, const typename Table::Opts& opts);
 
   // For CacheShard concept
-  using HandleImpl = ClockHandle;
+  using HandleImpl = typename Table::HandleImpl;
   // Hash is lossless hash of 128-bit key
   using HashVal = UniqueId64x2;
   using HashCref = const HashVal&;
@@ -531,74 +605,62 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
 
   void SetStrictCapacityLimit(bool strict_capacity_limit);
 
-  Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
-                size_t charge, Cache::DeleterFn deleter, ClockHandle** handle,
-                Cache::Priority priority);
+  Status Insert(const Slice& key, const UniqueId64x2& hashed_key,
+                Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
+                size_t charge, HandleImpl** handle, Cache::Priority priority);
 
-  ClockHandle* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
+  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
 
-  bool Release(ClockHandle* handle, bool useful, bool erase_if_last_ref);
+  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
 
-  bool Release(ClockHandle* handle, bool erase_if_last_ref = false);
+  bool Release(HandleImpl* handle, bool erase_if_last_ref = false);
 
-  bool Ref(ClockHandle* handle);
+  bool Ref(HandleImpl* handle);
 
   void Erase(const Slice& key, const UniqueId64x2& hashed_key);
 
+  size_t GetCapacity() const;
+
   size_t GetUsage() const;
 
+  size_t GetDetachedUsage() const;
+
   size_t GetPinnedUsage() const;
 
   size_t GetOccupancyCount() const;
 
+  size_t GetOccupancyLimit() const;
+
   size_t GetTableAddressCount() const;
 
   void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               DeleterFn deleter)>& callback,
+      const std::function<void(const Slice& key, Cache::ObjectPtr obj,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper)>& callback,
       size_t average_entries_per_lock, size_t* state);
 
   void EraseUnRefEntries();
 
   std::string GetPrintableOptions() const { return std::string{}; }
 
-  // SecondaryCache not yet supported
-  Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
-                const Cache::CacheItemHelper* helper, size_t charge,
-                ClockHandle** handle, Cache::Priority priority) {
-    return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
-                  priority);
-  }
-
-  ClockHandle* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
-                      const Cache::CacheItemHelper* /*helper*/,
-                      const Cache::CreateCallback& /*create_cb*/,
-                      Cache::Priority /*priority*/, bool /*wait*/,
-                      Statistics* /*stats*/) {
+  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
+                     const Cache::CacheItemHelper* /*helper*/,
+                     Cache::CreateContext* /*create_context*/,
+                     Cache::Priority /*priority*/, bool /*wait*/,
+                     Statistics* /*stats*/) {
     return Lookup(key, hashed_key);
   }
 
-  bool IsReady(ClockHandle* /*handle*/) { return true; }
+  bool IsReady(HandleImpl* /*handle*/) { return true; }
 
-  void Wait(ClockHandle* /*handle*/) {}
+  void Wait(HandleImpl* /*handle*/) {}
 
   // Acquire/release N references
-  void TEST_RefN(ClockHandle* handle, size_t n);
-  void TEST_ReleaseN(ClockHandle* handle, size_t n);
-
- private:  // functions
-  friend class ClockCache;
-  friend class ClockCacheTest;
-
-  ClockHandle* DetachedInsert(const ClockHandleBasicData& h);
-
-  // Returns the number of bits used to hash an element in the hash
-  // table.
-  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
-                          CacheMetadataChargePolicy metadata_charge_policy);
+  void TEST_RefN(HandleImpl* handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, size_t n);
 
  private:  // data
-  ClockHandleTable table_;
+  Table table_;
 
   // Maximum total charge of all elements stored in the table.
   std::atomic<size_t> capacity_;
 
@@ -611,8 +673,10 @@ class HyperClockCache
 #ifdef NDEBUG
     final
 #endif
-    : public ShardedCache<ClockCacheShard> {
+    : public ShardedCache<ClockCacheShard<HyperClockTable>> {
  public:
+  using Shard = ClockCacheShard<HyperClockTable>;
+
   HyperClockCache(size_t capacity, size_t estimated_value_size,
                   int num_shard_bits, bool strict_capacity_limit,
                   CacheMetadataChargePolicy metadata_charge_policy,
@@ -620,13 +684,16 @@ class HyperClockCache
 
   const char* Name() const override { return "HyperClockCache"; }
 
-  void* Value(Handle* handle) override;
+  Cache::ObjectPtr Value(Handle* handle) override;
 
   size_t GetCharge(Handle* handle) const override;
 
-  DeleterFn GetDeleter(Handle* handle) const override;
+  const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
+
+  void ReportProblems(
+      const std::shared_ptr<Logger>& /*info_log*/) const override;
 };  // class HyperClockCache
 
-}  // namespace hyper_clock_cache
+}  // namespace clock_cache
 
 }  // namespace ROCKSDB_NAMESPACE
 
diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index 7d1bdc78953..23154d4f2a5 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -37,8 +37,10 @@ CompressedSecondaryCache::CompressedSecondaryCache(
 
 CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }
 
 std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
-    const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
-    bool advise_erase, bool& is_in_sec_cache) {
+    const Slice& key, const Cache::CacheItemHelper* helper,
+    Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
+    bool& is_in_sec_cache) {
+  assert(helper);
   std::unique_ptr<SecondaryCacheResultHandle> handle;
   is_in_sec_cache = false;
   Cache::Handle* lru_handle = cache_->Lookup(key);
@@ -64,12 +66,14 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
     handle_value_charge = cache_->GetCharge(lru_handle);
   }
+  MemoryAllocator* allocator = cache_options_.memory_allocator.get();
 
   Status s;
-  void* value{nullptr};
+  Cache::ObjectPtr value{nullptr};
   size_t charge{0};
   if (cache_options_.compression_type == kNoCompression) {
-    s = create_cb(ptr->get(), handle_value_charge, &value, &charge);
+    s = helper->create_cb(Slice(ptr->get(), handle_value_charge),
+                          create_context, allocator, &value, &charge);
   } else {
     UncompressionContext uncompression_context(cache_options_.compression_type);
     UncompressionInfo uncompression_info(uncompression_context,
@@ -79,14 +83,14 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
 
     size_t uncompressed_size{0};
     CacheAllocationPtr uncompressed = UncompressData(
         uncompression_info, (char*)ptr->get(), handle_value_charge,
-        &uncompressed_size, cache_options_.compress_format_version,
-        cache_options_.memory_allocator.get());
+        &uncompressed_size, cache_options_.compress_format_version, allocator);
 
     if (!uncompressed) {
       cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
       return nullptr;
     }
-    s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge);
+    s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
+                          create_context, allocator, &value, &charge);
   }
 
   if (!s.ok()) {
@@ -98,8 +102,9 @@
       cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
       // Insert a dummy handle.
       cache_
-          ->Insert(key, /*value=*/nullptr, /*charge=*/0,
-                   GetDeletionCallback(cache_options_.enable_custom_split_merge))
+          ->Insert(key, /*obj=*/nullptr,
+                   GetHelper(cache_options_.enable_custom_split_merge),
+                   /*charge=*/0)
           .PermitUncheckedError();
     } else {
       is_in_sec_cache = true;
@@ -109,19 +114,20 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
   return handle;
 }
 
-Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
+Status CompressedSecondaryCache::Insert(const Slice& key,
+                                        Cache::ObjectPtr value,
                                         const Cache::CacheItemHelper* helper) {
   if (value == nullptr) {
     return Status::InvalidArgument();
   }
 
   Cache::Handle* lru_handle = cache_->Lookup(key);
-  Cache::DeleterFn del_cb =
-      GetDeletionCallback(cache_options_.enable_custom_split_merge);
+  auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
   if (lru_handle == nullptr) {
     PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
     // Insert a dummy handle if the handle is evicted for the first time.
-    return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb);
+    return cache_->Insert(key, /*obj=*/nullptr, internal_helper,
+                          /*charge=*/0);
   } else {
     cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
   }
@@ -169,10 +175,10 @@ Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
     size_t charge{0};
     CacheValueChunk* value_chunks_head =
         SplitValueIntoChunks(val, cache_options_.compression_type, charge);
-    return cache_->Insert(key, value_chunks_head, charge, del_cb);
+    return cache_->Insert(key, value_chunks_head, internal_helper, charge);
   } else {
     CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
-    return cache_->Insert(key, buf, size, del_cb);
+    return cache_->Insert(key, buf, internal_helper, size);
   }
 }
 
@@ -276,23 +282,29 @@ CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
   return ptr;
 }
 
-Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback(
-    bool enable_custom_split_merge) {
+const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
+    bool enable_custom_split_merge) const {
   if (enable_custom_split_merge) {
-    return [](const Slice& /*key*/, void* obj) {
-      CacheValueChunk* chunks_head = reinterpret_cast<CacheValueChunk*>(obj);
-      while (chunks_head != nullptr) {
-        CacheValueChunk* tmp_chunk = chunks_head;
-        chunks_head = chunks_head->next;
-        tmp_chunk->Free();
-        obj = nullptr;
-      };
-    };
+    static const Cache::CacheItemHelper kHelper{
+        CacheEntryRole::kMisc,
+        [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
+          CacheValueChunk* chunks_head = static_cast<CacheValueChunk*>(obj);
+          while (chunks_head != nullptr) {
+            CacheValueChunk* tmp_chunk = chunks_head;
+            chunks_head = chunks_head->next;
+            tmp_chunk->Free();
+            obj = nullptr;
+          };
+        }};
+    return &kHelper;
   } else {
-    return [](const Slice& /*key*/, void* obj) {
-      delete reinterpret_cast<CacheAllocationPtr*>(obj);
-      obj = nullptr;
-    };
+    static const Cache::CacheItemHelper kHelper{
+        CacheEntryRole::kMisc,
+        [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
+          delete static_cast<CacheAllocationPtr*>(obj);
+          obj = nullptr;
+        }};
+    return &kHelper;
   }
 }
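Aside (not part of the diff): `GetHelper()` above illustrates the new deleter idiom, a function-local static `CacheItemHelper` returned by pointer instead of a per-insert `DeleterFn`. The same pattern for an arbitrary heap-allocated value type (`MyValue` is a made-up name, and this assumes the post-refactor `Cache` API in `rocksdb/cache.h`):

```cpp
#include <string>

#include "rocksdb/cache.h"

struct MyValue {
  std::string payload;
};

const rocksdb::Cache::CacheItemHelper* GetMyValueHelper() {
  // One static helper serves every MyValue entry; its del_cb frees the
  // object when the cache drops the last reference.
  static const rocksdb::Cache::CacheItemHelper kHelper{
      rocksdb::CacheEntryRole::kMisc,
      [](rocksdb::Cache::ObjectPtr obj, rocksdb::MemoryAllocator* /*alloc*/) {
        delete static_cast<MyValue*>(obj);
      }};
  return &kHelper;
}
```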
 
diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h
index 4dee388021c..e38a1a861e1 100644
--- a/cache/compressed_secondary_cache.h
+++ b/cache/compressed_secondary_cache.h
@@ -21,7 +21,7 @@ namespace ROCKSDB_NAMESPACE {
 class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
  public:
-  CompressedSecondaryCacheResultHandle(void* value, size_t size)
+  CompressedSecondaryCacheResultHandle(Cache::ObjectPtr value, size_t size)
       : value_(value), size_(size) {}
 
   ~CompressedSecondaryCacheResultHandle() override = default;
@@ -34,12 +34,12 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
 
   void Wait() override {}
 
-  void* Value() override { return value_; }
+  Cache::ObjectPtr Value() override { return value_; }
 
   size_t Size() override { return size_; }
 
  private:
-  void* value_;
+  Cache::ObjectPtr value_;
   size_t size_;
 };
 
@@ -83,12 +83,13 @@ class CompressedSecondaryCache : public SecondaryCache {
 
   const char* Name() const override { return "CompressedSecondaryCache"; }
 
-  Status Insert(const Slice& key, void* value,
+  Status Insert(const Slice& key, Cache::ObjectPtr value,
                 const Cache::CacheItemHelper* helper) override;
 
   std::unique_ptr<SecondaryCacheResultHandle> Lookup(
-      const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
-      bool advise_erase, bool& is_in_sec_cache) override;
+      const Slice& key, const Cache::CacheItemHelper* helper,
+      Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
+      bool& is_in_sec_cache) override;
 
   bool SupportForceErase() const override { return true; }
 
@@ -129,8 +130,8 @@ class CompressedSecondaryCache : public SecondaryCache {
   CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
                                           size_t& charge);
 
-  // An implementation of Cache::DeleterFn.
-  static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge);
+  // TODO: clean up to use cleaner interfaces in typed_cache.h
+  const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
 
   std::shared_ptr<Cache> cache_;
   CompressedSecondaryCacheOptions cache_options_;
   mutable port::Mutex capacity_mutex_;
 
diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc
index 574c257a7db..c13b8b390fb 100644
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@@ -16,7 +16,8 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-class CompressedSecondaryCacheTest : public testing::Test {
+class CompressedSecondaryCacheTest : public testing::Test,
+                                     public Cache::CreateContext {
  public:
  CompressedSecondaryCacheTest() : fail_create_(false) {}
  ~CompressedSecondaryCacheTest() override = default;
@@ -37,13 +38,13 @@ class CompressedSecondaryCacheTest : public testing::Test {
     size_t size_;
   };
 
-  static size_t SizeCallback(void* obj) {
-    return reinterpret_cast<TestItem*>(obj)->Size();
+  static size_t SizeCallback(Cache::ObjectPtr obj) {
+    return static_cast<TestItem*>(obj)->Size();
   }
 
-  static Status SaveToCallback(void* from_obj, size_t from_offset,
-                               size_t length, void* out) {
-    auto item = reinterpret_cast<TestItem*>(from_obj);
+  static Status SaveToCallback(Cache::ObjectPtr from_obj, size_t from_offset,
+                               size_t length, char* out) {
+    auto item = static_cast<TestItem*>(from_obj);
     const char* buf = item->Buf();
     EXPECT_EQ(length, item->Size());
     EXPECT_EQ(from_offset, 0);
@@ -51,30 +52,36 @@ class CompressedSecondaryCacheTest : public testing::Test {
     return Status::OK();
   }
 
-  static void DeletionCallback(const Slice& /*key*/, void* obj) {
-    delete reinterpret_cast<TestItem*>(obj);
+  static void DeletionCallback(Cache::ObjectPtr obj,
+                               MemoryAllocator* /*alloc*/) {
+    delete static_cast<TestItem*>(obj);
     obj = nullptr;
   }
 
-  static Cache::CacheItemHelper helper_;
-
-  static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
-                                   size_t /*size*/, void* /*out*/) {
+  static Status SaveToCallbackFail(Cache::ObjectPtr /*obj*/, size_t /*offset*/,
+                                   size_t /*size*/, char* /*out*/) {
     return Status::NotSupported();
   }
 
-  static Cache::CacheItemHelper helper_fail_;
-
-  Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
-                                                void** out_obj,
-                                                size_t* charge) -> Status {
-    if (fail_create_) {
+  static Status CreateCallback(const Slice& data, Cache::CreateContext* context,
+                               MemoryAllocator* /*allocator*/,
+                               Cache::ObjectPtr* out_obj, size_t* out_charge) {
+    auto t = static_cast<CompressedSecondaryCacheTest*>(context);
+    if (t->fail_create_) {
       return Status::NotSupported();
     }
-    *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
-    *charge = size;
+    *out_obj = new TestItem(data.data(), data.size());
+    *out_charge = data.size();
     return Status::OK();
-  };
+  }
+
+  static constexpr Cache::CacheItemHelper kHelper{
+      CacheEntryRole::kMisc, &DeletionCallback, &SizeCallback, &SaveToCallback,
+      &CreateCallback};
+
+  static constexpr Cache::CacheItemHelper kHelperFail{
+      CacheEntryRole::kMisc, &DeletionCallback, &SizeCallback,
+      &SaveToCallbackFail, &CreateCallback};
 
   void SetFailCreate(bool fail) { fail_create_ = fail; }
 
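Aside (not part of the diff): the tests below repeatedly verify the cache's two-pass admission behavior, where the first `Insert` of a key stores only a zero-charge dummy entry and the value is stored for real only on a repeat insertion. A condensed, illustrative pseudo-implementation of that policy (not the real `CompressedSecondaryCache`):

```cpp
#include <string>
#include <unordered_map>

class TwoPassAdmissionSketch {
 public:
  // Returns true only when the value is actually stored.
  bool Insert(const std::string& key, const std::string& value) {
    auto it = seen_.find(key);
    if (it == seen_.end()) {
      // First eviction of this key: remember the key, drop the value.
      seen_.emplace(key, std::string{});
      return false;
    }
    // Second touch: the key is evicted repeatedly, so caching pays off.
    it->second = value;
    return true;
  }

 private:
  std::unordered_map<std::string, std::string> seen_;
};
```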
@@ -84,7 +91,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
     bool is_in_sec_cache{true};
     // Lookup a non-existent key.
     std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup(
-        "k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+        "k0", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache);
     ASSERT_EQ(handle0, nullptr);
 
     Random rnd(301);
@@ -92,23 +99,21 @@
     std::string str1(rnd.RandomString(1000));
     TestItem item1(str1.data(), str1.length());
     // A dummy handle is inserted if the item is inserted for the first time.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
 
     std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+        "k1", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache);
     ASSERT_EQ(handle1_1, nullptr);
 
     // Insert and Lookup the item k1 for the second time and advise erasing it.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
 
     std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+        "k1", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache);
     ASSERT_NE(handle1_2, nullptr);
     ASSERT_FALSE(is_in_sec_cache);
     if (sec_cache_is_compressed) {
@@ -128,21 +133,19 @@
 
     // Lookup the item k1 again.
     std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+        "k1", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache);
     ASSERT_EQ(handle1_3, nullptr);
 
     // Insert and Lookup the item k2.
     std::string str2(rnd.RandomString(1000));
     TestItem item2(str2.data(), str2.length());
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
     std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+        "k2", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache);
     ASSERT_EQ(handle2_1, nullptr);
 
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
     if (sec_cache_is_compressed) {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@@ -154,7 +157,7 @@
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
     }
     std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+        "k2", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache);
     ASSERT_NE(handle2_2, nullptr);
     std::unique_ptr<TestItem> val2 =
         std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value()));
@@ -223,28 +226,24 @@
 
     std::string str1(rnd.RandomString(1000));
     TestItem item1(str1.data(), str1.length());
     // Insert a dummy handle.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper));
     // Insert k1.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper));
 
     // Insert and Lookup the second item.
     std::string str2(rnd.RandomString(200));
     TestItem item2(str2.data(), str2.length());
     // Insert a dummy handle, k1 is not evicted.
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper));
     bool is_in_sec_cache{false};
     std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+        "k1", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache);
     ASSERT_EQ(handle1, nullptr);
 
     // Insert k2 and k1 is evicted.
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper));
     std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+        "k2", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache);
     ASSERT_NE(handle2, nullptr);
     std::unique_ptr<TestItem> val2 =
         std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
@@ -252,27 +251,24 @@
     ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
 
     // Insert k1 again and a dummy handle is inserted.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper));
     std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+        "k1", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache);
     ASSERT_EQ(handle1_1, nullptr);
 
     // Create Fails.
     SetFailCreate(true);
     std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+        "k2", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache);
     ASSERT_EQ(handle2_1, nullptr);
 
     // Save Fails.
     std::string str3 = rnd.RandomString(10);
     TestItem item3(str3.data(), str3.length());
     // The Status is OK because a dummy handle is inserted.
-    ASSERT_OK(sec_cache->Insert("k3", &item3,
-                                &CompressedSecondaryCacheTest::helper_fail_));
-    ASSERT_NOK(sec_cache->Insert("k3", &item3,
-                                 &CompressedSecondaryCacheTest::helper_fail_));
+    ASSERT_OK(sec_cache->Insert("k3", &item3, &kHelperFail));
+    ASSERT_NOK(sec_cache->Insert("k3", &item3, &kHelperFail));
 
     sec_cache.reset();
   }
@@ -309,15 +305,13 @@ class CompressedSecondaryCacheTest : public testing::Test {
     Random rnd(301);
     std::string str1 = rnd.RandomString(1001);
     auto item1_1 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert("k1", item1_1, &kHelper, str1.length()));
 
     std::string str2 = rnd.RandomString(1012);
     auto item2_1 = new TestItem(str2.data(), str2.length());
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert("k2", item2_1, &kHelper, str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -326,22 +320,19 @@
     std::string str3 = rnd.RandomString(1024);
     auto item3_1 = new TestItem(str3.data(), str3.length());
     // After this Insert, primary cache contains k3 and secondary cache contains
     // k1's dummy item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length()));
+    ASSERT_OK(cache->Insert("k3", item3_1, &kHelper, str3.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
 
     // After this Insert, primary cache contains k1 and secondary cache contains
     // k1's dummy item, k2's dummy item, and k3's dummy item.
     auto item1_2 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert("k1", item1_2, &kHelper, str1.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
 
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's item, k2's dummy item, and k3's dummy item.
     auto item2_2 = new TestItem(str2.data(), str2.length());
-    ASSERT_OK(cache->Insert(
-        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert("k2", item2_2, &kHelper, str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
     if (sec_cache_is_compressed) {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@@ -356,8 +347,7 @@
 
     // After this Insert, primary cache contains k3 and secondary cache contains
     // k1's item and k2's item.
     auto item3_2 = new TestItem(str3.data(), str3.length());
-    ASSERT_OK(cache->Insert(
-        "k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length()));
+    ASSERT_OK(cache->Insert("k3", item3_2, &kHelper, str3.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
     if (sec_cache_is_compressed) {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@@ -370,8 +360,7 @@
     }
 
     Cache::Handle* handle;
-    handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k3", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_NE(handle, nullptr);
     auto val3 = static_cast<TestItem*>(cache->Value(handle));
     ASSERT_NE(val3, nullptr);
     ASSERT_EQ(memcmp(val3->Buf(), item3_2->Buf(), item3_2->Size()), 0);
     cache->Release(handle);
 
     // Lookup a non-existent key.
-    handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k0", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_EQ(handle, nullptr);
 
     // This Lookup should just insert a dummy handle in the primary cache
     // and the k1 is still in the secondary cache.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_NE(handle, nullptr);
     ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@@ -400,8 +387,7 @@
 
     // This Lookup should erase k1 from the secondary cache and insert
     // it into primary cache; then k3 is demoted.
     // k2 and k3 are in secondary cache.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_NE(handle, nullptr);
     ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
     cache->Release(handle);
 
     // k2 is still in secondary cache.
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_NE(handle, nullptr);
     ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2);
 
     // Testing SetCapacity().
     ASSERT_OK(secondary_cache->SetCapacity(0));
-    handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k3", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_EQ(handle, nullptr);
 
@@ -429,35 +413,30 @@
     ASSERT_EQ(capacity, 7000);
     auto item1_3 = new TestItem(str1.data(), str1.length());
     // After this Insert, primary cache contains k1.
-    ASSERT_OK(cache->Insert(
-        "k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert("k1", item1_3, &kHelper, str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4);
 
     auto item2_3 = new TestItem(str2.data(), str2.length());
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert("k2", item2_3, &kHelper, str1.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4);
 
     auto item1_4 = new TestItem(str1.data(), str1.length());
     // After this Insert, primary cache contains k1 and secondary cache contains
     // k1's dummy item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert("k1", item1_4, &kHelper, str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5);
 
     auto item2_4 = new TestItem(str2.data(), str2.length());
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's real item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert("k2", item2_4, &kHelper, str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5);
 
     // This Lookup should just insert a dummy handle in the primary cache
     // and the k1 is still in the secondary cache.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true,
                            stats.get());
     ASSERT_NE(handle, nullptr);
 
@@ -496,18 +475,13 @@
     Random rnd(301);
     std::string str1 = rnd.RandomString(1001);
     auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
-    ASSERT_NOK(cache->Insert("k1", item1.get(), nullptr, str1.length()));
-    ASSERT_OK(cache->Insert("k1", item1.get(),
-                            &CompressedSecondaryCacheTest::helper_,
-                            str1.length()));
+    ASSERT_OK(cache->Insert("k1", item1.get(), &kHelper, str1.length()));
     item1.release();  // Appease clang-analyze "potential memory leak"
 
     Cache::Handle* handle;
-    handle = cache->Lookup("k2", nullptr, test_item_creator,
-                           Cache::Priority::LOW, true);
+    handle = cache->Lookup("k2", nullptr, this, Cache::Priority::LOW, true);
     ASSERT_EQ(handle, nullptr);
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, false);
+    handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, false);
     ASSERT_EQ(handle, nullptr);
 
     cache.reset();
@@ -543,29 +517,25 @@
     Random rnd(301);
     std::string str1 = rnd.RandomString(1001);
     auto item1 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert("k1", item1,
-                            &CompressedSecondaryCacheTest::helper_fail_,
-                            str1.length()));
+    ASSERT_OK(cache->Insert("k1", item1, &kHelperFail, str1.length()));
 
     std::string str2 = rnd.RandomString(1002);
     auto item2 = new TestItem(str2.data(), str2.length());
     // k1 should be demoted to the secondary cache.
- ASSERT_OK(cache->Insert("k2", item2, - &CompressedSecondaryCacheTest::helper_fail_, - str2.length())); + ASSERT_OK(cache->Insert("k2", item2, &kHelperFail, str2.length())); Cache::Handle* handle; - handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + handle = + cache->Lookup("k2", &kHelperFail, this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 demotion would have failed. - handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + handle = + cache->Lookup("k1", &kHelperFail, this, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); // Since k1 was not promoted, k2 should still be in cache. - handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + handle = + cache->Lookup("k2", &kHelperFail, this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -602,28 +572,23 @@ class CompressedSecondaryCacheTest : public testing::Test { Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1, &CompressedSecondaryCacheTest::helper_, - str1.length())); + ASSERT_OK(cache->Insert("k1", item1, &kHelper, str1.length())); std::string str2 = rnd.RandomString(1002); auto item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to the secondary cache. - ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_, - str2.length())); + ASSERT_OK(cache->Insert("k2", item2, &kHelper, str2.length())); Cache::Handle* handle; SetFailCreate(true); - handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 creation would have failed - handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache - handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -660,32 +625,27 @@ class CompressedSecondaryCacheTest : public testing::Test { Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1_1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert( - "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length())); + ASSERT_OK(cache->Insert("k1", item1_1, &kHelper, str1.length())); std::string str2 = rnd.RandomString(1002); std::string str2_clone{str2}; auto item2 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's dummy item. - ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_, - str2.length())); + ASSERT_OK(cache->Insert("k2", item2, &kHelper, str2.length())); // After this Insert, primary cache contains k1 and secondary cache contains // k1's dummy item and k2's dummy item. 
     auto item1_2 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert("k1", item1_2, &kHelper, str1.length()));
 
     auto item2_2 = new TestItem(str2.data(), str2.length());
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert("k2", item2_2, &kHelper, str2.length()));
 
     Cache::Handle* handle2;
-    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                            test_item_creator, Cache::Priority::LOW, true);
+    handle2 = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true);
     ASSERT_NE(handle2, nullptr);
     cache->Release(handle2);
 
@@ -693,14 +653,12 @@
     // strict_capacity_limit is true, but the lookup should still succeed.
     // A k1's dummy item is inserted into primary cache.
     Cache::Handle* handle1;
-    handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                            test_item_creator, Cache::Priority::LOW, true);
+    handle1 = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true);
     ASSERT_NE(handle1, nullptr);
     cache->Release(handle1);
 
     // Since k1 didn't get inserted, k2 should still be in cache
-    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                            test_item_creator, Cache::Priority::LOW, true);
+    handle2 = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true);
     ASSERT_NE(handle2, nullptr);
     cache->Release(handle2);
 
@@ -741,7 +699,7 @@
     current_chunk = current_chunk->next;
     ASSERT_EQ(current_chunk->size, 98);
 
-    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+    sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
   }
 
   void MergeChunksIntoValueTest() {
@@ -822,23 +780,13 @@
     std::string value_str{value.get(), charge};
     ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
 
-    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+    sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
   }
 
  private:
  bool fail_create_;
};
 
-Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_(
-    CompressedSecondaryCacheTest::SizeCallback,
-    CompressedSecondaryCacheTest::SaveToCallback,
-    CompressedSecondaryCacheTest::DeletionCallback);
-
-Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_(
-    CompressedSecondaryCacheTest::SizeCallback,
-    CompressedSecondaryCacheTest::SaveToCallbackFail,
-    CompressedSecondaryCacheTest::DeletionCallback);
-
 class CompressedSecCacheTestWithCompressAndAllocatorParam
     : public CompressedSecondaryCacheTest,
       public ::testing::WithParamInterface<std::tuple<bool, bool>> {
 
diff --git a/cache/fast_lru_cache.cc b/cache/fast_lru_cache.cc
deleted file mode 100644
index 3a540f139ba..00000000000
--- a/cache/fast_lru_cache.cc
+++ /dev/null
@@ -1,580 +0,0 @@
-// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#include "cache/fast_lru_cache.h"
-
-#include <math.h>
-#include <cassert>
-#include <cstdint>
-#include <cstdio>
-
-#include "monitoring/perf_context_imp.h"
-#include "monitoring/statistics.h"
-#include "port/lang.h"
-#include "util/distributed_mutex.h"
-#include "util/hash.h"
-#include "util/math.h"
-#include "util/random.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-namespace fast_lru_cache {
-
-LRUHandleTable::LRUHandleTable(int hash_bits)
-    : length_bits_(hash_bits),
-      length_bits_mask_((uint32_t{1} << length_bits_) - 1),
-      occupancy_(0),
-      occupancy_limit_(static_cast<uint32_t>((uint32_t{1} << length_bits_) *
-                                             kStrictLoadFactor)),
-      array_(new LRUHandle[size_t{1} << length_bits_]) {
-  assert(hash_bits <= 32);
-}
-
-LRUHandleTable::~LRUHandleTable() {
-  ApplyToEntriesRange([](LRUHandle* h) { h->FreeData(); }, 0, GetTableSize());
-}
-
-LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
-  int probe = 0;
-  int slot = FindVisibleElement(key, hash, probe, 0);
-  return (slot == -1) ? nullptr : &array_[slot];
-}
-
-LRUHandle* LRUHandleTable::Insert(LRUHandle* h, LRUHandle** old) {
-  int probe = 0;
-  int slot = FindVisibleElementOrAvailableSlot(h->key(), h->hash, probe,
-                                               1 /*displacement*/);
-  *old = nullptr;
-  if (slot == -1) {
-    // TODO(Guido) Don't we need to roll back displacements here?
-    return nullptr;
-  }
-
-  if (array_[slot].IsEmpty() || array_[slot].IsTombstone()) {
-    bool empty = array_[slot].IsEmpty();
-    Assign(slot, h);
-    LRUHandle* new_entry = &array_[slot];
-    if (empty) {
-      // This used to be an empty slot.
-      return new_entry;
-    }
-    // It used to be a tombstone, so there may already be a copy of the
-    // key in the table.
-    slot = FindVisibleElement(h->key(), h->hash, probe, 0 /*displacement*/);
-    if (slot == -1) {
-      // No existing copy of the key.
-      return new_entry;
-    }
-    *old = &array_[slot];
-    return new_entry;
-  } else {
-    // There is an existing copy of the key.
-    *old = &array_[slot];
-    // Find an available slot for the new element.
-    array_[slot].displacements++;
-    slot = FindAvailableSlot(h->key(), probe, 1 /*displacement*/);
-    if (slot == -1) {
-      // No available slots. Roll back displacements.
-      probe = 0;
-      slot = FindVisibleElement(h->key(), h->hash, probe, -1);
-      array_[slot].displacements--;
-      FindAvailableSlot(h->key(), probe, -1);
-      return nullptr;
-    }
-    Assign(slot, h);
-    return &array_[slot];
-  }
-}
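Aside (not part of the diff): a toy illustration of the displacement counters that `Insert` maintains above. Every slot a key probes past on the way to its final slot gets its count incremented, so a slot with no element and zero displacements can terminate later probe sequences early. Simplified to linear probing over ints; not the real `LRUHandleTable`:

```cpp
#include <array>
#include <cstdint>
#include <cstddef>

struct ToySlot {
  uint32_t displacements = 0;
  bool occupied = false;
  int value = 0;
};

// Assumes the table is not full. Returns the slot index used.
template <size_t N>
int ToyInsert(std::array<ToySlot, N>& table, size_t start, int value) {
  size_t i = start % N;
  while (table[i].occupied) {
    table[i].displacements++;  // some key probed past this slot
    i = (i + 1) % N;           // toy linear probing (real code double-hashes)
  }
  table[i].occupied = true;
  table[i].value = value;
  return static_cast<int>(i);
}
```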
-
-void LRUHandleTable::Remove(LRUHandle* h) {
-  assert(h->next == nullptr &&
-         h->prev == nullptr);  // Already off the LRU list.
-  int probe = 0;
-  FindSlot(
-      h->key(), [&h](LRUHandle* e) { return e == h; }, probe,
-      -1 /*displacement*/);
-  h->SetIsVisible(false);
-  h->SetIsElement(false);
-  occupancy_--;
-}
-
-void LRUHandleTable::Assign(int slot, LRUHandle* h) {
-  LRUHandle* dst = &array_[slot];
-  uint32_t disp = dst->displacements;
-  *dst = *h;
-  dst->displacements = disp;
-  dst->SetIsVisible(true);
-  dst->SetIsElement(true);
-  occupancy_++;
-}
-
-void LRUHandleTable::Exclude(LRUHandle* h) { h->SetIsVisible(false); }
-
-int LRUHandleTable::FindVisibleElement(const Slice& key, uint32_t hash,
-                                       int& probe, int displacement) {
-  return FindSlot(
-      key,
-      [&](LRUHandle* h) { return h->Matches(key, hash) && h->IsVisible(); },
-      probe, displacement);
-}
-
-int LRUHandleTable::FindAvailableSlot(const Slice& key, int& probe,
-                                      int displacement) {
-  return FindSlot(
-      key, [](LRUHandle* h) { return h->IsEmpty() || h->IsTombstone(); }, probe,
-      displacement);
-}
-
-int LRUHandleTable::FindVisibleElementOrAvailableSlot(const Slice& key,
-                                                      uint32_t hash, int& probe,
-                                                      int displacement) {
-  return FindSlot(
-      key,
-      [&](LRUHandle* h) {
-        return h->IsEmpty() || h->IsTombstone() ||
-               (h->Matches(key, hash) && h->IsVisible());
-      },
-      probe, displacement);
-}
-
-inline int LRUHandleTable::FindSlot(const Slice& key,
-                                    std::function<bool(LRUHandle*)> cond,
-                                    int& probe, int displacement) {
-  uint32_t base = ModTableSize(Hash(key.data(), key.size(), kProbingSeed1));
-  uint32_t increment =
-      ModTableSize((Hash(key.data(), key.size(), kProbingSeed2) << 1) | 1);
-  uint32_t current = ModTableSize(base + probe * increment);
-  while (true) {
-    LRUHandle* h = &array_[current];
-    probe++;
-    if (current == base && probe > 1) {
-      // We looped back.
-      return -1;
-    }
-    if (cond(h)) {
-      return current;
-    }
-    if (h->IsEmpty()) {
-      // We check emptiness after the condition, because
-      // the condition may be emptiness.
-      return -1;
-    }
-    h->displacements += displacement;
-    current = ModTableSize(current + increment);
-  }
-}
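Aside (not part of the diff): `FindSlot` above uses double hashing, a base position plus an odd increment, so on a power-of-two table the probe sequence visits every slot exactly once before looping back to `base`. A self-contained demonstration with made-up hash values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  constexpr uint32_t kTableSize = 8;  // must be a power of two
  uint32_t base = 3;       // stands in for Hash(key, seed1) mod table size
  uint32_t increment = 5;  // stands in for (Hash(key, seed2) << 1) | 1
  uint32_t current = base;
  for (int probe = 0; probe < 8; ++probe) {
    std::printf("probe %d -> slot %u\n", probe, current % kTableSize);
    current = (current + increment) % kTableSize;
  }
  // Because the increment is odd and the size a power of two, the slots
  // printed are all distinct: 3, 0, 5, 2, 7, 4, 1, 6.
}
```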
-  DMutexLock l(mutex_);
-  size_t length_bits = table_.GetLengthBits();
-  size_t length = table_.GetTableSize();
-
-  assert(average_entries_per_lock > 0);
-  // Assuming we are called with same average_entries_per_lock repeatedly,
-  // this simplifies some logic (index_end will not overflow).
-  assert(average_entries_per_lock < length || *state == 0);
-
-  size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
-  size_t index_end = index_begin + average_entries_per_lock;
-  if (index_end >= length) {
-    // Going to end
-    index_end = length;
-    *state = SIZE_MAX;
-  } else {
-    *state = index_end << (sizeof(size_t) * 8u - length_bits);
-  }
-
-  table_.ApplyToEntriesRange(
-      [callback,
-       metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
-        callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
-                 h->deleter);
-      },
-      index_begin, index_end);
-}
-
-void LRUCacheShard::LRU_Remove(LRUHandle* h) {
-  assert(h->next != nullptr);
-  assert(h->prev != nullptr);
-  h->next->prev = h->prev;
-  h->prev->next = h->next;
-  h->prev = h->next = nullptr;
-  assert(lru_usage_ >= h->total_charge);
-  lru_usage_ -= h->total_charge;
-}
-
-void LRUCacheShard::LRU_Insert(LRUHandle* h) {
-  assert(h->next == nullptr);
-  assert(h->prev == nullptr);
-  // Insert h to head of LRU list.
-  h->next = &lru_;
-  h->prev = lru_.prev;
-  h->prev->next = h;
-  h->next->prev = h;
-  lru_usage_ += h->total_charge;
-}
-
-void LRUCacheShard::EvictFromLRU(size_t charge,
-                                 autovector<LRUHandle>* deleted) {
-  while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
-    LRUHandle* old = lru_.next;
-    // LRU list contains only elements which can be evicted.
-    assert(old->IsVisible() && !old->HasRefs());
-    LRU_Remove(old);
-    table_.Remove(old);
-    assert(usage_ >= old->total_charge);
-    usage_ -= old->total_charge;
-    deleted->push_back(*old);
-  }
-}
-
-size_t LRUCacheShard::CalcEstimatedHandleCharge(
-    size_t estimated_value_size,
-    CacheMetadataChargePolicy metadata_charge_policy) {
-  LRUHandle h;
-  h.CalcTotalCharge(estimated_value_size, metadata_charge_policy);
-  return h.total_charge;
-}
-
-int LRUCacheShard::CalcHashBits(
-    size_t capacity, size_t estimated_value_size,
-    CacheMetadataChargePolicy metadata_charge_policy) {
-  size_t handle_charge =
-      CalcEstimatedHandleCharge(estimated_value_size, metadata_charge_policy);
-  assert(handle_charge > 0);
-  uint32_t num_entries =
-      static_cast<uint32_t>(capacity / (kLoadFactor * handle_charge)) + 1;
-  assert(num_entries <= uint32_t{1} << 31);
-  return FloorLog2((num_entries << 1) - 1);
-}
-
-void LRUCacheShard::SetCapacity(size_t capacity) {
-  autovector<LRUHandle> last_reference_list;
-  {
-    DMutexLock l(mutex_);
-    if (capacity > capacity_) {
-      assert(false);  // Not supported.
-    }
-    capacity_ = capacity;
-    EvictFromLRU(0, &last_reference_list);
-  }
-
-  // Free the entries here outside of mutex for performance reasons.
-  for (auto& h : last_reference_list) {
-    h.FreeData();
-  }
-}
-
-void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
-  DMutexLock l(mutex_);
-  strict_capacity_limit_ = strict_capacity_limit;
-}
-
-Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
-                             size_t charge, Cache::DeleterFn deleter,
-                             LRUHandle** handle, Cache::Priority /*priority*/) {
-  if (key.size() != kCacheKeySize) {
-    return Status::NotSupported("FastLRUCache only supports key size " +
-                                std::to_string(kCacheKeySize) + "B");
-  }
-
-  LRUHandle tmp;
-  tmp.value = value;
-  tmp.deleter = deleter;
-  tmp.hash = hash;
-  tmp.CalcTotalCharge(charge, metadata_charge_policy_);
-  for (int i = 0; i < kCacheKeySize; i++) {
-    tmp.key_data[i] = key.data()[i];
-  }
-
-  Status s = Status::OK();
-  autovector<LRUHandle> last_reference_list;
-  {
-    DMutexLock l(mutex_);
-    assert(table_.GetOccupancy() <= table_.GetOccupancyLimit());
-
-    // Free the space following strict LRU policy until enough space
-    // is freed or the lru list is empty.
-    EvictFromLRU(tmp.total_charge, &last_reference_list);
-    if ((usage_ + tmp.total_charge > capacity_ &&
-         (strict_capacity_limit_ || handle == nullptr)) ||
-        table_.GetOccupancy() == table_.GetOccupancyLimit()) {
-      // There are two measures of capacity:
-      // - Space (or charge) capacity: The maximum possible sum of the charges
-      //   of the elements.
-      // - Table capacity: The number of slots in the hash table.
-      // These are incomparable, in the sense that one doesn't imply the other.
-      // Typically we will reach space capacity before table capacity---
-      // if the user always inserts values with size equal to
-      // estimated_value_size, then at most a kLoadFactor fraction of slots
-      // will ever be occupied. But in some cases we may reach table capacity
-      // before space capacity---if the user initially claims a very large
-      // estimated_value_size but then inserts tiny values, more elements than
-      // initially estimated will be inserted.
-
-      // TODO(Guido) Some tests (at least two from cache_test, as well as the
-      // stress tests) currently assume the table capacity is unbounded.
-      if (handle == nullptr) {
-        // Don't insert the entry but still return ok, as if the entry were
-        // inserted into the cache and evicted immediately.
-        last_reference_list.push_back(tmp);
-      } else {
-        if (table_.GetOccupancy() == table_.GetOccupancyLimit()) {
-          // TODO: Consider using a distinct status for this case, but usually
-          // it will be handled the same way as reaching charge capacity limit
-          s = Status::MemoryLimit(
-              "Insert failed because all slots in the hash table are full.");
-        } else {
-          s = Status::MemoryLimit(
-              "Insert failed because the total charge has exceeded the "
-              "capacity.");
-        }
-      }
-    } else {
-      // Insert into the cache. Note that the cache might get larger than its
-      // capacity if not enough space was freed up.
-      LRUHandle* old;
-      LRUHandle* h = table_.Insert(&tmp, &old);
-      assert(h != nullptr);  // We're below occupancy, so this insertion should
-                             // never fail.
-      usage_ += h->total_charge;
-      if (old != nullptr) {
-        s = Status::OkOverwritten();
-        assert(old->IsVisible());
-        table_.Exclude(old);
-        if (!old->HasRefs()) {
-          // old is on LRU because it's in cache and its reference count is 0.
-          LRU_Remove(old);
-          table_.Remove(old);
-          assert(usage_ >= old->total_charge);
-          usage_ -= old->total_charge;
-          last_reference_list.push_back(*old);
-        }
-      }
-      if (handle == nullptr) {
-        LRU_Insert(h);
-      } else {
-        // If caller already holds a ref, no need to take one here.
- if (!h->HasRefs()) { - h->Ref(); - } - *handle = h; - } - } - } - - // Free the entries here outside of mutex for performance reasons. - for (auto& h : last_reference_list) { - h.FreeData(); - } - - return s; -} - -LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { - LRUHandle* h = nullptr; - { - DMutexLock l(mutex_); - h = table_.Lookup(key, hash); - if (h != nullptr) { - assert(h->IsVisible()); - if (!h->HasRefs()) { - // The entry is in LRU since it's in hash and has no external - // references. - LRU_Remove(h); - } - h->Ref(); - } - } - return h; -} - -bool LRUCacheShard::Ref(LRUHandle* h) { - DMutexLock l(mutex_); - // To create another reference - entry must be already externally referenced. - assert(h->HasRefs()); - h->Ref(); - return true; -} - -bool LRUCacheShard::Release(LRUHandle* h, bool erase_if_last_ref) { - if (h == nullptr) { - return false; - } - LRUHandle copy; - bool last_reference = false; - { - DMutexLock l(mutex_); - last_reference = h->Unref(); - if (last_reference && h->IsVisible()) { - // The item is still in cache, and nobody else holds a reference to it. - if (usage_ > capacity_ || erase_if_last_ref) { - // The LRU list must be empty since the cache is full. - assert(lru_.next == &lru_ || erase_if_last_ref); - // Take this opportunity and remove the item. - table_.Remove(h); - } else { - // Put the item back on the LRU list, and don't free it. - LRU_Insert(h); - last_reference = false; - } - } - // If it was the last reference, then decrement the cache usage. - if (last_reference) { - assert(usage_ >= h->total_charge); - usage_ -= h->total_charge; - copy = *h; - } - } - - // Free the entry here outside of mutex for performance reasons. - if (last_reference) { - copy.FreeData(); - } - return last_reference; -} - -void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { - LRUHandle copy; - bool last_reference = false; - { - DMutexLock l(mutex_); - LRUHandle* h = table_.Lookup(key, hash); - if (h != nullptr) { - table_.Exclude(h); - if (!h->HasRefs()) { - // The entry is in LRU since it's in cache and has no external - // references. - LRU_Remove(h); - table_.Remove(h); - assert(usage_ >= h->total_charge); - usage_ -= h->total_charge; - last_reference = true; - copy = *h; - } - } - } - // Free the entry here outside of mutex for performance reasons. - // last_reference will only be true if e != nullptr. 
-  if (last_reference) {
-    copy.FreeData();
-  }
-}
-
-size_t LRUCacheShard::GetUsage() const {
-  DMutexLock l(mutex_);
-  return usage_;
-}
-
-size_t LRUCacheShard::GetPinnedUsage() const {
-  DMutexLock l(mutex_);
-  assert(usage_ >= lru_usage_);
-  return usage_ - lru_usage_;
-}
-
-size_t LRUCacheShard::GetOccupancyCount() const {
-  DMutexLock l(mutex_);
-  return table_.GetOccupancy();
-}
-
-size_t LRUCacheShard::GetTableAddressCount() const {
-  DMutexLock l(mutex_);
-  return table_.GetTableSize();
-}
-
-LRUCache::LRUCache(size_t capacity, size_t estimated_value_size,
-                   int num_shard_bits, bool strict_capacity_limit,
-                   CacheMetadataChargePolicy metadata_charge_policy)
-    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
-                   nullptr /*allocator*/) {
-  assert(estimated_value_size > 0 ||
-         metadata_charge_policy != kDontChargeCacheMetadata);
-  size_t per_shard = GetPerShardCapacity();
-  InitShards([=](LRUCacheShard* cs) {
-    new (cs) LRUCacheShard(per_shard, estimated_value_size,
-                           strict_capacity_limit, metadata_charge_policy);
-  });
-}
-
-void* LRUCache::Value(Handle* handle) {
-  return reinterpret_cast<LRUHandle*>(handle)->value;
-}
-
-size_t LRUCache::GetCharge(Handle* handle) const {
-  return reinterpret_cast<const LRUHandle*>(handle)->GetCharge(
-      GetShard(0).metadata_charge_policy_);
-}
-
-Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
-  auto h = reinterpret_cast<const LRUHandle*>(handle);
-  return h->deleter;
-}
-
-}  // namespace fast_lru_cache
-
-std::shared_ptr<Cache> NewFastLRUCache(
-    size_t capacity, size_t estimated_value_size, int num_shard_bits,
-    bool strict_capacity_limit,
-    CacheMetadataChargePolicy metadata_charge_policy) {
-  if (num_shard_bits >= 20) {
-    return nullptr;  // The cache cannot be sharded into too many fine pieces.
-  }
-  if (num_shard_bits < 0) {
-    num_shard_bits = GetDefaultCacheShardBits(capacity);
-  }
-  return std::make_shared<fast_lru_cache::LRUCache>(
-      capacity, estimated_value_size, num_shard_bits, strict_capacity_limit,
-      metadata_charge_policy);
-}
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/fast_lru_cache.h b/cache/fast_lru_cache.h
deleted file mode 100644
index 3cd55ca8694..00000000000
--- a/cache/fast_lru_cache.h
+++ /dev/null
@@ -1,477 +0,0 @@
-// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved
-// This source code is licensed under both the GPLv2 (found in the
-// COPYING file in the root directory) and Apache 2.0 License
-// (found in the LICENSE.Apache file in the root directory).
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-#pragma once
-
-#include <array>
-#include <functional>
-#include <memory>
-
-#include "cache/cache_key.h"
-#include "cache/sharded_cache.h"
-#include "port/lang.h"
-#include "port/malloc.h"
-#include "port/port.h"
-#include "rocksdb/secondary_cache.h"
-#include "util/autovector.h"
-#include "util/distributed_mutex.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-namespace fast_lru_cache {
-
-// Forward declaration of friend class.
-class FastLRUCacheTest;
-
-// LRU cache implementation using an open-address hash table.
-//
-// Every slot in the hash table is an LRUHandle. Because handles can be
-// referenced externally, we can't discard them immediately once they are
-// deleted (via a delete or an LRU eviction) or replaced by a new version
-// (via an insert of the same key).
-// The state of an element is defined by the following two properties:
-// (R) Referenced: An element can be referenced externally (refs > 0), or not.
-//     Importantly, an element can be evicted if and only if it's not
-//     referenced. In particular, when an element becomes referenced, it's
-//     temporarily taken out of the LRU list until all references to it
-//     are dropped.
-// (V) Visible: An element can be visible for lookups (IS_VISIBLE set), or
-//     not. Initially, every element is visible. An element that is not
-//     visible is called a ghost.
-// These properties induce 4 different states, with transitions defined as
-// follows:
-// - V --> not V: When a visible element is deleted or replaced by a new
-//     version.
-// - Not V --> V: This cannot happen. A ghost remains in that state until it's
-//     not referenced any more, at which point it's ready to be removed from
-//     the hash table. (A ghost simply waits to transition to the
-//     afterlife---it will never be visible again.)
-// - R --> not R: When all references to an element are dropped.
-// - Not R --> R: When an unreferenced element becomes referenced. This can
-//     only happen if the element is V, since references to an element can
-//     only be created when it's visible.
-//
-// Internally, the cache uses an open-addressed hash table to index the
-// handles. We use tombstone counters to keep track of displacements.
-// Because of the tombstones and the two possible visibility states of an
-// element, the table slots can be in 4 different states:
-// 1. Visible element (IS_ELEMENT set and IS_VISIBLE set): The slot contains a
-//    key-value element.
-// 2. Ghost element (IS_ELEMENT set and IS_VISIBLE unset): The slot contains an
-//    element that has been removed, but it's still referenced. It's invisible
-//    to lookups.
-// 3. Tombstone (IS_ELEMENT unset and displacements > 0): The slot contains a
-//    tombstone.
-// 4. Empty (IS_ELEMENT unset and displacements == 0): The slot is unused.
-// A slot that is an element can further have IS_VISIBLE set or not.
-// When a ghost is removed from the table, it can either transition to being a
-// tombstone or an empty slot, depending on the number of displacements of the
-// slot. In any case, the slot becomes available. When a handle is inserted
-// into that slot, it becomes a visible element again.

-// The load factor p is a real number in (0, 1) such that at all
-// times at most a fraction p of all slots, without counting tombstones,
-// are occupied by elements. This means that the probability that a
-// random probe hits an empty slot is at least 1 - p, and thus at most
-// 1 / (1 - p) probes are required on average. For example, p = 70% implies
-// that at most about 3.3 probes are needed on average (bear in mind that this
-// reasoning doesn't consider the effects of clustering over time).
-// Because the size of the hash table is always rounded up to the next
-// power of 2, p is really an upper bound on the actual load factor---the
-// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
-// but bear in mind that slots only hold metadata, not actual values.
-// Since space cost is dominated by the values (the LSM blocks),
-// overprovisioning the table with metadata only increases the total cache
-// space usage by a tiny fraction.
-constexpr double kLoadFactor = 0.35;
-
-// The user can exceed kLoadFactor if the sizes of the inserted values don't
-// match estimated_value_size, or if strict_capacity_limit == false.
-// To keep performance from plunging, we set a strict upper bound on the
-// load factor.
-constexpr double kStrictLoadFactor = 0.7;
-
-// Arbitrary seeds.
-constexpr uint32_t kProbingSeed1 = 0xbc9f1d34;
-constexpr uint32_t kProbingSeed2 = 0x7a2bb9d5;
-
-// An experimental (under development!) alternative to LRUCache
-
-struct LRUHandle {
-  void* value;
-  Cache::DeleterFn deleter;
-  LRUHandle* next;
-  LRUHandle* prev;
-  size_t total_charge;  // TODO(opt): Only allow uint32_t?
-  // The hash of key(). Used for fast sharding and comparisons.
-  uint32_t hash;
-  // The number of external refs to this entry.
-  uint32_t refs;
-
-  enum Flags : uint8_t {
-    // Whether the handle is visible to Lookups.
-    IS_VISIBLE = (1 << 0),
-    // Whether the slot is in use by an element.
-    IS_ELEMENT = (1 << 1),
-  };
-  uint8_t flags;
-
-  // The number of elements that hash to this slot or a lower one,
-  // but wind up in a higher slot.
-  uint32_t displacements;
-
-  std::array<char, kCacheKeySize> key_data;
-
-  LRUHandle() {
-    value = nullptr;
-    deleter = nullptr;
-    next = nullptr;
-    prev = nullptr;
-    total_charge = 0;
-    hash = 0;
-    refs = 0;
-    flags = 0;
-    displacements = 0;
-    key_data.fill(0);
-  }
-
-  Slice key() const { return Slice(key_data.data(), kCacheKeySize); }
-
-  // For HandleImpl concept
-  uint32_t GetHash() const { return hash; }
-
-  // Increase the reference count by 1.
-  void Ref() { refs++; }
-
-  // Just reduce the reference count by 1. Return true if it was the last
-  // reference.
-  bool Unref() {
-    assert(refs > 0);
-    refs--;
-    return refs == 0;
-  }
-
-  // Return true if there are external refs, false otherwise.
-  bool HasRefs() const { return refs > 0; }
-
-  bool IsVisible() const { return flags & IS_VISIBLE; }
-
-  void SetIsVisible(bool is_visible) {
-    if (is_visible) {
-      flags |= IS_VISIBLE;
-    } else {
-      flags &= ~IS_VISIBLE;
-    }
-  }
-
-  bool IsElement() const { return flags & IS_ELEMENT; }
-
-  void SetIsElement(bool is_element) {
-    if (is_element) {
-      flags |= IS_ELEMENT;
-    } else {
-      flags &= ~IS_ELEMENT;
-    }
-  }
-
-  void FreeData() {
-    assert(refs == 0);
-    if (deleter) {
-      (*deleter)(key(), value);
-    }
-  }
-
-  // Calculate the memory usage by metadata.
-  inline size_t CalcMetaCharge(
-      CacheMetadataChargePolicy metadata_charge_policy) const {
-    if (metadata_charge_policy != kFullChargeCacheMetadata) {
-      return 0;
-    } else {
-      // #ifdef ROCKSDB_MALLOC_USABLE_SIZE
-      // return malloc_usable_size(
-      //     const_cast<void*>(static_cast<const void*>(this)));
-      // #else
-      // TODO(Guido) malloc_usable_size only works when we call it on
-      // a pointer allocated with malloc. Because our handles are all
-      // allocated in a single shot as an array, the user can't call
-      // CalcMetaCharge (or CalcTotalCharge or GetCharge) on a handle
-      // pointer returned by the cache. Moreover, malloc_usable_size
-      // expects a heap-allocated handle, but sometimes in our code we
-      // wish to pass a stack-allocated handle (this is only a performance
-      // concern).
-      // What is the right way to compute metadata charges with pre-allocated
-      // handles?
-      return sizeof(LRUHandle);
-      // #endif
-    }
-  }
-
-  inline void CalcTotalCharge(
-      size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
-    total_charge = charge + CalcMetaCharge(metadata_charge_policy);
-  }
-
-  inline size_t GetCharge(
-      CacheMetadataChargePolicy metadata_charge_policy) const {
-    size_t meta_charge = CalcMetaCharge(metadata_charge_policy);
-    assert(total_charge >= meta_charge);
-    return total_charge - meta_charge;
-  }
-
-  inline bool IsEmpty() {
-    return !this->IsElement() && this->displacements == 0;
-  }
-
-  inline bool IsTombstone() {
-    return !this->IsElement() && this->displacements > 0;
-  }
-
-  inline bool Matches(const Slice& some_key, uint32_t some_hash) {
-    return this->IsElement() && this->hash == some_hash &&
-           this->key() == some_key;
-  }
-};
-
-class LRUHandleTable {
- public:
-  explicit LRUHandleTable(int hash_bits);
-  ~LRUHandleTable();
-
-  // Returns a pointer to a visible element matching the key/hash, or
-  // nullptr if not present.
-  LRUHandle* Lookup(const Slice& key, uint32_t hash);
-
-  // Inserts a copy of h into the hash table.
-  // Returns a pointer to the inserted handle, or nullptr if no available
-  // slot was found. If an existing visible element matching the
-  // key/hash is already present in the hash table, the argument old
-  // is set to point to it; otherwise, it's set to nullptr.
-  LRUHandle* Insert(LRUHandle* h, LRUHandle** old);
-
-  // Removes h from the hash table. The handle must already be off
-  // the LRU list.
-  void Remove(LRUHandle* h);
-
-  // Turns a visible element h into a ghost (i.e., not visible).
-  void Exclude(LRUHandle* h);
-
-  // Assigns a copy of h to the given slot.
-  void Assign(int slot, LRUHandle* h);
-
-  template <typename T>
-  void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
-    for (size_t i = index_begin; i < index_end; i++) {
-      LRUHandle* h = &array_[i];
-      if (h->IsVisible()) {
-        func(h);
-      }
-    }
-  }
-
-  uint32_t GetTableSize() const { return uint32_t{1} << length_bits_; }
-
-  int GetLengthBits() const { return length_bits_; }
-
-  uint32_t GetOccupancyLimit() const { return occupancy_limit_; }
-
-  uint32_t GetOccupancy() const { return occupancy_; }
-
-  // Returns x mod 2^{length_bits_}.
-  uint32_t ModTableSize(uint32_t x) { return x & length_bits_mask_; }
-
- private:
-  int FindVisibleElement(const Slice& key, uint32_t hash, int& probe,
-                         int displacement);
-
-  int FindAvailableSlot(const Slice& key, int& probe, int displacement);
-
-  int FindVisibleElementOrAvailableSlot(const Slice& key, uint32_t hash,
-                                        int& probe, int displacement);
-
-  // Returns the index of the first slot probed (hashing with
-  // the given key) with a handle e such that cond(e) is true.
-  // Otherwise, if no match is found, returns -1.
-  // For every handle e probed except the final slot, updates
-  // e->displacements += displacement.
-  // The argument probe is modified such that consecutive calls
-  // to FindSlot continue probing right after where the previous
-  // call left off.
-  int FindSlot(const Slice& key, std::function<bool(LRUHandle*)> cond,
-               int& probe, int displacement);
-
-  // Number of hash bits used for table index.
-  // The size of the table is 1 << length_bits_.
-  int length_bits_;
-
-  const uint32_t length_bits_mask_;
-
-  // Number of elements in the table.
-  uint32_t occupancy_;
-
-  // Maximum number of elements the user can store in the table.
-  uint32_t occupancy_limit_;
-
-  std::unique_ptr<LRUHandle[]> array_;
-};
-
-// A single shard of sharded cache.
-class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
- public:
-  LRUCacheShard(size_t capacity, size_t estimated_value_size,
-                bool strict_capacity_limit,
-                CacheMetadataChargePolicy metadata_charge_policy);
-
-  // For CacheShard concept
-  using HandleImpl = LRUHandle;
-
-  // Keep 32-bit hashing for now (FIXME: upgrade to 64-bit)
-  using HashVal = uint32_t;
-  using HashCref = uint32_t;
-  static inline HashVal ComputeHash(const Slice& key) {
-    return Lower32of64(GetSliceNPHash64(key));
-  }
-  static inline uint32_t HashPieceForSharding(HashCref hash) { return hash; }
-
-  // Separate from constructor so caller can easily make an array of LRUCache.
-  // If current usage is more than new capacity, the function will attempt to
-  // free the needed space.
-  void SetCapacity(size_t capacity);
-
-  // Set the flag to reject insertion if cache is full.
-  void SetStrictCapacityLimit(bool strict_capacity_limit);
-
-  // Like Cache methods, but with an extra "hash" parameter.
-  // Insert an item into the hash table and, if handle is null, insert into
-  // the LRU list. Older items are evicted as necessary. If the cache is full
-  // and free_handle_on_fail is true, the item is deleted and handle is set to
-  // nullptr.
-  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
-                Cache::DeleterFn deleter, LRUHandle** handle,
-                Cache::Priority priority);
-
-  Status Insert(const Slice& key, uint32_t hash, void* value,
-                const Cache::CacheItemHelper* helper, size_t charge,
-                LRUHandle** handle, Cache::Priority priority) {
-    return Insert(key, hash, value, charge, helper->del_cb, handle, priority);
-  }
-
-  LRUHandle* Lookup(const Slice& key, uint32_t hash,
-                    const Cache::CacheItemHelper* /*helper*/,
-                    const Cache::CreateCallback& /*create_cb*/,
-                    Cache::Priority /*priority*/, bool /*wait*/,
-                    Statistics* /*stats*/) {
-    return Lookup(key, hash);
-  }
-  LRUHandle* Lookup(const Slice& key, uint32_t hash);
-
-  bool Release(LRUHandle* handle, bool /*useful*/, bool erase_if_last_ref) {
-    return Release(handle, erase_if_last_ref);
-  }
-  bool IsReady(LRUHandle* /*handle*/) { return true; }
-  void Wait(LRUHandle* /*handle*/) {}
-
-  bool Ref(LRUHandle* handle);
-  bool Release(LRUHandle* handle, bool erase_if_last_ref = false);
-  void Erase(const Slice& key, uint32_t hash);
-
-  size_t GetUsage() const;
-  size_t GetPinnedUsage() const;
-  size_t GetOccupancyCount() const;
-  size_t GetTableAddressCount() const;
-
-  void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
                               DeleterFn deleter)>& callback,
-      size_t average_entries_per_lock, size_t* state);
-
-  void EraseUnRefEntries();
-
- private:
-  friend class LRUCache;
-  friend class FastLRUCacheTest;
-
-  void LRU_Remove(LRUHandle* e);
-  void LRU_Insert(LRUHandle* e);
-
-  // Free some space following strict LRU policy until enough space
-  // to hold (usage_ + charge) is freed or the LRU list is empty.
-  // This function is not thread safe - it needs to be executed while
-  // holding the mutex_.
-  void EvictFromLRU(size_t charge, autovector<LRUHandle>* deleted);
-
-  // Returns the charge of a single handle.
-  static size_t CalcEstimatedHandleCharge(
-      size_t estimated_value_size,
-      CacheMetadataChargePolicy metadata_charge_policy);
-
-  // Returns the number of bits used to hash an element in the hash
-  // table.
-  static int CalcHashBits(size_t capacity, size_t estimated_value_size,
-                          CacheMetadataChargePolicy metadata_charge_policy);
-
-  // Initialized before use.
-  size_t capacity_;
-
-  // Whether to reject insertion if cache reaches its full capacity.
-  bool strict_capacity_limit_;
-
-  // Dummy head of LRU list.
-  // lru.prev is newest entry, lru.next is oldest entry.
-  // LRU contains items which can be evicted, i.e., referenced only by the
-  // cache.
-  LRUHandle lru_;
-
-  // Pointer to head of low-pri pool in LRU list.
-  LRUHandle* lru_low_pri_;
-
-  // ------------^^^^^^^^^^^^^-----------
-  // Not frequently modified data members
-  // ------------------------------------
-  //
-  // We separate data members that are updated frequently from the ones that
-  // are not frequently updated so that they don't share the same cache line,
-  // which would lead to false sharing.
-  //
-  // ------------------------------------
-  // Frequently modified data members
-  // ------------vvvvvvvvvvvvv-----------
-  LRUHandleTable table_;
-
-  // Memory size for entries residing in the cache.
-  size_t usage_;
-
-  // Memory size for entries residing only in the LRU list.
-  size_t lru_usage_;
-
-  // mutex_ protects the following state.
-  // We don't count mutex_ as the cache's internal state so semantically we
-  // don't mind mutex_ invoking the non-const actions.
-  mutable DMutex mutex_;
-};
-
-class LRUCache
-#ifdef NDEBUG
-    final
-#endif
-    : public ShardedCache<LRUCacheShard> {
- public:
-  LRUCache(size_t capacity, size_t estimated_value_size, int num_shard_bits,
-           bool strict_capacity_limit,
-           CacheMetadataChargePolicy metadata_charge_policy =
-               kDontChargeCacheMetadata);
-  const char* Name() const override { return "LRUCache"; }
-  void* Value(Handle* handle) override;
-  size_t GetCharge(Handle* handle) const override;
-  DeleterFn GetDeleter(Handle* handle) const override;
-};
-}  // namespace fast_lru_cache
-
-std::shared_ptr<Cache> NewFastLRUCache(
-    size_t capacity, size_t estimated_value_size, int num_shard_bits,
-    bool strict_capacity_limit,
-    CacheMetadataChargePolicy metadata_charge_policy);
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc
index c8e4d29bab6..95cd320a7bc 100644
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@@ -22,20 +22,28 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace lru_cache {
 
+namespace {
 // A distinct pointer value for marking "dummy" cache entries
-void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker");
-
-LRUHandleTable::LRUHandleTable(int max_upper_hash_bits)
+struct DummyValue {
+  char val[12] = "kDummyValue";
+};
+DummyValue kDummyValue{};
+}  // namespace
+
+LRUHandleTable::LRUHandleTable(int max_upper_hash_bits,
+                               MemoryAllocator* allocator)
     : length_bits_(/* historical starting size*/ 4),
       list_(new LRUHandle* [size_t{1} << length_bits_] {}),
       elems_(0),
-      max_length_bits_(max_upper_hash_bits) {}
+      max_length_bits_(max_upper_hash_bits),
+      allocator_(allocator) {}
 
 LRUHandleTable::~LRUHandleTable() {
+  auto alloc = allocator_;
   ApplyToEntriesRange(
-      [](LRUHandle* h) {
+      [alloc](LRUHandle* h) {
         if (!h->HasRefs()) {
-          h->Free();
+          h->Free(alloc);
         }
       },
       0, size_t{1} << length_bits_);
@@ -118,6 +126,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                              double low_pri_pool_ratio, bool use_adaptive_mutex,
                              CacheMetadataChargePolicy metadata_charge_policy,
                              int max_upper_hash_bits,
+                             MemoryAllocator* allocator,
                              SecondaryCache* secondary_cache)
     : CacheShardBase(metadata_charge_policy),
       capacity_(0),
@@ -128,7 +137,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
       high_pri_pool_capacity_(0),
       low_pri_pool_ratio_(low_pri_pool_ratio),
      low_pri_pool_capacity_(0),
-      table_(max_upper_hash_bits),
+      table_(max_upper_hash_bits, allocator),
      usage_(0),
      lru_usage_(0),
      mutex_(use_adaptive_mutex),
@@ -159,13 +168,14 @@ void LRUCacheShard::EraseUnRefEntries() {
   }
 
   for (auto entry : last_reference_list) {
-    entry->Free();
+    entry->Free(table_.GetAllocator());
   }
 }
 
 void LRUCacheShard::ApplyToSomeEntries(
-    const std::function<void(const Slice& key, void* value, size_t charge,
-                             DeleterFn deleter)>& callback,
+    const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                             size_t charge,
+                             const Cache::CacheItemHelper* helper)>& callback,
     size_t average_entries_per_lock, size_t* state) {
   // The state is essentially going to be the starting hash, which works
   // nicely even if we resize between calls because we use upper-most
   // hash bits for table indexes.
@@ -192,11 +202,8 @@ void LRUCacheShard::ApplyToSomeEntries(
   table_.ApplyToEntriesRange(
       [callback,
        metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
-        DeleterFn deleter = h->IsSecondaryCacheCompatible()
-                                ? h->info_.helper->del_cb
-                                : h->info_.deleter;
         callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
-                 deleter);
+                 h->helper);
       },
      index_begin, index_end);
 }
@@ -339,11 +346,11 @@ void LRUCacheShard::TryInsertIntoSecondaryCache(
   for (auto entry : evicted_handles) {
     if (secondary_cache_ && entry->IsSecondaryCacheCompatible() &&
         !entry->IsInSecondaryCache()) {
-      secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper)
+      secondary_cache_->Insert(entry->key(), entry->value, entry->helper)
          .PermitUncheckedError();
    }
    // Free the entries here outside of mutex for performance reasons.
-    entry->Free();
+    entry->Free(table_.GetAllocator());
  }
 }
@@ -464,7 +471,7 @@ void LRUCacheShard::Promote(LRUHandle* e) {
     TryInsertIntoSecondaryCache(last_reference_list);
     if (free_standalone_handle) {
       e->Unref();
-      e->Free();
+      e->Free(table_.GetAllocator());
       e = nullptr;
     } else {
       PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
@@ -476,9 +483,9 @@ void LRUCacheShard::Promote(LRUHandle* e) {
     // rare case that one exists
     Cache::Priority priority =
         e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW;
-    s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0,
-               /*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr,
-               priority);
+    s = Insert(e->key(), e->hash, &kDummyValue, &kNoopCacheItemHelper,
+               /*charge=*/0,
+               /*handle=*/nullptr, priority);
   } else {
     e->SetInCache(true);
     LRUHandle* handle = e;
@@ -508,7 +515,7 @@ void LRUCacheShard::Promote(LRUHandle* e) {
 
 LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
                                  const Cache::CacheItemHelper* helper,
-                                 const Cache::CreateCallback& create_cb,
+                                 Cache::CreateContext* create_context,
                                  Cache::Priority priority, bool wait,
                                  Statistics* stats) {
   LRUHandle* e = nullptr;
@@ -518,7 +525,7 @@ LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
     e = table_.Lookup(key, hash);
     if (e != nullptr) {
       assert(e->InCache());
-      if (e->value == kDummyValueMarker) {
+      if (e->value == &kDummyValue) {
         // For a dummy handle, if it was retrieved from secondary cache,
         // it may still exist in secondary cache.
         // If the handle exists in secondary cache, the value should be
@@ -547,24 +554,17 @@ LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
   // standalone handle is returned to the caller. Only if the block is hit
   // again, we erase it from CompressedSecondaryCache and add it into the
   // primary cache.
-  if (!e && secondary_cache_ && helper && helper->saveto_cb) {
-    // For objects from the secondary cache, we expect the caller to provide
-    // a way to create/delete the primary cache object. The only case where
-    // a deleter would not be required is for dummy entries inserted for
-    // accounting purposes, which we won't demote to the secondary cache
-    // anyway.
-    assert(create_cb && helper->del_cb);
+  if (!e && secondary_cache_ && helper && helper->create_cb) {
     bool is_in_sec_cache{false};
     std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
-        secondary_cache_->Lookup(key, create_cb, wait, found_dummy_entry,
-                                 is_in_sec_cache);
+        secondary_cache_->Lookup(key, helper, create_context, wait,
+                                 found_dummy_entry, is_in_sec_cache);
     if (secondary_handle != nullptr) {
       e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
 
       e->m_flags = 0;
       e->im_flags = 0;
-      e->SetSecondaryCacheCompatible(true);
-      e->info_.helper = helper;
+      e->helper = helper;
       e->key_length = key.size();
       e->hash = hash;
       e->refs = 0;
@@ -585,7 +585,7 @@ LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
       if (!e->value) {
         // The secondary cache returned a handle, but the lookup failed.
         e->Unref();
-        e->Free();
+        e->Free(table_.GetAllocator());
         e = nullptr;
       } else {
         PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
@@ -669,16 +669,18 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
 
   // Free the entry here outside of mutex for performance reasons.
   if (last_reference) {
-    e->Free();
+    e->Free(table_.GetAllocator());
   }
   return last_reference;
 }
 
-Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
-                             size_t charge,
-                             void (*deleter)(const Slice& key, void* value),
+Status LRUCacheShard::Insert(const Slice& key, uint32_t hash,
+                             Cache::ObjectPtr value,
                              const Cache::CacheItemHelper* helper,
-                             LRUHandle** handle, Cache::Priority priority) {
+                             size_t charge, LRUHandle** handle,
+                             Cache::Priority priority) {
+  assert(helper);
+
   // Allocate the memory here outside of the mutex.
   // If the cache is full, we'll have to release it.
   // It shouldn't happen very often though.
@@ -688,17 +690,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
   e->value = value;
   e->m_flags = 0;
   e->im_flags = 0;
-  if (helper) {
-    // Use only one of the two parameters
-    assert(deleter == nullptr);
-    // value == nullptr is reserved for indicating failure for when secondary
-    // cache compatible
-    assert(value != nullptr);
-    e->SetSecondaryCacheCompatible(true);
-    e->info_.helper = helper;
-  } else {
-    e->info_.deleter = deleter;
-  }
+  e->helper = helper;
   e->key_length = key.size();
   e->hash = hash;
   e->refs = 0;
@@ -708,6 +700,10 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
   memcpy(e->key_data, key.data(), key.size());
   e->CalcTotalCharge(charge, metadata_charge_policy_);
 
+  // value == nullptr is reserved for indicating failure for when secondary
+  // cache compatible
+  assert(!(e->IsSecondaryCacheCompatible() && value == nullptr));
+
   return InsertItem(e, handle, /* free_handle_on_fail */ true);
 }
 
@@ -733,7 +729,7 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
   // Free the entry here outside of mutex for performance reasons.
   // last_reference will only be true if e != nullptr.
   if (last_reference) {
-    e->Free();
+    e->Free(table_.GetAllocator());
   }
 }
 
@@ -793,18 +789,19 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits,
       secondary_cache_(std::move(_secondary_cache)) {
   size_t per_shard = GetPerShardCapacity();
   SecondaryCache* secondary_cache = secondary_cache_.get();
+  MemoryAllocator* alloc = memory_allocator();
   InitShards([=](LRUCacheShard* cs) {
     new (cs) LRUCacheShard(
         per_shard, strict_capacity_limit, high_pri_pool_ratio,
         low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy,
-        /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache);
+        /* max_upper_hash_bits */ 32 - num_shard_bits, alloc, secondary_cache);
   });
 }
 
-void* LRUCache::Value(Handle* handle) {
+Cache::ObjectPtr LRUCache::Value(Handle* handle) {
   auto h = reinterpret_cast<LRUHandle*>(handle);
   assert(!h->IsPending() || h->value == nullptr);
-  assert(h->value != kDummyValueMarker);
+  assert(h->value != &kDummyValue);
   return h->value;
 }
 
@@ -813,13 +810,10 @@ size_t LRUCache::GetCharge(Handle* handle) const {
       GetShard(0).metadata_charge_policy_);
 }
 
-Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
+const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
+    Handle* handle) const {
   auto h = reinterpret_cast<LRUHandle*>(handle);
-  if (h->IsSecondaryCacheCompatible()) {
-    return h->info_.helper->del_cb;
-  } else {
-    return h->info_.deleter;
-  }
+  return h->helper;
 }
 
 size_t LRUCache::TEST_GetLRUSize() {
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index 99b2f2b2040..1edccd0ce2e 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -13,6 +13,7 @@
 
 #include "cache/sharded_cache.h"
 #include "port/lang.h"
+#include "port/likely.h"
 #include "port/malloc.h"
 #include "port/port.h"
 #include "rocksdb/secondary_cache.h"
@@ -48,13 +49,8 @@ namespace lru_cache {
 // While refs > 0, public properties like value and deleter must not change.
 struct LRUHandle {
-  void* value;
-  union Info {
-    Info() {}
-    ~Info() {}
-    Cache::DeleterFn deleter;
-    const Cache::CacheItemHelper* helper;
-  } info_;
+  Cache::ObjectPtr value;
+  const Cache::CacheItemHelper* helper;
   // An entry is not added to the LRUHandleTable until the secondary cache
   // lookup is complete, so it's safe to have this union.
   union {
@@ -93,14 +89,12 @@ struct LRUHandle {
     IM_IS_HIGH_PRI = (1 << 0),
     // Whether this entry is low priority entry.
     IM_IS_LOW_PRI = (1 << 1),
-    // Can this be inserted into the secondary cache.
-    IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
     // Is the handle still being read from a lower tier.
-    IM_IS_PENDING = (1 << 3),
+    IM_IS_PENDING = (1 << 2),
     // Whether this handle is still in a lower tier
-    IM_IS_IN_SECONDARY_CACHE = (1 << 4),
+    IM_IS_IN_SECONDARY_CACHE = (1 << 3),
     // Marks result handles that should not be inserted into cache
-    IM_IS_STANDALONE = (1 << 5),
+    IM_IS_STANDALONE = (1 << 4),
   };
 
   // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
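The net effect of the `lru_cache.h` hunks above is that `LRUHandle` no longer carries a `DeleterFn`/helper union plus an `IM_IS_SECONDARY_CACHE_COMPATIBLE` flag: every handle now points at a single `const Cache::CacheItemHelper*`, destruction funnels through `helper->del_cb(value, allocator)`, and secondary-cache compatibility is derived from whether the helper has a `size_cb`. A minimal sketch of a helper under the new scheme, modeled on `kDeleteCounterHelper` from the test changes further down (the `MyValue`/`kMyValueHelper` names are illustrative, not part of this patch):

```cpp
struct MyValue {
  int payload = 0;
};

// Role plus delete callback only. Leaving size_cb/saveto_cb unset means
// IsSecondaryCacheCompatible() reports false for entries using this helper.
const Cache::CacheItemHelper kMyValueHelper{
    CacheEntryRole::kMisc,
    [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
      delete static_cast<MyValue*>(value);
    }};
```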
@@ -130,9 +124,7 @@ struct LRUHandle { bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; } bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; } bool HasHit() const { return m_flags & M_HAS_HIT; } - bool IsSecondaryCacheCompatible() const { - return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE; - } + bool IsSecondaryCacheCompatible() const { return helper->size_cb != nullptr; } bool IsPending() const { return im_flags & IM_IS_PENDING; } bool IsInSecondaryCache() const { return im_flags & IM_IS_IN_SECONDARY_CACHE; @@ -178,14 +170,6 @@ struct LRUHandle { void SetHit() { m_flags |= M_HAS_HIT; } - void SetSecondaryCacheCompatible(bool compat) { - if (compat) { - im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE; - } else { - im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE; - } - } - void SetIsPending(bool pending) { if (pending) { im_flags |= IM_IS_PENDING; @@ -210,22 +194,19 @@ struct LRUHandle { } } - void Free() { + void Free(MemoryAllocator* allocator) { assert(refs == 0); - if (!IsSecondaryCacheCompatible() && info_.deleter) { - (*info_.deleter)(key(), value); - } else if (IsSecondaryCacheCompatible()) { - if (IsPending()) { - assert(sec_handle != nullptr); - SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; - tmp_sec_handle->Wait(); - value = tmp_sec_handle->Value(); - delete tmp_sec_handle; - } - if (value) { - (*info_.helper->del_cb)(key(), value); - } + if (UNLIKELY(IsPending())) { + assert(sec_handle != nullptr); + SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; + tmp_sec_handle->Wait(); + value = tmp_sec_handle->Value(); + delete tmp_sec_handle; + } + assert(helper); + if (helper->del_cb) { + helper->del_cb(value, allocator); } free(this); @@ -267,7 +248,7 @@ struct LRUHandle { // 4.4.3's builtin hashtable. class LRUHandleTable { public: - explicit LRUHandleTable(int max_upper_hash_bits); + explicit LRUHandleTable(int max_upper_hash_bits, MemoryAllocator* allocator); ~LRUHandleTable(); LRUHandle* Lookup(const Slice& key, uint32_t hash); @@ -291,6 +272,8 @@ class LRUHandleTable { size_t GetOccupancyCount() const { return elems_; } + MemoryAllocator* GetAllocator() const { return allocator_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -312,6 +295,9 @@ class LRUHandleTable { // Set from max_upper_hash_bits (see constructor). const int max_length_bits_; + + // From Cache, needed for delete + MemoryAllocator* const allocator_; }; // A single shard of sharded cache. @@ -321,7 +307,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { double high_pri_pool_ratio, double low_pri_pool_ratio, bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, - int max_upper_hash_bits, SecondaryCache* secondary_cache); + int max_upper_hash_bits, MemoryAllocator* allocator, + SecondaryCache* secondary_cache); public: // Type definitions expected as parameter to ShardedCache using HandleImpl = LRUHandle; @@ -348,26 +335,15 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { void SetLowPriorityPoolRatio(double low_pri_pool_ratio); // Like Cache methods, but with an extra "hash" parameter. 
-  inline Status Insert(const Slice& key, uint32_t hash, void* value,
-                       size_t charge, Cache::DeleterFn deleter,
-                       LRUHandle** handle, Cache::Priority priority) {
-    return Insert(key, hash, value, charge, deleter, nullptr, handle,
-                  priority);
-  }
-  inline Status Insert(const Slice& key, uint32_t hash, void* value,
-                       const Cache::CacheItemHelper* helper, size_t charge,
-                       LRUHandle** handle, Cache::Priority priority) {
-    assert(helper);
-    return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
-  }
-  // If helper_cb is null, the values of the following arguments don't matter.
+  Status Insert(const Slice& key, uint32_t hash, Cache::ObjectPtr value,
+                const Cache::CacheItemHelper* helper, size_t charge,
+                LRUHandle** handle, Cache::Priority priority);
+
   LRUHandle* Lookup(const Slice& key, uint32_t hash,
                     const Cache::CacheItemHelper* helper,
-                    const Cache::CreateCallback& create_cb,
+                    Cache::CreateContext* create_context,
                     Cache::Priority priority, bool wait, Statistics* stats);
-  inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
-    return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
-                  nullptr);
-  }
+
   bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
   bool IsReady(LRUHandle* /*handle*/);
   void Wait(LRUHandle* /*handle*/) {}
@@ -384,8 +360,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
   size_t GetTableAddressCount() const;
 
   void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               DeleterFn deleter)>& callback,
+      const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper)>& callback,
       size_t average_entries_per_lock, size_t* state);
 
   void EraseUnRefEntries();
@@ -414,9 +391,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
   // nullptr.
   Status InsertItem(LRUHandle* item, LRUHandle** handle,
                     bool free_handle_on_fail);
-  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
-                DeleterFn deleter, const Cache::CacheItemHelper* helper,
-                LRUHandle** handle, Cache::Priority priority);
   // Promote an item looked up from the secondary cache to the LRU cache.
   // The item may be still in the secondary cache.
   // It is only inserted into the hash table and not the LRU list, and only
@@ -521,9 +495,9 @@ class LRUCache
               kDontChargeCacheMetadata,
       std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
   const char* Name() const override { return "LRUCache"; }
-  void* Value(Handle* handle) override;
+  ObjectPtr Value(Handle* handle) override;
   size_t GetCharge(Handle* handle) const override;
-  DeleterFn GetDeleter(Handle* handle) const override;
+  const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
   void WaitAll(std::vector<Handle*>& handles) override;
 
   // Retrieves number of elements in LRU, for unit test purpose only.
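For orientation, this is how a caller drives the reworked shard API end to end, mirroring the updated `lru_cache_test.cc` below (a usage sketch, not code from this patch; the hash value 0 and `kNoopCacheItemHelper` placeholder follow the tests):

```cpp
// Insert: value and helper now precede charge, and there is no separate
// deleter argument.
Status s = shard->Insert("key", 0 /*hash*/, nullptr /*value*/,
                         &kNoopCacheItemHelper, 1 /*charge*/,
                         nullptr /*handle*/, Cache::Priority::LOW);

// Lookup: a Cache::CreateContext* replaces the old CreateCallback, and the
// convenience two-argument overload is gone.
LRUHandle* h = shard->Lookup("key", 0 /*hash*/, nullptr /*helper*/,
                             nullptr /*create_context*/, Cache::Priority::LOW,
                             true /*wait*/, nullptr /*stats*/);
if (h != nullptr) {
  shard->Release(h, true /*useful*/, false /*erase_if_last_ref*/);
}
```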
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index fbf336f8733..f84312cb3f8 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -10,7 +10,7 @@
 
 #include "cache/cache_key.h"
 #include "cache/clock_cache.h"
-#include "cache/fast_lru_cache.h"
+#include "cache_helpers.h"
 #include "db/db_test_util.h"
 #include "file/sst_file_manager_impl.h"
 #include "port/port.h"
@@ -20,6 +20,7 @@
 #include "rocksdb/sst_file_manager.h"
 #include "rocksdb/utilities/cache_dump_load.h"
 #include "test_util/testharness.h"
+#include "typed_cache.h"
 #include "util/coding.h"
 #include "util/random.h"
 #include "utilities/cache_dump_load_impl.h"
@@ -50,14 +51,15 @@ class LRUCacheTest : public testing::Test {
                          high_pri_pool_ratio, low_pri_pool_ratio,
                          use_adaptive_mutex, kDontChargeCacheMetadata,
                          /*max_upper_hash_bits=*/24,
+                         /*allocator*/ nullptr,
                         /*secondary_cache=*/nullptr);
   }
 
   void Insert(const std::string& key,
               Cache::Priority priority = Cache::Priority::LOW) {
-    EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
-                             nullptr /*deleter*/, nullptr /*handle*/,
-                             priority));
+    EXPECT_OK(cache_->Insert(key, 0 /*hash*/, nullptr /*value*/,
+                             &kNoopCacheItemHelper, 1 /*charge*/,
+                             nullptr /*handle*/, priority));
   }
 
   void Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
@@ -65,7 +67,8 @@ class LRUCacheTest : public testing::Test {
   }
 
   bool Lookup(const std::string& key) {
-    auto handle = cache_->Lookup(key, 0 /*hash*/);
+    auto handle = cache_->Lookup(key, 0 /*hash*/, nullptr, nullptr,
+                                 Cache::Priority::LOW, true, nullptr);
     if (handle) {
       cache_->Release(handle, true /*useful*/, false /*erase*/);
       return true;
@@ -364,152 +367,14 @@ TEST_F(LRUCacheTest, EntriesWithPriority) {
   ValidateLRUList({"x", "y", "g", "z", "d", "m"}, 2, 2, 2);
 }
 
-// TODO: FastLRUCache and ClockCache use the same tests. We can probably remove
-// them from FastLRUCache after ClockCache becomes productive, and we don't
-// plan to use or maintain FastLRUCache any more.
-namespace fast_lru_cache {
-
-// TODO(guido) Replicate LRU policy tests from LRUCache here.
-class FastLRUCacheTest : public testing::Test {
- public:
-  FastLRUCacheTest() {}
-  ~FastLRUCacheTest() override { DeleteCache(); }
-
-  void DeleteCache() {
-    if (cache_ != nullptr) {
-      cache_->~LRUCacheShard();
-      port::cacheline_aligned_free(cache_);
-      cache_ = nullptr;
-    }
-  }
-
-  void NewCache(size_t capacity) {
-    DeleteCache();
-    cache_ = reinterpret_cast<LRUCacheShard*>(
-        port::cacheline_aligned_alloc(sizeof(LRUCacheShard)));
-    new (cache_) LRUCacheShard(capacity, 1 /*estimated_value_size*/,
-                               false /*strict_capacity_limit*/,
-                               kDontChargeCacheMetadata);
-  }
-
-  Status Insert(const std::string& key) {
-    return cache_->Insert(key, 0 /*hash*/, nullptr /*value*/, 1 /*charge*/,
-                          nullptr /*deleter*/, nullptr /*handle*/,
-                          Cache::Priority::LOW);
-  }
-
-  Status Insert(char key, size_t len) { return Insert(std::string(len, key)); }
-
-  size_t CalcEstimatedHandleChargeWrapper(
-      size_t estimated_value_size,
-      CacheMetadataChargePolicy metadata_charge_policy) {
-    return LRUCacheShard::CalcEstimatedHandleCharge(estimated_value_size,
-                                                    metadata_charge_policy);
-  }
-
-  int CalcHashBitsWrapper(size_t capacity, size_t estimated_value_size,
-                          CacheMetadataChargePolicy metadata_charge_policy) {
-    return LRUCacheShard::CalcHashBits(capacity, estimated_value_size,
-                                       metadata_charge_policy);
-  }
-
-  // Maximum number of items that a shard can hold.
- double CalcMaxOccupancy(size_t capacity, size_t estimated_value_size, - CacheMetadataChargePolicy metadata_charge_policy) { - size_t handle_charge = LRUCacheShard::CalcEstimatedHandleCharge( - estimated_value_size, metadata_charge_policy); - return capacity / (kLoadFactor * handle_charge); - } - bool TableSizeIsAppropriate(int hash_bits, double max_occupancy) { - if (hash_bits == 0) { - return max_occupancy <= 1; - } else { - return (1 << hash_bits >= max_occupancy) && - (1 << (hash_bits - 1) <= max_occupancy); - } - } - - private: - LRUCacheShard* cache_ = nullptr; -}; - -TEST_F(FastLRUCacheTest, ValidateKeySize) { - NewCache(3); - EXPECT_OK(Insert('a', 16)); - EXPECT_NOK(Insert('b', 15)); - EXPECT_OK(Insert('b', 16)); - EXPECT_NOK(Insert('c', 17)); - EXPECT_NOK(Insert('d', 1000)); - EXPECT_NOK(Insert('e', 11)); - EXPECT_NOK(Insert('f', 0)); -} - -TEST_F(FastLRUCacheTest, CalcHashBitsTest) { - size_t capacity; - size_t estimated_value_size; - double max_occupancy; - int hash_bits; - CacheMetadataChargePolicy metadata_charge_policy; - // Vary the cache capacity, fix the element charge. - for (int i = 0; i < 2048; i++) { - capacity = i; - estimated_value_size = 0; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, - metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); - } - // Fix the cache capacity, vary the element charge. - for (int i = 0; i < 1024; i++) { - capacity = 1024; - estimated_value_size = i; - metadata_charge_policy = kFullChargeCacheMetadata; - max_occupancy = CalcMaxOccupancy(capacity, estimated_value_size, - metadata_charge_policy); - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy)); - } - // Zero-capacity cache, and only values have charge. - capacity = 0; - estimated_value_size = 1; - metadata_charge_policy = kDontChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); - // Zero-capacity cache, and only metadata has charge. - capacity = 0; - estimated_value_size = 0; - metadata_charge_policy = kFullChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); - // Small cache, large elements. - capacity = 1024; - estimated_value_size = 8192; - metadata_charge_policy = kFullChargeCacheMetadata; - hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size, - metadata_charge_policy); - EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, 0 /* max_occupancy */)); - // Large capacity. 
-  capacity = 31924172;
-  estimated_value_size = 8192;
-  metadata_charge_policy = kFullChargeCacheMetadata;
-  max_occupancy =
-      CalcMaxOccupancy(capacity, estimated_value_size, metadata_charge_policy);
-  hash_bits = CalcHashBitsWrapper(capacity, estimated_value_size,
-                                  metadata_charge_policy);
-  EXPECT_TRUE(TableSizeIsAppropriate(hash_bits, max_occupancy));
-}
-
-}  // namespace fast_lru_cache
-
-namespace hyper_clock_cache {
+namespace clock_cache {
 
 class ClockCacheTest : public testing::Test {
  public:
+  using Shard = HyperClockCache::Shard;
+  using Table = HyperClockTable;
+  using HandleImpl = Shard::HandleImpl;
+
   ClockCacheTest() {}
   ~ClockCacheTest() override { DeleteShard(); }
 
@@ -523,17 +388,20 @@ class ClockCacheTest : public testing::Test {
 
   void NewShard(size_t capacity, bool strict_capacity_limit = true) {
     DeleteShard();
-    shard_ = reinterpret_cast<ClockCacheShard*>(
-        port::cacheline_aligned_alloc(sizeof(ClockCacheShard)));
-    new (shard_) ClockCacheShard(capacity, 1, strict_capacity_limit,
-                                 kDontChargeCacheMetadata);
+    shard_ =
+        reinterpret_cast<Shard*>(port::cacheline_aligned_alloc(sizeof(Shard)));
+
+    Table::Opts opts;
+    opts.estimated_value_size = 1;
+    new (shard_) Shard(capacity, strict_capacity_limit,
+                       kDontChargeCacheMetadata, /*allocator*/ nullptr, opts);
   }
 
   Status Insert(const UniqueId64x2& hashed_key,
                Cache::Priority priority = Cache::Priority::LOW) {
     return shard_->Insert(TestKey(hashed_key), hashed_key, nullptr /*value*/,
-                          1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
-                          priority);
+                          &kNoopCacheItemHelper, 1 /*charge*/,
+                          nullptr /*handle*/, priority);
   }
 
   Status Insert(char key, Cache::Priority priority = Cache::Priority::LOW) {
@@ -543,8 +411,8 @@ class ClockCacheTest : public testing::Test {
   Status InsertWithLen(char key, size_t len) {
     std::string skey(len, key);
     return shard_->Insert(skey, TestHashedKey(key), nullptr /*value*/,
-                          1 /*charge*/, nullptr /*deleter*/, nullptr /*handle*/,
-                          Cache::Priority::LOW);
+                          &kNoopCacheItemHelper, 1 /*charge*/,
+                          nullptr /*handle*/, Cache::Priority::LOW);
   }
 
   bool Lookup(const Slice& key, const UniqueId64x2& hashed_key,
@@ -580,7 +448,7 @@ class ClockCacheTest : public testing::Test {
     return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U};
   }
 
-  ClockCacheShard* shard_ = nullptr;
+  Shard* shard_ = nullptr;
 };
 
 TEST_F(ClockCacheTest, Misc) {
@@ -604,7 +472,8 @@ TEST_F(ClockCacheTest, Misc) {
 }
 
 TEST_F(ClockCacheTest, Limits) {
-  NewShard(3, false /*strict_capacity_limit*/);
+  constexpr size_t kCapacity = 3;
+  NewShard(kCapacity, false /*strict_capacity_limit*/);
   for (bool strict_capacity_limit : {false, true, false}) {
     SCOPED_TRACE("strict_capacity_limit = " +
                 std::to_string(strict_capacity_limit));
@@ -617,7 +486,7 @@ TEST_F(ClockCacheTest, Limits) {
     // Single entry charge beyond capacity
     {
       Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
-                                5 /*charge*/, nullptr /*deleter*/,
+                                &kNoopCacheItemHelper, 5 /*charge*/,
                                nullptr /*handle*/, Cache::Priority::LOW);
       if (strict_capacity_limit) {
         EXPECT_TRUE(s.IsMemoryLimit());
@@ -628,9 +497,9 @@ TEST_F(ClockCacheTest, Limits) {
 
     // Single entry fills capacity
     {
-      ClockHandle* h;
+      HandleImpl* h;
       ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
-                               3 /*charge*/, nullptr /*deleter*/, &h,
+                               &kNoopCacheItemHelper, 3 /*charge*/, &h,
                               Cache::Priority::LOW));
       // Try to insert more
      Status s = Insert('a');
@@ -644,16 +513,19 @@ TEST_F(ClockCacheTest, Limits) {
      shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/);
    }
 
-    // Insert more than table size can handle (cleverly using zero-charge
-    // entries) to exceed occupancy limit.
+    // Insert more than table size can handle to exceed occupancy limit.
+    // (Cleverly using mostly zero-charge entries, but some non-zero to
+    // verify usage tracking on detached entries.)
     {
       size_t n = shard_->GetTableAddressCount() + 1;
-      std::unique_ptr<ClockHandle* []> ha { new ClockHandle* [n] {} };
+      std::unique_ptr<HandleImpl* []> ha { new HandleImpl* [n] {} };
       Status s;
       for (size_t i = 0; i < n && s.ok(); ++i) {
         hkey[1] = i;
-        s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, 0 /*charge*/,
-                           nullptr /*deleter*/, &ha[i], Cache::Priority::LOW);
+        s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/,
+                           &kNoopCacheItemHelper,
+                           (i + kCapacity < n) ? 0 : 1 /*charge*/, &ha[i],
+                           Cache::Priority::LOW);
         if (i == 0) {
           EXPECT_OK(s);
         }
@@ -791,18 +663,25 @@ TEST_F(ClockCacheTest, ClockEvictionTest) {
   }
 }
 
-void IncrementIntDeleter(const Slice& /*key*/, void* value) {
-  *reinterpret_cast<int*>(value) += 1;
-}
+namespace {
+struct DeleteCounter {
+  int deleted = 0;
+};
+const Cache::CacheItemHelper kDeleteCounterHelper{
+    CacheEntryRole::kMisc,
+    [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
+      static_cast<DeleteCounter*>(value)->deleted += 1;
+    }};
+}  // namespace
 
 // Testing calls to CorrectNearOverflow in Release
 TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
   NewShard(6, /*strict_capacity_limit*/ false);
-  ClockHandle* h;
-  int deleted = 0;
+  HandleImpl* h;
+  DeleteCounter val;
   UniqueId64x2 hkey = TestHashedKey('x');
-  ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &deleted, 1,
-                           IncrementIntDeleter, &h, Cache::Priority::HIGH));
+  ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper, 1,
+                           &h, Cache::Priority::HIGH));
 
   // Some large number outstanding
   shard_->TEST_RefN(h, 123456789);
@@ -822,36 +701,36 @@ TEST_F(ClockCacheTest, ClockCounterOverflowTest) {
   // Free all but last 1
   shard_->TEST_ReleaseN(h, 123456789);
   // Still alive
-  ASSERT_EQ(deleted, 0);
+  ASSERT_EQ(val.deleted, 0);
   // Free last ref, which will finalize erasure
   shard_->Release(h);
   // Deleted
-  ASSERT_EQ(deleted, 1);
+  ASSERT_EQ(val.deleted, 1);
 }
 
 // This test is mostly to exercise some corner case logic, by forcing two
 // keys to have the same hash, and more
 TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
   NewShard(6, /*strict_capacity_limit*/ false);
-  int deleted = 0;
+  DeleteCounter val;
   UniqueId64x2 hkey1 = TestHashedKey('x');
   Slice key1 = TestKey(hkey1);
   UniqueId64x2 hkey2 = TestHashedKey('y');
   Slice key2 = TestKey(hkey2);
   UniqueId64x2 hkey3 = TestHashedKey('z');
   Slice key3 = TestKey(hkey3);
-  ClockHandle* h1;
-  ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, &h1,
+  HandleImpl* h1;
+  ASSERT_OK(shard_->Insert(key1, hkey1, &val, &kDeleteCounterHelper, 1, &h1,
                            Cache::Priority::HIGH));
-  ClockHandle* h2;
-  ASSERT_OK(shard_->Insert(key2, hkey2, &deleted, 1, IncrementIntDeleter, &h2,
+  HandleImpl* h2;
+  ASSERT_OK(shard_->Insert(key2, hkey2, &val, &kDeleteCounterHelper, 1, &h2,
                            Cache::Priority::HIGH));
-  ClockHandle* h3;
-  ASSERT_OK(shard_->Insert(key3, hkey3, &deleted, 1, IncrementIntDeleter, &h3,
+  HandleImpl* h3;
+  ASSERT_OK(shard_->Insert(key3, hkey3, &val, &kDeleteCounterHelper, 1, &h3,
                            Cache::Priority::HIGH));
 
   // Can repeatedly lookup+release despite the hash collision
-  ClockHandle* tmp_h;
+  HandleImpl* tmp_h;
   for (bool erase_if_last_ref : {true, false}) {  // but not last ref
     tmp_h = shard_->Lookup(key1, hkey1);
     ASSERT_EQ(h1, tmp_h);
@@ -872,7 +751,7 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) {
   shard_->Erase(key1, hkey1);
 
   // All still alive
ASSERT_EQ(deleted, 0); + ASSERT_EQ(val.deleted, 0); // Invisible to Lookup tmp_h = shard_->Lookup(key1, hkey1); @@ -890,8 +769,8 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { } // Also Insert with invisible entry there - ASSERT_OK(shard_->Insert(key1, hkey1, &deleted, 1, IncrementIntDeleter, - nullptr, Cache::Priority::HIGH)); + ASSERT_OK(shard_->Insert(key1, hkey1, &val, &kDeleteCounterHelper, 1, nullptr, + Cache::Priority::HIGH)); tmp_h = shard_->Lookup(key1, hkey1); // Found but distinct handle ASSERT_NE(nullptr, tmp_h); @@ -899,13 +778,13 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true)); // tmp_h deleted - ASSERT_EQ(deleted--, 1); + ASSERT_EQ(val.deleted--, 1); // Release last ref on h1 (already invisible) ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false)); // h1 deleted - ASSERT_EQ(deleted--, 1); + ASSERT_EQ(val.deleted--, 1); h1 = nullptr; // Can still find h2, h3 @@ -923,7 +802,7 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false)); // h2 still not deleted (unreferenced in cache) - ASSERT_EQ(deleted, 0); + ASSERT_EQ(val.deleted, 0); // Can still find it tmp_h = shard_->Lookup(key2, hkey2); @@ -933,7 +812,7 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true)); // h2 deleted - ASSERT_EQ(deleted--, 1); + ASSERT_EQ(val.deleted--, 1); tmp_h = shard_->Lookup(key2, hkey2); ASSERT_EQ(nullptr, tmp_h); @@ -948,13 +827,13 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false)); // h3 still not deleted (unreferenced in cache) - ASSERT_EQ(deleted, 0); + ASSERT_EQ(val.deleted, 0); // Explicit erase shard_->Erase(key3, hkey3); // h3 deleted - ASSERT_EQ(deleted--, 1); + ASSERT_EQ(val.deleted--, 1); tmp_h = shard_->Lookup(key3, hkey3); ASSERT_EQ(nullptr, tmp_h); } @@ -999,7 +878,7 @@ TEST_F(ClockCacheTest, TableSizesTest) { } } -} // namespace hyper_clock_cache +} // namespace clock_cache class TestSecondaryCache : public SecondaryCache { public: @@ -1017,12 +896,12 @@ class TestSecondaryCache : public SecondaryCache { using ResultMap = std::unordered_map; explicit TestSecondaryCache(size_t capacity) - : num_inserts_(0), num_lookups_(0), inject_failure_(false) { - cache_ = - NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, nullptr, - kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); - } - ~TestSecondaryCache() override { cache_.reset(); } + : cache_(NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, + nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata)), + num_inserts_(0), + num_lookups_(0), + inject_failure_(false) {} const char* Name() const override { return "TestSecondaryCache"; } @@ -1030,7 +909,7 @@ class TestSecondaryCache : public SecondaryCache { void ResetInjectFailure() { inject_failure_ = false; } - Status Insert(const Slice& key, void* value, + Status Insert(const Slice& key, Cache::ObjectPtr value, const Cache::CacheItemHelper* helper) override { if (inject_failure_) { return Status::Corruption("Insertion Data Corrupted"); @@ -1049,14 +928,12 @@ class TestSecondaryCache : public SecondaryCache { delete[] buf; return s; } - return cache_->Insert(key, buf, size, - [](const Slice& /*key*/, void* val) -> void { - delete[] static_cast(val); - }); + return cache_.Insert(key, buf, size); } std::unique_ptr Lookup( - const Slice& key, const Cache::CreateCallback& create_cb, bool 
/*wait*/, + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool /*wait*/, bool /*advise_erase*/, bool& is_in_sec_cache) override { std::string key_str = key.ToString(); TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); @@ -1072,24 +949,25 @@ class TestSecondaryCache : public SecondaryCache { return secondary_handle; } - Cache::Handle* handle = cache_->Lookup(key); + TypedHandle* handle = cache_.Lookup(key); num_lookups_++; if (handle) { - void* value = nullptr; + Cache::ObjectPtr value = nullptr; size_t charge = 0; Status s; if (type != ResultType::DEFER_AND_FAIL) { - char* ptr = (char*)cache_->Value(handle); + char* ptr = cache_.Value(handle); size_t size = DecodeFixed64(ptr); ptr += sizeof(uint64_t); - s = create_cb(ptr, size, &value, &charge); + s = helper->create_cb(Slice(ptr, size), create_context, + /*alloc*/ nullptr, &value, &charge); } if (s.ok()) { secondary_handle.reset(new TestSecondaryCacheResultHandle( cache_.get(), handle, value, charge, type)); is_in_sec_cache = true; } else { - cache_->Release(handle); + cache_.Release(handle); } } return secondary_handle; @@ -1128,7 +1006,8 @@ class TestSecondaryCache : public SecondaryCache { class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { public: TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle, - void* value, size_t size, ResultType type) + Cache::ObjectPtr value, size_t size, + ResultType type) : cache_(cache), handle_(handle), value_(value), @@ -1145,7 +1024,7 @@ class TestSecondaryCache : public SecondaryCache { void Wait() override {} - void* Value() override { + Cache::ObjectPtr Value() override { assert(is_ready_); return value_; } @@ -1157,12 +1036,15 @@ class TestSecondaryCache : public SecondaryCache { private: Cache* cache_; Cache::Handle* handle_; - void* value_; + Cache::ObjectPtr value_; size_t size_; bool is_ready_; }; - std::shared_ptr cache_; + using SharedCache = + BasicTypedSharedCacheInterface; + using TypedHandle = SharedCache::TypedHandle; + SharedCache cache_; uint32_t num_inserts_; uint32_t num_lookups_; bool inject_failure_; @@ -1182,7 +1064,8 @@ class DBSecondaryCacheTest : public DBTestBase { std::unique_ptr fault_env_; }; -class LRUCacheSecondaryCacheTest : public LRUCacheTest { +class LRUCacheSecondaryCacheTest : public LRUCacheTest, + public Cache::CreateContext { public: LRUCacheSecondaryCacheTest() : fail_create_(false) {} ~LRUCacheSecondaryCacheTest() {} @@ -1204,13 +1087,13 @@ class LRUCacheSecondaryCacheTest : public LRUCacheTest { size_t size_; }; - static size_t SizeCallback(void* obj) { - return reinterpret_cast(obj)->Size(); + static size_t SizeCallback(Cache::ObjectPtr obj) { + return static_cast(obj)->Size(); } - static Status SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out) { - TestItem* item = reinterpret_cast(from_obj); + static Status SaveToCallback(Cache::ObjectPtr from_obj, size_t from_offset, + size_t length, char* out) { + TestItem* item = static_cast(from_obj); char* buf = item->Buf(); EXPECT_EQ(length, item->Size()); EXPECT_EQ(from_offset, 0); @@ -1218,27 +1101,30 @@ class LRUCacheSecondaryCacheTest : public LRUCacheTest { return Status::OK(); } - static void DeletionCallback(const Slice& /*key*/, void* obj) { - delete reinterpret_cast(obj); + static void DeletionCallback(Cache::ObjectPtr obj, + MemoryAllocator* /*alloc*/) { + delete static_cast(obj); } static Cache::CacheItemHelper helper_; - static Status SaveToCallbackFail(void* /*obj*/, size_t 
/*offset*/, - size_t /*size*/, void* /*out*/) { + static Status SaveToCallbackFail(Cache::ObjectPtr /*from_obj*/, + size_t /*from_offset*/, size_t /*length*/, + char* /*out*/) { return Status::NotSupported(); } static Cache::CacheItemHelper helper_fail_; - Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size, - void** out_obj, - size_t* charge) -> Status { - if (fail_create_) { + static Status CreateCallback(const Slice& data, Cache::CreateContext* context, + MemoryAllocator* /*allocator*/, + Cache::ObjectPtr* out_obj, size_t* out_charge) { + auto t = static_cast(context); + if (t->fail_create_) { return Status::NotSupported(); } - *out_obj = reinterpret_cast(new TestItem((char*)buf, size)); - *charge = size; + *out_obj = new TestItem(data.data(), data.size()); + *out_charge = data.size(); return Status::OK(); }; @@ -1248,15 +1134,17 @@ class LRUCacheSecondaryCacheTest : public LRUCacheTest { bool fail_create_; }; -Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_( +Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_{ + CacheEntryRole::kMisc, LRUCacheSecondaryCacheTest::DeletionCallback, LRUCacheSecondaryCacheTest::SizeCallback, LRUCacheSecondaryCacheTest::SaveToCallback, - LRUCacheSecondaryCacheTest::DeletionCallback); + LRUCacheSecondaryCacheTest::CreateCallback}; -Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_fail_( +Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_fail_{ + CacheEntryRole::kMisc, LRUCacheSecondaryCacheTest::DeletionCallback, LRUCacheSecondaryCacheTest::SizeCallback, LRUCacheSecondaryCacheTest::SaveToCallbackFail, - LRUCacheSecondaryCacheTest::DeletionCallback); + LRUCacheSecondaryCacheTest::CreateCallback}; TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, @@ -1265,14 +1153,19 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); std::shared_ptr secondary_cache = - std::make_shared(2048); + std::make_shared(4096); opts.secondary_cache = secondary_cache; std::shared_ptr cache = NewLRUCache(opts); std::shared_ptr stats = CreateDBStatistics(); CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k3 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); + // Start with warming k3 + std::string str3 = rnd.RandomString(1021); + ASSERT_OK(secondary_cache->InsertSaved(k3.AsSlice(), str3)); + std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); ASSERT_OK(cache->Insert(k1.AsSlice(), item1, @@ -1287,17 +1180,29 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { Cache::Handle* handle; handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true, stats.get()); + /*context*/ this, Cache::Priority::LOW, true, stats.get()); ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str2.size()); cache->Release(handle); + // This lookup should promote k1 and demote k2 handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true, stats.get()); + /*context*/ this, Cache::Priority::LOW, true, stats.get()); ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str1.size()); cache->Release(handle); - ASSERT_EQ(secondary_cache->num_inserts(), 2u); - 
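// (Expectation update: the k3 warm-up at the top of this test adds one
// secondary-cache insert via InsertSaved(), and the later Lookup(k3) adds
// one secondary-cache lookup, so the old 2u/1u counts become 3u/2u.)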
ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + // This lookup should promote k3 and demote k1 + handle = + cache->Lookup(k3.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, + /*context*/ this, Cache::Priority::LOW, true, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str3.size()); + cache->Release(handle); + + ASSERT_EQ(secondary_cache->num_inserts(), 3u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS), secondary_cache->num_lookups()); PerfContext perf_ctx = *get_perf_context(); @@ -1323,18 +1228,19 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) { Random rnd(301); std::string str1 = rnd.RandomString(1020); auto item1 = std::make_unique(str1.data(), str1.length()); - ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, str1.length()) - .IsInvalidArgument()); + // NOTE: changed to assert helper != nullptr for efficiency / code size + // ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, + // str1.length()).IsInvalidArgument()); ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(), &LRUCacheSecondaryCacheTest::helper_, str1.length())); item1.release(); // Appease clang-analyze "potential memory leak" Cache::Handle* handle; - handle = cache->Lookup(k2.AsSlice(), nullptr, test_item_creator, + handle = cache->Lookup(k2.AsSlice(), nullptr, /*context*/ this, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, false); + /*context*/ this, Cache::Priority::LOW, false); ASSERT_EQ(handle, nullptr); cache.reset(); @@ -1372,18 +1278,18 @@ TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) { Cache::Handle* handle; handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 demotion would have failed handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1420,16 +1326,16 @@ TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) { Cache::Handle* handle; SetFailCreate(true); handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 creation would have failed handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); 
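// For reference, the CreateContext pattern these Lookup() calls rely on,
// reduced to a minimal sketch. MyContext and MyCreate are hypothetical
// names; TestItem and the real callbacks are defined earlier in this file.
// The context object replaces the old std::function capture, so one static
// create_cb (bundled into a CacheItemHelper together with del_cb, size_cb,
// and save_cb) can be shared by every lookup site:
//
//   struct MyContext : public Cache::CreateContext {
//     bool fail = false;
//   };
//   static Status MyCreate(const Slice& data, Cache::CreateContext* ctx,
//                          MemoryAllocator* /*alloc*/,
//                          Cache::ObjectPtr* out_obj, size_t* out_charge) {
//     if (static_cast<MyContext*>(ctx)->fail) {
//       return Status::NotSupported();
//     }
//     *out_obj = new TestItem(data.data(), data.size());
//     *out_charge = data.size();
//     return Status::OK();
//   }
//
//   MyContext ctx;
//   cache->Lookup(key, &helper_with_MyCreate /* hypothetical helper */, &ctx,
//                 Cache::Priority::LOW, /*wait*/ true);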
ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1465,19 +1371,19 @@ TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) { Cache::Handle* handle; handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); // k1 promotion should fail due to the block cache being at capacity, // but the lookup should still succeed Cache::Handle* handle2; handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle2, nullptr); // Since k1 didn't get inserted, k2 should still be in cache cache->Release(handle); cache->Release(handle2); handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, true); + /*context*/ this, Cache::Priority::LOW, true); ASSERT_NE(handle, nullptr); cache->Release(handle); ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1878,6 +1784,43 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { Destroy(options); } +TEST_F(DBSecondaryCacheTest, TestSecondaryWithCompressedCache) { + if (!Snappy_Supported()) { + ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support"); + return; + } + LRUCacheOptions opts(2000 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, + 0.5 /* high_pri_pool_ratio */, + nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache_compressed = cache; + table_options.no_block_cache = true; + table_options.block_size = 1234; + Options options = GetDefaultOptions(); + options.compression = kSnappyCompression; + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + // Partly compressible + std::string p_v = rnd.RandomString(507) + std::string(500, ' '); + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + for (int i = 0; i < 2 * N; i++) { + std::string v = Get(Key(i % N)); + ASSERT_EQ(1007, v.size()); + } +} + TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { LRUCacheOptions opts(1024 /* capacity */, 2 /* num_shard_bits */, false /* strict_capacity_limit */, @@ -1917,7 +1860,7 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { for (int i = 0; i < 6; ++i) { results.emplace_back(cache->Lookup( ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - test_item_creator, Cache::Priority::LOW, false)); + /*context*/ this, Cache::Priority::LOW, false)); } cache->WaitAll(results); for (int i = 0; i < 6; ++i) { @@ -2043,26 +1986,18 @@ class LRUCacheWithStat : public LRUCache { } ~LRUCacheWithStat() {} - Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, - Handle** handle, Priority priority) override { - insert_count_++; - return LRUCache::Insert(key, value, charge, deleter, handle, priority); - } - Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, - size_t charge, Handle** handle = nullptr, + Status Insert(const Slice& key, Cache::ObjectPtr value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, 
Priority priority = Priority::LOW) override { insert_count_++; return LRUCache::Insert(key, value, helper, charge, handle, priority); } - Handle* Lookup(const Slice& key, Statistics* stats) override { - lookup_count_++; - return LRUCache::Lookup(key, stats); - } Handle* Lookup(const Slice& key, const CacheItemHelper* helper, - const CreateCallback& create_cb, Priority priority, bool wait, + CreateContext* create_context, Priority priority, bool wait, Statistics* stats = nullptr) override { lookup_count_++; - return LRUCache::Lookup(key, helper, create_cb, priority, wait, stats); + return LRUCache::Lookup(key, helper, create_context, priority, wait, stats); } uint32_t GetInsertCount() { return insert_count_; } diff --git a/cache/secondary_cache.cc b/cache/secondary_cache.cc new file mode 100644 index 00000000000..eb4972f8f01 --- /dev/null +++ b/cache/secondary_cache.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/secondary_cache.h" + +#include "cache/cache_entry_roles.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +void NoopDelete(Cache::ObjectPtr, MemoryAllocator*) {} + +size_t SliceSize(Cache::ObjectPtr obj) { + return static_cast(obj)->size(); +} + +Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, size_t length, + char* out) { + const Slice& slice = *static_cast(from_obj); + std::memcpy(out, slice.data() + from_offset, length); + return Status::OK(); +} + +Status FailCreate(const Slice&, Cache::CreateContext*, MemoryAllocator*, + Cache::ObjectPtr*, size_t*) { + return Status::NotSupported("Only for dumping data into SecondaryCache"); +} + +} // namespace + +Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) { + static Cache::CacheItemHelper helper{CacheEntryRole::kMisc, &NoopDelete, + &SliceSize, &SliceSaveTo, &FailCreate}; + // NOTE: depends on Insert() being synchronous, not keeping pointer `&saved` + return Insert(key, const_cast(&saved), &helper); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index e3271cc7bd3..65764579fea 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -49,16 +49,12 @@ class CacheShardBase { HashCref GetHash() const; ... }; - Status Insert(const Slice& key, HashCref hash, void* value, size_t charge, - DeleterFn deleter, HandleImpl** handle, - Cache::Priority priority) = 0; - Status Insert(const Slice& key, HashCref hash, void* value, + Status Insert(const Slice& key, HashCref hash, Cache::ObjectPtr value, const Cache::CacheItemHelper* helper, size_t charge, HandleImpl** handle, Cache::Priority priority) = 0; - HandleImpl* Lookup(const Slice& key, HashCref hash) = 0; HandleImpl* Lookup(const Slice& key, HashCref hash, const Cache::CacheItemHelper* helper, - const Cache::CreateCallback& create_cb, + Cache::CreateContext* create_context, Cache::Priority priority, bool wait, Statistics* stats) = 0; bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0; @@ -77,8 +73,9 @@ class CacheShardBase { // *state == 0 and implementation sets *state = SIZE_MAX to indicate // completion. 
void ApplyToSomeEntries( - const std::function& callback, + const std::function& callback, size_t average_entries_per_lock, size_t* state) = 0; void EraseUnRefEntries() = 0; */ @@ -172,36 +169,24 @@ class ShardedCache : public ShardedCacheBase { [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); }); } - Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter, - Handle** handle, Priority priority) override { - HashVal hash = CacheShard::ComputeHash(key); - auto h_out = reinterpret_cast(handle); - return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out, - priority); - } - Status Insert(const Slice& key, void* value, const CacheItemHelper* helper, - size_t charge, Handle** handle = nullptr, + Status Insert(const Slice& key, ObjectPtr value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, Priority priority = Priority::LOW) override { - if (!helper) { - return Status::InvalidArgument(); - } + assert(helper); HashVal hash = CacheShard::ComputeHash(key); auto h_out = reinterpret_cast(handle); return GetShard(hash).Insert(key, hash, value, helper, charge, h_out, priority); } - Handle* Lookup(const Slice& key, Statistics* /*stats*/) override { - HashVal hash = CacheShard::ComputeHash(key); - HandleImpl* result = GetShard(hash).Lookup(key, hash); - return reinterpret_cast(result); - } - Handle* Lookup(const Slice& key, const CacheItemHelper* helper, - const CreateCallback& create_cb, Priority priority, bool wait, + Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr, + CreateContext* create_context = nullptr, + Priority priority = Priority::LOW, bool wait = true, Statistics* stats = nullptr) override { HashVal hash = CacheShard::ComputeHash(key); - HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb, - priority, wait, stats); + HandleImpl* result = GetShard(hash).Lookup( + key, hash, helper, create_context, priority, wait, stats); return reinterpret_cast(result); } @@ -244,8 +229,8 @@ class ShardedCache : public ShardedCacheBase { return SumOverShards2(&CacheShard::GetTableAddressCount); } void ApplyToAllEntries( - const std::function& callback, + const std::function& callback, const ApplyToAllEntriesOptions& opts) override { uint32_t num_shards = GetNumShards(); // Iterate over part of each shard, rotating between shards, to diff --git a/cache/typed_cache.h b/cache/typed_cache.h new file mode 100644 index 00000000000..76c82b4a05d --- /dev/null +++ b/cache/typed_cache.h @@ -0,0 +1,339 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// APIs for accessing Cache in a type-safe and convenient way. Cache is kept +// at a low, thin level of abstraction so that different implementations can +// be plugged in, but these wrappers provide clean, convenient access to the +// most common operations. +// +// A number of template classes are needed for sharing common structure. The +// key classes are these: +// +// * PlaceholderCacheInterface - Used for making cache reservations, with +// entries that have a charge but no value. +// * BasicTypedCacheInterface - Used for primary cache storage of +// objects of type TValue. +// * FullTypedCacheHelper - Used for secondary cache +// compatible storage of objects of type TValue. +// * For each of these, there's a "Shared" version +// (e.g. 
FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, +// rather than assuming external ownership by holding only a raw `Cache*`. + +#pragma once + +#include +#include +#include +#include + +#include "cache/cache_helpers.h" +#include "rocksdb/advanced_options.h" +#include "rocksdb/cache.h" + +namespace ROCKSDB_NAMESPACE { + +// For future consideration: +// * Pass in value to Insert with std::unique_ptr& to simplify ownership +// transfer logic in callers +// * Make key type a template parameter (e.g. useful for table cache) +// * Closer integration with CacheHandleGuard (opt-in, so not always +// paying the extra overhead) + +#define CACHE_TYPE_DEFS() \ + using Priority = Cache::Priority; \ + using Handle = Cache::Handle; \ + using ObjectPtr = Cache::ObjectPtr; \ + using CreateContext = Cache::CreateContext; \ + using CacheItemHelper = Cache::CacheItemHelper /* caller ; */ + +template +class BaseCacheInterface { + public: + CACHE_TYPE_DEFS(); + + /*implicit*/ BaseCacheInterface(CachePtr cache) : cache_(std::move(cache)) {} + + inline void Release(Handle* handle) { cache_->Release(handle); } + + inline void ReleaseAndEraseIfLastRef(Handle* handle) { + cache_->Release(handle, /*erase_if_last_ref*/ true); + } + + inline void RegisterReleaseAsCleanup(Handle* handle, Cleanable& cleanable) { + cleanable.RegisterCleanup(&ReleaseCacheHandleCleanup, get(), handle); + } + + inline Cache* get() const { return &*cache_; } + + explicit inline operator bool() const noexcept { return cache_ != nullptr; } + + protected: + CachePtr cache_; +}; + +// PlaceholderCacheInterface - Used for making cache reservations, with +// entries that have a charge but no value. CacheEntryRole is required as +// a template parameter. +template +class PlaceholderCacheInterface : public BaseCacheInterface { + public: + CACHE_TYPE_DEFS(); + using BaseCacheInterface::BaseCacheInterface; + + inline Status Insert(const Slice& key, size_t charge, Handle** handle) { + return this->cache_->Insert(key, /*value=*/nullptr, &kHelper, charge, + handle); + } + + static constexpr Cache::CacheItemHelper kHelper{kRole}; +}; + +template +using PlaceholderSharedCacheInterface = + PlaceholderCacheInterface>; + +template +class BasicTypedCacheHelperFns { + public: + CACHE_TYPE_DEFS(); + // E.g. char* for char[] + using TValuePtr = std::remove_extent_t*; + + protected: + inline static ObjectPtr UpCastValue(TValuePtr value) { return value; } + inline static TValuePtr DownCastValue(ObjectPtr value) { + return static_cast(value); + } + + static void Delete(ObjectPtr value, MemoryAllocator* allocator) { + // FIXME: Currently, no callers actually allocate the ObjectPtr objects + // using the custom allocator, just subobjects that keep a reference to + // the allocator themselves (with CacheAllocationPtr). + if (/*DISABLED*/ false && allocator) { + if constexpr (std::is_destructible_v) { + DownCastValue(value)->~TValue(); + } + allocator->Deallocate(value); + } else { + // Like delete but properly handles TValue=char[] etc. + std::default_delete{}(DownCastValue(value)); + } + } +}; + +// In its own class to try to minimize the number of distinct CacheItemHelper +// instances (e.g. don't vary by CachePtr) +template +class BasicTypedCacheHelper : public BasicTypedCacheHelperFns { + public: + static constexpr Cache::CacheItemHelper kBasicHelper{ + kRole, &BasicTypedCacheHelper::Delete}; +}; + +// BasicTypedCacheInterface - Used for primary cache storage of objects of +// type TValue, which can be cleaned up with std::default_delete. 
The +// role is provided by TValue::kCacheEntryRole or given in an optional +// template parameter. +template +class BasicTypedCacheInterface : public BaseCacheInterface, + public BasicTypedCacheHelper { + public: + CACHE_TYPE_DEFS(); + using typename BasicTypedCacheHelperFns::TValuePtr; + struct TypedHandle : public Handle {}; + using BasicTypedCacheHelper::kBasicHelper; + // ctor + using BaseCacheInterface::BaseCacheInterface; + + inline Status Insert(const Slice& key, TValuePtr value, size_t charge, + TypedHandle** handle = nullptr, + Priority priority = Priority::LOW) { + auto untyped_handle = reinterpret_cast(handle); + return this->cache_->Insert( + key, BasicTypedCacheHelperFns::UpCastValue(value), + &kBasicHelper, charge, untyped_handle, priority); + } + + inline TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) { + return reinterpret_cast( + this->cache_->BasicLookup(key, stats)); + } + + inline CacheHandleGuard Guard(TypedHandle* handle) { + if (handle) { + return CacheHandleGuard(&*this->cache_, handle); + } else { + return {}; + } + } + + inline std::shared_ptr SharedGuard(TypedHandle* handle) { + if (handle) { + return MakeSharedCacheHandleGuard(&*this->cache_, handle); + } else { + return {}; + } + } + + inline TValuePtr Value(TypedHandle* handle) { + return BasicTypedCacheHelperFns::DownCastValue( + this->cache_->Value(handle)); + } +}; + +// BasicTypedSharedCacheInterface - Like BasicTypedCacheInterface but with a +// shared_ptr for keeping Cache alive. +template +using BasicTypedSharedCacheInterface = + BasicTypedCacheInterface>; + +// TValue must implement ContentSlice() and ~TValue +// TCreateContext must implement Create(std::unique_ptr*, ...) +template +class FullTypedCacheHelperFns : public BasicTypedCacheHelperFns { + public: + CACHE_TYPE_DEFS(); + + protected: + using typename BasicTypedCacheHelperFns::TValuePtr; + using BasicTypedCacheHelperFns::DownCastValue; + using BasicTypedCacheHelperFns::UpCastValue; + + static size_t Size(ObjectPtr v) { + TValuePtr value = DownCastValue(v); + auto slice = value->ContentSlice(); + return slice.size(); + } + + static Status SaveTo(ObjectPtr v, size_t from_offset, size_t length, + char* out) { + TValuePtr value = DownCastValue(v); + auto slice = value->ContentSlice(); + assert(from_offset < slice.size()); + assert(from_offset + length <= slice.size()); + std::copy_n(slice.data() + from_offset, length, out); + return Status::OK(); + } + + static Status Create(const Slice& data, CreateContext* context, + MemoryAllocator* allocator, ObjectPtr* out_obj, + size_t* out_charge) { + std::unique_ptr value = nullptr; + if constexpr (sizeof(TCreateContext) > 0) { + TCreateContext* tcontext = static_cast(context); + tcontext->Create(&value, out_charge, data, allocator); + } else { + TCreateContext::Create(&value, out_charge, data, allocator); + } + *out_obj = UpCastValue(value.release()); + return Status::OK(); + } +}; + +// In its own class to try to minimize the number of distinct CacheItemHelper +// instances (e.g. don't vary by CachePtr) +template +class FullTypedCacheHelper + : public FullTypedCacheHelperFns { + public: + static constexpr Cache::CacheItemHelper kFullHelper{ + kRole, &FullTypedCacheHelper::Delete, &FullTypedCacheHelper::Size, + &FullTypedCacheHelper::SaveTo, &FullTypedCacheHelper::Create}; +}; + +// FullTypedCacheHelper - Used for secondary cache compatible storage of +// objects of type TValue. 
In addition to BasicTypedCacheInterface constraints, +// we require TValue::ContentSlice() to return persistable data. This +// simplifies usage for the normal case of simple secondary cache compatibility +// (can give you a Slice to the data already in memory). In addition to +// TCreateContext performing the role of Cache::CreateContext, it is also +// expected to provide a function Create(std::unique_ptr* value, +// size_t* out_charge, const Slice& data, MemoryAllocator* allocator) for +// creating new TValue. +template +class FullTypedCacheInterface + : public BasicTypedCacheInterface, + public FullTypedCacheHelper { + public: + CACHE_TYPE_DEFS(); + using typename BasicTypedCacheInterface::TypedHandle; + using typename BasicTypedCacheHelperFns::TValuePtr; + using BasicTypedCacheHelper::kBasicHelper; + using FullTypedCacheHelper::kFullHelper; + using BasicTypedCacheHelperFns::UpCastValue; + using BasicTypedCacheHelperFns::DownCastValue; + // ctor + using BasicTypedCacheInterface::BasicTypedCacheInterface; + + // Insert with SecondaryCache compatibility (subject to CacheTier). + // (Basic Insert() also inherited.) + inline Status InsertFull( + const Slice& key, TValuePtr value, size_t charge, + TypedHandle** handle = nullptr, Priority priority = Priority::LOW, + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) { + auto untyped_handle = reinterpret_cast(handle); + auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier + ? &kFullHelper + : &kBasicHelper; + return this->cache_->Insert(key, UpCastValue(value), helper, charge, + untyped_handle, priority); + } + + // Like SecondaryCache::InsertSaved, with SecondaryCache compatibility + // (subject to CacheTier). + inline Status InsertSaved( + const Slice& key, const Slice& data, TCreateContext* create_context, + Priority priority = Priority::LOW, + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier, + size_t* out_charge = nullptr) { + ObjectPtr value; + size_t charge; + Status st = kFullHelper.create_cb(data, create_context, + this->cache_->memory_allocator(), &value, + &charge); + if (out_charge) { + *out_charge = charge; + } + if (st.ok()) { + st = InsertFull(key, DownCastValue(value), charge, nullptr /*handle*/, + priority, lowest_used_cache_tier); + } else { + kFullHelper.del_cb(value, this->cache_->memory_allocator()); + } + return st; + } + + // Lookup with SecondaryCache support (subject to CacheTier). + // (Basic Lookup() also inherited.) + inline TypedHandle* LookupFull( + const Slice& key, TCreateContext* create_context = nullptr, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr, + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) { + if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) { + return reinterpret_cast(this->cache_->Lookup( + key, &kFullHelper, create_context, priority, wait, stats)); + } else { + return BasicTypedCacheInterface::Lookup(key, + stats); + } + } +}; + +// FullTypedSharedCacheInterface - Like FullTypedCacheInterface but with a +// shared_ptr for keeping Cache alive. +template +using FullTypedSharedCacheInterface = + FullTypedCacheInterface>; + +#undef CACHE_TYPE_DEFS + +} // namespace ROCKSDB_NAMESPACE diff --git a/cloud/cloud_file_cache.cc b/cloud/cloud_file_cache.cc deleted file mode 100644 index 598006369f7..00000000000 --- a/cloud/cloud_file_cache.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2021 Rockset. 
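// A usage sketch for the typed_cache.h interfaces introduced above, under
// the constraints the header states: ContentSlice() returning persistable
// data, a kCacheEntryRole member, and a Create() function on the create
// context. FooData and FooCreateContext are hypothetical illustration-only
// names, and the defaulted role and Cache* pointer template arguments are
// assumed:
//
//   struct FooData {
//     std::string payload;
//     Slice ContentSlice() const { return Slice(payload); }
//     static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kMisc;
//   };
//   struct FooCreateContext : public Cache::CreateContext {
//     void Create(std::unique_ptr<FooData>* out, size_t* out_charge,
//                 const Slice& data, MemoryAllocator* /*alloc*/) {
//       out->reset(new FooData{data.ToString()});
//       if (out_charge) {
//         *out_charge = data.size();
//       }
//     }
//   };
//   using FooCache = FullTypedCacheInterface<FooData, FooCreateContext>;
//
//   FooCache cache{raw_cache_ptr};  // implicit ctor from the Cache pointer
//   cache.InsertSaved(key, serialized_slice, &ctx);  // secondary-compatible
//   FooCache::TypedHandle* h = cache.LookupFull(key, &ctx);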
-#ifndef ROCKSDB_LITE - -#include - -#include "cloud/cloud_file_system_impl.h" - -namespace ROCKSDB_NAMESPACE { - -namespace { - -// The Value inside every cached entry -struct Value { - std::string path; - CloudFileSystemImpl* cfs; - - Value(const std::string& _path, CloudFileSystemImpl* _cfs) - : path(_path), cfs(_cfs) {} -}; - -// static method to use as a callback from the cache. -static void DeleteEntry(const Slice& key, void* v) { - Value* value = reinterpret_cast(v); - std::string filename(key.data(), key.size()); - value->cfs->FileCacheDeleter(filename); - delete value; -} - -// These are used to retrieve all the values from the cache. -// Only used for unit tests. -static Value* DecodeValue(void* v) { - return static_cast(reinterpret_cast(v)); -} - -static std::vector> callback_state; -static void callback(void* entry, size_t charge) { - callback_state.push_back({DecodeValue(entry), charge}); -} -static void clear_callback_state() { callback_state.clear(); } -} // namespace - -// -// Touch the file so that is the the most-recent LRU item in cache. -// -void CloudFileSystemImpl::FileCacheAccess(const std::string& fname) { - if (!cloud_fs_options.hasSstFileCache()) { - return; - } - Slice key(fname); - Cache::Handle* handle = cloud_fs_options.sst_file_cache->Lookup(key); - if (handle) { - cloud_fs_options.sst_file_cache->Release(handle); - } - log(InfoLogLevel::DEBUG_LEVEL, fname, "access"); -} - -// -// Record the file into the cache. -// -void CloudFileSystemImpl::FileCacheInsert(const std::string& fname, - uint64_t filesize) { - if (!cloud_fs_options.hasSstFileCache()) { - return; - } - - // insert into cache, key is the file path. - Slice key(fname); - cloud_fs_options.sst_file_cache->Insert(key, new Value(fname, this), filesize, - DeleteEntry); - log(InfoLogLevel::INFO_LEVEL, fname, "insert"); -} - -// -// Remove a specific entry from the cache. -// -void CloudFileSystemImpl::FileCacheErase(const std::string& fname) { - // We erase from the cache even if the cache size is zero. This is needed - // to protect against the when the cache size was dynamically reduced to zero - // on a running database. - if (!cloud_fs_options.sst_file_cache) { - return; - } - - Slice key(fname); - cloud_fs_options.sst_file_cache->Erase(key); - log(InfoLogLevel::INFO_LEVEL, fname, "erased"); -} - -// -// When the cache is full, delete files from local store -// -void CloudFileSystemImpl::FileCacheDeleter(const std::string& fname) { - base_fs_->DeleteFile(fname, IOOptions(), nullptr /*dbg*/); - log(InfoLogLevel::INFO_LEVEL, fname, "purged"); -} - -// -// Get total charge in the cache. -// This is not thread-safe and is used only for unit tests. -// -uint64_t CloudFileSystemImpl::FileCacheGetCharge() { - clear_callback_state(); - cloud_fs_options.sst_file_cache->ApplyToAllCacheEntries(callback, true); - uint64_t total = 0; - for (auto& it : callback_state) { - total += it.second; - } - return total; -} - -// -// Get total number of items in the cache. -// This is not thread-safe and is used only for unit tests. -// -uint64_t CloudFileSystemImpl::FileCacheGetNumItems() { - clear_callback_state(); - cloud_fs_options.sst_file_cache->ApplyToAllCacheEntries(callback, true); - return callback_state.size(); -} - -// Removes all items for the env from the cache. -// This is not thread-safe. -void CloudFileSystemImpl::FileCachePurge() { - // We erase from the cache even if the cache size is zero. 
This is needed - // to protect against the when the cache size was dynamically reduced to zero - // on a running database. - if (!cloud_fs_options.sst_file_cache) { - return; - } - // fetch all items from cache - clear_callback_state(); - cloud_fs_options.sst_file_cache->ApplyToAllCacheEntries(callback, true); - // for all those items that have a matching cfs, remove them from cache. - for (auto& it : callback_state) { - Value* value = it.first; - if (value->cfs == this) { - Slice key(value->path); - cloud_fs_options.sst_file_cache->Erase(key); - } - } - log(InfoLogLevel::INFO_LEVEL, "ENV-DELETE", "purged"); -} - -void CloudFileSystemImpl::log(InfoLogLevel level, const std::string& fname, - const std::string& msg) { - uint64_t usage = cloud_fs_options.sst_file_cache->GetUsage(); - uint64_t capacity = cloud_fs_options.sst_file_cache->GetCapacity(); - long percent = (capacity > 0 ? (100L * usage / capacity) : 0); - Log(level, info_log_, - "[%s] FileCache %s %s cache-used %" PRIu64 "/%" PRIu64 "(%ld%%) bytes", - Name(), fname.c_str(), msg.c_str(), usage, capacity, percent); -} - -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/cloud/cloud_file_system.cc b/cloud/cloud_file_system.cc index a7219afd105..6c859532168 100644 --- a/cloud/cloud_file_system.cc +++ b/cloud/cloud_file_system.cc @@ -84,10 +84,6 @@ void CloudFileSystemOptions::Dump(Logger* log) const { Header(log, " COptions.cloud_file_deletion_delay: %ld", cloud_file_deletion_delay->count()); } - if (sst_file_cache != nullptr) { - Header(log, " COptions.sst_file_cache size: %ld bytes", - sst_file_cache->GetCapacity()); - } } bool CloudFileSystemOptions::GetNameFromEnvironment(const char* name, diff --git a/cloud/cloud_file_system_impl.cc b/cloud/cloud_file_system_impl.cc index c2730dcb3a2..87369f7d62a 100644 --- a/cloud/cloud_file_system_impl.cc +++ b/cloud/cloud_file_system_impl.cc @@ -40,9 +40,6 @@ CloudFileSystemImpl::CloudFileSystemImpl( } CloudFileSystemImpl::~CloudFileSystemImpl() { - // remove items from the file cache - FileCachePurge(); - if (cloud_fs_options.cloud_log_controller) { cloud_fs_options.cloud_log_controller->StopTailingStream(); } @@ -186,8 +183,6 @@ IOStatus CloudFileSystemImpl::NewSequentialFile( result->reset(file.release()); } } - // Do not update the sst_file_cache for sequential read patterns. - // These are mostly used by compaction. Log(InfoLogLevel::DEBUG_LEVEL, info_log_, "[%s] NewSequentialFile file %s %s", Name(), fname.c_str(), st.ToString().c_str()); @@ -239,23 +234,10 @@ IOStatus CloudFileSystemImpl::NewRandomAccessFile( const IOOptions io_opts; if (sstfile || manifest || identity) { - if (cloud_fs_options.keep_local_sst_files || - cloud_fs_options.hasSstFileCache() || !sstfile) { + if (cloud_fs_options.keep_local_sst_files || !sstfile) { // Read from local storage and then from cloud storage. st = base_fs_->NewRandomAccessFile(fname, file_opts, result, dbg); - // Found in local storage. Update LRU cache. - // There is a loose coupling between the sst_file_cache and the files on - // local storage. The sst_file_cache is only used for accounting of sst - // files. We do not keep a reference to the LRU cache handle when the sst - // file remains open by the db. If the LRU policy causes the file to be - // evicted, it will be deleted from local storage, but because the db - // already has an open file handle to it, it can continue to occupy local - // storage space until the time the db decides to close the sst file. 
- if (sstfile && st.ok()) { - FileCacheAccess(fname); - } - if (!st.ok() && !base_fs_->FileExists(fname, io_opts, dbg).IsNotFound()) { // if status is not OK, but file does exist locally, something is wrong return st; @@ -268,14 +250,6 @@ IOStatus CloudFileSystemImpl::NewRandomAccessFile( // we successfully copied the file, try opening it locally now st = base_fs_->NewRandomAccessFile(fname, file_opts, result, dbg); } - // Update the size of our local sst file cache - if (st.ok() && sstfile && cloud_fs_options.hasSstFileCache()) { - uint64_t local_size; - auto statx = base_fs_->GetFileSize(fname, io_opts, &local_size, dbg); - if (statx.ok()) { - FileCacheInsert(fname, local_size); - } - } } // If we are being paranoic, then we validate that our file size is // the same as in cloud storage. @@ -756,11 +730,6 @@ IOStatus CloudFileSystemImpl::DeleteFile(const std::string& logical_fname, // delete from local, too. Ignore the result, though. The file might not be // there locally. base_fs_->DeleteFile(fname, io_opts, dbg); - - // remove from sst_file_cache - if (sstfile) { - FileCacheErase(fname); - } } else if (logfile && !cloud_fs_options.keep_local_log_files) { // read from Log Controller st = status_to_io_status( @@ -1106,12 +1075,6 @@ Status CloudFileSystemImpl::CheckOption(const FileOptions& file_opts) { std::string msg = "Mmap only if keep_local_sst_files is set"; return Status::InvalidArgument(msg); } - if (cloud_fs_options.hasSstFileCache() && - cloud_fs_options.keep_local_sst_files) { - std::string msg = - "Only one of sst_file_cache or keep_local_sst_files can be set"; - return Status::InvalidArgument(msg); - } return Status::OK(); } @@ -1659,13 +1622,6 @@ IOStatus CloudFileSystemImpl::SanitizeDirectory(const DBOptions& options, local_fs->CreateDirIfMissing(local_name, io_opts, dbg); } - if (cloud_fs_options.hasSstFileCache() && - cloud_fs_options.keep_local_sst_files) { - std::string msg = - "Only one of sst_file_cache or keep_local_sst_files can be set"; - return IOStatus::InvalidArgument(msg); - } - // Shall we reinitialize the clone dir? bool do_reinit = true; auto st = NeedsReinitialization(local_name, &do_reinit); diff --git a/cloud/cloud_file_system_impl.h b/cloud/cloud_file_system_impl.h index fc30603937b..0dffdfeb196 100644 --- a/cloud/cloud_file_system_impl.h +++ b/cloud/cloud_file_system_impl.h @@ -243,12 +243,6 @@ class CloudFileSystemImpl : public CloudFileSystem { Status ValidateOptions(const DBOptions& /*db_opts*/, const ColumnFamilyOptions& /*cf_opts*/) const override; - void FileCacheDeleter(const std::string& fname); - void FileCacheErase(const std::string& fname); - void FileCachePurge(); - uint64_t FileCacheGetCharge(); - uint64_t FileCacheGetNumItems(); - std::string CloudManifestFile(const std::string& dbname); // Apply cloud manifest delta to in-memory cloud manifest. 
Does not change the @@ -349,10 +343,6 @@ class CloudFileSystemImpl : public CloudFileSystem { IOStatus RollNewEpoch(const std::string& local_dbname); - // helper methods to access the file cache - void FileCacheAccess(const std::string& fname); - void FileCacheInsert(const std::string& fname, uint64_t filesize); - // The dbid of the source database that is cloned std::string src_dbid_; diff --git a/cloud/db_cloud_test.cc b/cloud/db_cloud_test.cc index 2a8cba7d42f..f04d3216aab 100644 --- a/cloud/db_cloud_test.cc +++ b/cloud/db_cloud_test.cc @@ -1875,125 +1875,6 @@ TEST_F(CloudTest, SharedBlockCache) { cloud_fs_options_.src_bucket.GetObjectPath() + "-clone"); } -// Verify that sst_file_cache and file_cache cannot be set together -TEST_F(CloudTest, KeepLocalFilesAndFileCache) { - cloud_fs_options_.sst_file_cache = NewLRUCache(1024); // 1 KB cache - cloud_fs_options_.keep_local_sst_files = true; - ASSERT_TRUE(checkOpen().IsInvalidArgument()); -} - -// Verify that sst_file_cache can be disabled -TEST_F(CloudTest, FileCacheZero) { - cloud_fs_options_.sst_file_cache = NewLRUCache(0); // zero size - OpenDB(); - auto* cimpl = GetCloudFileSystemImpl(); - ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_OK(db_->Put(WriteOptions(), "c", "d")); - ASSERT_OK(db_->Flush(FlushOptions())); - auto local_files = GetSSTFiles(dbname_); - EXPECT_EQ(local_files.size(), 0); - EXPECT_EQ(cimpl->FileCacheGetCharge(), 0); - - std::string value; - ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); - ASSERT_TRUE(value.compare("b") == 0); - ASSERT_OK(db_->Get(ReadOptions(), "c", &value)); - ASSERT_TRUE(value.compare("d") == 0); - CloseDB(); -} - -// Verify that sst_file_cache is very small, so no files are local. -TEST_F(CloudTest, FileCacheSmall) { - cloud_fs_options_.sst_file_cache = NewLRUCache(10); // Practically zero size - OpenDB(); - auto* cimpl = GetCloudFileSystemImpl(); - ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_OK(db_->Put(WriteOptions(), "c", "d")); - ASSERT_OK(db_->Flush(FlushOptions())); - auto local_files = GetSSTFiles(dbname_); - EXPECT_EQ(local_files.size(), 0); - EXPECT_EQ(cimpl->FileCacheGetCharge(), 0); - CloseDB(); -} - -// Relatively large sst_file cache, so all files are local. -TEST_F(CloudTest, FileCacheLarge) { - size_t capacity = 10240L; - std::shared_ptr cache = NewLRUCache(capacity); - cloud_fs_options_.sst_file_cache = cache; - - // generate two sst files. - OpenDB(); - auto* cimpl = GetCloudFileSystemImpl(); - ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_OK(db_->Put(WriteOptions(), "c", "d")); - ASSERT_OK(db_->Flush(FlushOptions())); - - // check that local sst files exist - auto local_files = GetSSTFiles(dbname_); - EXPECT_EQ(local_files.size(), 2); - - // check that local sst files have non zero size - uint64_t totalFileSize = 0; - GetSSTFilesTotalSize(dbname_, &totalFileSize); - EXPECT_GT(totalFileSize, 0); - EXPECT_GE(capacity, totalFileSize); - - // check that cache has two entries - EXPECT_EQ(cimpl->FileCacheGetNumItems(), 2); - - // check that cache charge matches total local sst file size - EXPECT_EQ(cimpl->FileCacheGetNumItems(), 2); - EXPECT_EQ(cimpl->FileCacheGetCharge(), totalFileSize); - CloseDB(); -} - -// Cache will have a few files only. 
-TEST_F(CloudTest, FileCacheOnDemand) { - size_t capacity = 3000; - int num_shard_bits = 0; // 1 shard - bool strict_capacity_limit = false; - double high_pri_pool_ratio = 0; - - std::shared_ptr cache = - NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, nullptr, kDefaultToAdaptiveMutex, - CacheMetadataChargePolicy::kDontChargeCacheMetadata); - cloud_fs_options_.sst_file_cache = cache; - options_.level0_file_num_compaction_trigger = 100; // never compact - - OpenDB(); - auto* cimpl = GetCloudFileSystemImpl(); - - // generate four sst files, each of size about 884 bytes - ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_OK(db_->Put(WriteOptions(), "c", "d")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_OK(db_->Put(WriteOptions(), "e", "f")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_OK(db_->Put(WriteOptions(), "g", "h")); - ASSERT_OK(db_->Flush(FlushOptions())); - - // The db should have 4 sst files in the manifest. - std::vector flist; - db_->GetLiveFilesMetaData(&flist); - EXPECT_EQ(flist.size(), 4); - - // verify that there are only two entries in the cache - EXPECT_EQ(cimpl->FileCacheGetNumItems(), 2); - EXPECT_EQ(cimpl->FileCacheGetCharge(), cache->GetUsage()); - - // There should be only two local sst files. - auto local_files = GetSSTFiles(dbname_); - EXPECT_EQ(local_files.size(), 2); - - CloseDB(); -} - TEST_F(CloudTest, FindLiveFilesFetchManifestTest) { OpenDB(); ASSERT_OK(db_->Put({}, "a", "1")); diff --git a/crash_test.mk b/crash_test.mk index b7b00a65b2b..5e8b3573a22 100644 --- a/crash_test.mk +++ b/crash_test.mk @@ -8,7 +8,7 @@ DB_STRESS_CMD?=./db_stress include common.mk CRASHTEST_MAKE=$(MAKE) -f crash_test.mk -CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) +CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' .PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \ crash_test_with_best_efforts_recovery crash_test_with_ts \ diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index d3161c22824..607403ccc32 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/arena_wrapped_db_iter.h" + #include "memory/arena.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index 3275e42df20..f15be306d22 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -9,7 +9,9 @@ #pragma once #include + #include + #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/range_del_aggregator.h" diff --git a/db/blob/blob_contents.cc b/db/blob/blob_contents.cc index 9015609e7fe..8b793c5d264 100644 --- a/db/blob/blob_contents.cc +++ b/db/blob/blob_contents.cc @@ -13,12 +13,6 @@ namespace ROCKSDB_NAMESPACE { -std::unique_ptr BlobContents::Create( - CacheAllocationPtr&& allocation, size_t size) { - return std::unique_ptr( - new BlobContents(std::move(allocation), size)); -} - size_t BlobContents::ApproximateMemoryUsage() const { size_t usage = 0; @@ -45,46 +39,4 @@ size_t BlobContents::ApproximateMemoryUsage() const { return usage; } -size_t BlobContents::SizeCallback(void* obj) { - assert(obj); - - return static_cast(obj)->size(); -} - -Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out) { - assert(from_obj); - - const BlobContents* buf = static_cast(from_obj); - assert(buf->size() >= from_offset + length); - - memcpy(out, buf->data().data() + from_offset, length); - - return Status::OK(); -} - -Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() { - static Cache::CacheItemHelper cache_helper( - &SizeCallback, &SaveToCallback, - GetCacheEntryDeleterForRole()); - - return &cache_helper; -} - -Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation, - const void* buf, size_t size, - void** out_obj, size_t* charge) { - assert(allocation); - - memcpy(allocation.get(), buf, size); - - std::unique_ptr obj = Create(std::move(allocation), size); - BlobContents* const contents = obj.release(); - - *out_obj = contents; - *charge = contents->ApproximateMemoryUsage(); - - return Status::OK(); -} - } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_contents.h b/db/blob/blob_contents.h index 9b7c5b96963..18ed27c6925 100644 --- a/db/blob/blob_contents.h +++ b/db/blob/blob_contents.h @@ -18,8 +18,8 @@ namespace ROCKSDB_NAMESPACE { // A class representing a single uncompressed value read from a blob file. 
class BlobContents { public: - static std::unique_ptr Create(CacheAllocationPtr&& allocation, - size_t size); + BlobContents(CacheAllocationPtr&& allocation, size_t size) + : allocation_(std::move(allocation)), data_(allocation_.get(), size) {} BlobContents(const BlobContents&) = delete; BlobContents& operator=(const BlobContents&) = delete; @@ -34,23 +34,26 @@ class BlobContents { size_t ApproximateMemoryUsage() const; - // Callbacks for secondary cache - static size_t SizeCallback(void* obj); - - static Status SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out); - - static Cache::CacheItemHelper* GetCacheItemHelper(); - - static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf, - size_t size, void** out_obj, size_t* charge); + // For TypedCacheInterface + const Slice& ContentSlice() const { return data_; } + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kBlobValue; private: - BlobContents(CacheAllocationPtr&& allocation, size_t size) - : allocation_(std::move(allocation)), data_(allocation_.get(), size) {} - CacheAllocationPtr allocation_; Slice data_; }; +class BlobContentsCreator : public Cache::CreateContext { + public: + static void Create(std::unique_ptr* out, size_t* out_charge, + const Slice& contents, MemoryAllocator* alloc) { + auto raw = new BlobContents(AllocateAndCopyBlock(contents, alloc), + contents.size()); + out->reset(raw); + if (out_charge) { + *out_charge = raw->ApproximateMemoryUsage(); + } + } +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 5e0e7f6cb4a..952a5676bff 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -13,6 +13,7 @@ #include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/blob/blob_log_writer.h" +#include "db/blob/blob_source.h" #include "db/event_helpers.h" #include "db/version_set.h" #include "file/filename.h" @@ -393,7 +394,7 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_offset) const { Status s = Status::OK(); - auto blob_cache = immutable_options_->blob_cache; + BlobSource::SharedCacheInterface blob_cache{immutable_options_->blob_cache}; auto statistics = immutable_options_->statistics.get(); bool warm_cache = prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly && @@ -407,34 +408,12 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob, const Cache::Priority priority = Cache::Priority::BOTTOM; - // Objects to be put into the cache have to be heap-allocated and - // self-contained, i.e. own their contents. The Cache has to be able to - // take unique ownership of them. 
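// (The hand-rolled sequence deleted below, allocate a block, copy the blob,
// wrap it in BlobContents, then pick an Insert() variant by cache tier, is
// what the typed interface's InsertSaved() call later in this hunk performs
// in a single step, with BlobContentsCreator::Create doing the allocation
// and copy.)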
- CacheAllocationPtr allocation = - AllocateBlock(blob.size(), blob_cache->memory_allocator()); - memcpy(allocation.get(), blob.data(), blob.size()); - std::unique_ptr buf = - BlobContents::Create(std::move(allocation), blob.size()); - - Cache::CacheItemHelper* const cache_item_helper = - BlobContents::GetCacheItemHelper(); - assert(cache_item_helper); - - if (immutable_options_->lowest_used_cache_tier == - CacheTier::kNonVolatileBlockTier) { - s = blob_cache->Insert(key, buf.get(), cache_item_helper, - buf->ApproximateMemoryUsage(), - nullptr /* cache_handle */, priority); - } else { - s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(), - cache_item_helper->del_cb, - nullptr /* cache_handle */, priority); - } + s = blob_cache.InsertSaved(key, blob, nullptr /*context*/, priority, + immutable_options_->lowest_used_cache_tier); if (s.ok()) { RecordTick(statistics, BLOB_DB_CACHE_ADD); - RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size()); - buf.release(); + RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, blob.size()); } else { RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES); } diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc index 1a6cdf6880c..19757946d6d 100644 --- a/db/blob/blob_file_cache.cc +++ b/db/blob/blob_file_cache.cc @@ -42,13 +42,13 @@ Status BlobFileCache::GetBlobFileReader( assert(blob_file_reader); assert(blob_file_reader->IsEmpty()); - const Slice key = GetSlice(&blob_file_number); + const Slice key = GetSliceForKey(&blob_file_number); assert(cache_); - Cache::Handle* handle = cache_->Lookup(key); + TypedHandle* handle = cache_.Lookup(key); if (handle) { - *blob_file_reader = CacheHandleGuard(cache_, handle); + *blob_file_reader = cache_.Guard(handle); return Status::OK(); } @@ -57,9 +57,9 @@ Status BlobFileCache::GetBlobFileReader( // Check again while holding mutex MutexLock lock(mutex_.get(key)); - handle = cache_->Lookup(key); + handle = cache_.Lookup(key); if (handle) { - *blob_file_reader = CacheHandleGuard(cache_, handle); + *blob_file_reader = cache_.Guard(handle); return Status::OK(); } @@ -84,8 +84,7 @@ Status BlobFileCache::GetBlobFileReader( { constexpr size_t charge = 1; - const Status s = cache_->Insert(key, reader.get(), charge, - &DeleteCacheEntry, &handle); + const Status s = cache_.Insert(key, reader.get(), charge, &handle); if (!s.ok()) { RecordTick(statistics, NO_FILE_ERRORS); return s; @@ -94,7 +93,7 @@ Status BlobFileCache::GetBlobFileReader( reader.release(); - *blob_file_reader = CacheHandleGuard(cache_, handle); + *blob_file_reader = cache_.Guard(handle); return Status::OK(); } diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h index 8eec05f184e..6281897d601 100644 --- a/db/blob/blob_file_cache.h +++ b/db/blob/blob_file_cache.h @@ -7,7 +7,8 @@ #include -#include "cache/cache_helpers.h" +#include "cache/typed_cache.h" +#include "db/blob/blob_file_reader.h" #include "rocksdb/rocksdb_namespace.h" #include "util/mutexlock.h" @@ -18,7 +19,6 @@ struct ImmutableOptions; struct FileOptions; class HistogramImpl; class Status; -class BlobFileReader; class Slice; class IOTracer; @@ -36,7 +36,10 @@ class BlobFileCache { CacheHandleGuard* blob_file_reader); private: - Cache* cache_; + using CacheInterface = + BasicTypedCacheInterface; + using TypedHandle = CacheInterface::TypedHandle; + CacheInterface cache_; // Note: mutex_ below is used to guard against multiple threads racing to open // the same file. 
Striped mutex_; diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index a4eabb60599..da7f2bb12eb 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -569,12 +569,7 @@ Status BlobFileReader::UncompressBlobIfNeeded( assert(result); if (compression_type == kNoCompression) { - CacheAllocationPtr allocation = - AllocateBlock(value_slice.size(), allocator); - memcpy(allocation.get(), value_slice.data(), value_slice.size()); - - *result = BlobContents::Create(std::move(allocation), value_slice.size()); - + BlobContentsCreator::Create(result, nullptr, value_slice, allocator); return Status::OK(); } @@ -602,7 +597,7 @@ Status BlobFileReader::UncompressBlobIfNeeded( return Status::Corruption("Unable to uncompress blob"); } - *result = BlobContents::Create(std::move(output), uncompressed_size); + result->reset(new BlobContents(std::move(output), uncompressed_size)); return Status::OK(); } diff --git a/db/blob/blob_log_format.h b/db/blob/blob_log_format.h index 4caa5a5f86e..607db23678a 100644 --- a/db/blob/blob_log_format.h +++ b/db/blob/blob_log_format.h @@ -22,6 +22,8 @@ constexpr uint32_t kVersion1 = 1; using ExpirationRange = std::pair; +// clang-format off + // Format of blob log file header (30 bytes): // // +--------------+---------+---------+-------+-------------+-------------------+ @@ -35,6 +37,9 @@ using ExpirationRange = std::pair; // // Expiration range in the header is a rough range based on // blob_db_options.ttl_range_secs. + +// clang-format on + struct BlobLogHeader { static constexpr size_t kSize = 30; @@ -57,6 +62,8 @@ struct BlobLogHeader { Status DecodeFrom(Slice slice); }; +// clang-format off + // Format of blob log file footer (32 bytes): // // +--------------+------------+-------------------+------------+ @@ -69,6 +76,9 @@ struct BlobLogHeader { // // Unlike the same field in file header, expiration range in the footer is the // range of smallest and largest expiration of the data in this file. + +// clang-format on + struct BlobLogFooter { static constexpr size_t kSize = 32; @@ -81,6 +91,8 @@ struct BlobLogFooter { Status DecodeFrom(Slice slice); }; +// clang-format off + // Blob record format (32 bytes header + key + value): // // +------------+--------------+------------+------------+----------+---------+-----------+ @@ -100,6 +112,9 @@ struct BlobLogFooter { // // We could use variable length encoding (Varint64) to save more space, but it // make reader more complicated. 
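Note: the format tables above are being fenced off with clang-format on/off markers so the formatter cannot mangle their ASCII layout. As a sanity check on the byte counts they document (header 30, footer 32, record header 32), here is the field arithmetic spelled out; field names and widths follow the format comments in blob_log_format.h (Fixed32 = 4 bytes, Fixed64 = 8, flags/compression are single chars):

#include <cstddef>

constexpr std::size_t kFixed32 = 4;
constexpr std::size_t kFixed64 = 8;
constexpr std::size_t kChar = 1;

// Header: magic number + version + cf id + flags + compression
//         + expiration range (two Fixed64 bounds).
static_assert(3 * kFixed32 + 2 * kChar + 2 * kFixed64 == 30,
              "BlobLogHeader::kSize");

// Footer: magic number + blob count + expiration range + footer CRC.
static_assert(kFixed32 + kFixed64 + 2 * kFixed64 + kFixed32 == 32,
              "BlobLogFooter::kSize");

// Record header: key length + value length + expiration + header CRC
//                + blob CRC; the key and value bytes follow it.
static_assert(3 * kFixed64 + 2 * kFixed32 == 32,
              "BlobLogRecord::kHeaderSize");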
+ +// clang-format on + struct BlobLogRecord { // header include fields up to blob CRC static constexpr size_t kHeaderSize = 32; diff --git a/db/blob/blob_source.cc b/db/blob/blob_source.cc index bfade2507ee..19cfb1f89a1 100644 --- a/db/blob/blob_source.cc +++ b/db/blob/blob_source.cc @@ -36,8 +36,8 @@ BlobSource::BlobSource(const ImmutableOptions* immutable_options, if (bbto && bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache) .charged == CacheEntryRoleOptions::Decision::kEnabled) { - blob_cache_ = std::make_shared(immutable_options->blob_cache, - bbto->block_cache); + blob_cache_ = SharedCacheInterface{std::make_shared( + immutable_options->blob_cache, bbto->block_cache)}; } #endif // ROCKSDB_LITE } @@ -82,9 +82,8 @@ Status BlobSource::PutBlobIntoCache( assert(cached_blob); assert(cached_blob->IsEmpty()); - Cache::Handle* cache_handle = nullptr; + TypedHandle* cache_handle = nullptr; const Status s = InsertEntryIntoCache(cache_key, blob->get(), - (*blob)->ApproximateMemoryUsage(), &cache_handle, Cache::Priority::BOTTOM); if (s.ok()) { blob->release(); @@ -106,26 +105,10 @@ Status BlobSource::PutBlobIntoCache( return s; } -Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const { - Cache::Handle* cache_handle = nullptr; - - if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) { - Cache::CreateCallback create_cb = - [allocator = blob_cache_->memory_allocator()]( - const void* buf, size_t size, void** out_obj, - size_t* charge) -> Status { - return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf, - size, out_obj, charge); - }; - - cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(), - create_cb, Cache::Priority::BOTTOM, - true /* wait_for_cache */, statistics_); - } else { - cache_handle = blob_cache_->Lookup(key, statistics_); - } - - return cache_handle; +BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const { + return blob_cache_.LookupFull( + key, nullptr /* context */, Cache::Priority::BOTTOM, + true /* wait_for_cache */, statistics_, lowest_used_cache_tier_); } void BlobSource::PinCachedBlob(CacheHandleGuard* cached_blob, @@ -166,24 +149,11 @@ void BlobSource::PinOwnedBlob(std::unique_ptr* owned_blob, } Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value, - size_t charge, - Cache::Handle** cache_handle, + TypedHandle** cache_handle, Cache::Priority priority) const { - Status s; - - Cache::CacheItemHelper* const cache_item_helper = - BlobContents::GetCacheItemHelper(); - assert(cache_item_helper); - - if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) { - s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle, - priority); - } else { - s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb, - cache_handle, priority); - } - - return s; + return blob_cache_.InsertFull(key, value, value->ApproximateMemoryUsage(), + cache_handle, priority, + lowest_used_cache_tier_); } Status BlobSource::GetBlob(const ReadOptions& read_options, @@ -252,9 +222,10 @@ Status BlobSource::GetBlob(const ReadOptions& read_options, return Status::Corruption("Compression type mismatch when reading blob"); } - MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache) - ? blob_cache_->memory_allocator() - : nullptr; + MemoryAllocator* const allocator = + (blob_cache_ && read_options.fill_cache) + ? 
blob_cache_.get()->memory_allocator() + : nullptr; uint64_t read_size = 0; s = blob_file_reader.GetValue()->GetBlob( @@ -418,9 +389,10 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, assert(blob_file_reader.GetValue()); - MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache) - ? blob_cache_->memory_allocator() - : nullptr; + MemoryAllocator* const allocator = + (blob_cache_ && read_options.fill_cache) + ? blob_cache_.get()->memory_allocator() + : nullptr; blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator, _blob_reqs, &_bytes_read); diff --git a/db/blob/blob_source.h b/db/blob/blob_source.h index 2ed296eeb46..cdc21874784 100644 --- a/db/blob/blob_source.h +++ b/db/blob/blob_source.h @@ -8,8 +8,9 @@ #include #include -#include "cache/cache_helpers.h" #include "cache/cache_key.h" +#include "cache/typed_cache.h" +#include "db/blob/blob_contents.h" #include "db/blob/blob_file_cache.h" #include "db/blob/blob_read_request.h" #include "rocksdb/cache.h" @@ -23,7 +24,6 @@ struct ImmutableOptions; class Status; class FilePrefetchBuffer; class Slice; -class BlobContents; // BlobSource is a class that provides universal access to blobs, regardless of // whether they are in the blob cache, secondary cache, or (remote) storage. @@ -106,6 +106,14 @@ class BlobSource { bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size, uint64_t offset, size_t* charge = nullptr) const; + // For TypedSharedCacheInterface + void Create(BlobContents** out, const char* buf, size_t size, + MemoryAllocator* alloc); + + using SharedCacheInterface = + FullTypedSharedCacheInterface; + using TypedHandle = SharedCacheInterface::TypedHandle; + private: Status GetBlobFromCache(const Slice& cache_key, CacheHandleGuard* cached_blob) const; @@ -120,10 +128,10 @@ class BlobSource { static void PinOwnedBlob(std::unique_ptr* owned_blob, PinnableSlice* value); - Cache::Handle* GetEntryFromCache(const Slice& key) const; + TypedHandle* GetEntryFromCache(const Slice& key) const; Status InsertEntryIntoCache(const Slice& key, BlobContents* value, - size_t charge, Cache::Handle** cache_handle, + TypedHandle** cache_handle, Cache::Priority priority) const; inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/, @@ -141,7 +149,7 @@ class BlobSource { BlobFileCache* blob_file_cache_; // A cache to store uncompressed blobs. - std::shared_ptr blob_cache_; + mutable SharedCacheInterface blob_cache_; // The control option of how the cache tiers will be used. 
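Note: the GetEntryFromCache/InsertEntryIntoCache rewrites above show the main payoff of FullTypedSharedCacheInterface: the branch on lowest_used_cache_tier, previously copy-pasted at each call site (BlobFileBuilder::PutBlobIntoCacheIfNeeded carried the same one), now lives behind LookupFull/InsertFull. Schematically, with simplified stand-in types rather than the real typed_cache.h declarations:

#include <cstddef>
#include <string>

// Stand-ins, assumed for illustration only.
enum class CacheTier { kVolatileTier, kNonVolatileBlockTier };
enum class Priority { BOTTOM };
struct Status {
  static Status OK() { return Status(); }
};

template <typename T>
class FullTypedCache {
 public:
  struct TypedHandle {};

  // One insert entry point; the tier decides whether the entry is wired
  // up for secondary-cache demotion or inserted volatile-only.
  Status InsertFull(const std::string& key, T* value, std::size_t charge,
                    TypedHandle** handle, Priority priority,
                    CacheTier lowest_used_tier) {
    if (lowest_used_tier == CacheTier::kNonVolatileBlockTier) {
      return InsertWithFullHelper(key, value, charge, handle, priority);
    }
    return InsertWithDeleterOnly(key, value, charge, handle, priority);
  }

 private:
  // In the real interface these differ in the CacheItemHelper passed to
  // the underlying cache: the full helper carries size/save/create
  // callbacks, the basic one only a deleter.
  Status InsertWithFullHelper(const std::string&, T*, std::size_t,
                              TypedHandle**, Priority) {
    return Status::OK();
  }
  Status InsertWithDeleterOnly(const std::string&, T*, std::size_t,
                               TypedHandle**, Priority) {
    return Status::OK();
  }
};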
Currently rocksdb // support block/blob cache (volatile tier) and secondary cache (this tier diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index a85ed86462d..4a1ba84eeab 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -1150,15 +1150,6 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { auto blob_cache = options_.blob_cache; auto secondary_cache = lru_cache_opts_.secondary_cache; - Cache::CreateCallback create_cb = [](const void* buf, size_t size, - void** out_obj, - size_t* charge) -> Status { - CacheAllocationPtr allocation(new char[size]); - - return BlobContents::CreateCallback(std::move(allocation), buf, size, - out_obj, charge); - }; - { // GetBlob std::vector values(keys.size()); @@ -1219,14 +1210,15 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { { CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]); const Slice key0 = cache_key.AsSlice(); - auto handle0 = blob_cache->Lookup(key0, statistics); + auto handle0 = blob_cache->BasicLookup(key0, statistics); ASSERT_EQ(handle0, nullptr); // key0's item should be in the secondary cache. bool is_in_sec_cache = false; - auto sec_handle0 = - secondary_cache->Lookup(key0, create_cb, true, - /*advise_erase=*/true, is_in_sec_cache); + auto sec_handle0 = secondary_cache->Lookup( + key0, &BlobSource::SharedCacheInterface::kFullHelper, + /*context*/ nullptr, true, + /*advise_erase=*/true, is_in_sec_cache); ASSERT_FALSE(is_in_sec_cache); ASSERT_NE(sec_handle0, nullptr); ASSERT_TRUE(sec_handle0->IsReady()); @@ -1246,14 +1238,15 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { { CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]); const Slice key1 = cache_key.AsSlice(); - auto handle1 = blob_cache->Lookup(key1, statistics); + auto handle1 = blob_cache->BasicLookup(key1, statistics); ASSERT_NE(handle1, nullptr); blob_cache->Release(handle1); bool is_in_sec_cache = false; - auto sec_handle1 = - secondary_cache->Lookup(key1, create_cb, true, - /*advise_erase=*/true, is_in_sec_cache); + auto sec_handle1 = secondary_cache->Lookup( + key1, &BlobSource::SharedCacheInterface::kFullHelper, + /*context*/ nullptr, true, + /*advise_erase=*/true, is_in_sec_cache); ASSERT_FALSE(is_in_sec_cache); ASSERT_EQ(sec_handle1, nullptr); @@ -1276,7 +1269,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { // key0 should be in the primary cache. CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]); const Slice key0 = cache_key0.AsSlice(); - auto handle0 = blob_cache->Lookup(key0, statistics); + auto handle0 = blob_cache->BasicLookup(key0, statistics); ASSERT_NE(handle0, nullptr); auto value = static_cast(blob_cache->Value(handle0)); ASSERT_NE(value, nullptr); @@ -1286,12 +1279,12 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { // key1 is not in the primary cache and is in the secondary cache. CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]); const Slice key1 = cache_key1.AsSlice(); - auto handle1 = blob_cache->Lookup(key1, statistics); + auto handle1 = blob_cache->BasicLookup(key1, statistics); ASSERT_EQ(handle1, nullptr); // erase key0 from the primary cache. blob_cache->Erase(key0); - handle0 = blob_cache->Lookup(key0, statistics); + handle0 = blob_cache->BasicLookup(key0, statistics); ASSERT_EQ(handle0, nullptr); // key1 promotion should succeed due to the primary cache being empty. we @@ -1307,7 +1300,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { // in the secondary cache. 
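Note: the test updates above drop the hand-rolled create_cb lambda; the secondary cache is now probed with the interface's kFullHelper, and plain primary-tier probes go through BasicLookup. For context, a hypothetical way to stand up the two-tier blob cache this test exercises; the parameter values are illustrative, not the fixture's, and this assumes the public cache API of this RocksDB line:

#include "rocksdb/cache.h"
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical two-tier blob cache: a small primary LRU cache whose
// evictions can be demoted into a compressed secondary cache, and from
// which hot entries are promoted back on repeated access.
Options TwoTierBlobCacheOptions() {
  CompressedSecondaryCacheOptions secondary_opts;
  secondary_opts.capacity = 8 << 20;  // 8 MiB secondary tier

  LRUCacheOptions primary_opts;
  primary_opts.capacity = 2 << 20;  // 2 MiB primary tier
  primary_opts.metadata_charge_policy = kDontChargeCacheMetadata;
  primary_opts.secondary_cache = NewCompressedSecondaryCache(secondary_opts);

  Options options;
  options.enable_blob_files = true;
  options.blob_cache = NewLRUCache(primary_opts);
  // Warm the cache at flush time, matching PrepopulateBlobCache::kFlushOnly
  // as used by BlobFileBuilder::PutBlobIntoCacheIfNeeded above.
  options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
  return options;
}

With a setup along these lines, the sequence the test walks through (miss in the primary cache, hit in the secondary cache, promotion back on repeated access) becomes reproducible.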
So, the primary cache's Lookup() without // secondary cache support cannot see it. (NOTE: The dummy handle used // to be a leaky abstraction but not anymore.) - handle1 = blob_cache->Lookup(key1, statistics); + handle1 = blob_cache->BasicLookup(key1, statistics); ASSERT_EQ(handle1, nullptr); // But after another access, it is promoted to primary cache @@ -1315,7 +1308,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { blob_offsets[1])); // And Lookup() can find it (without secondary cache support) - handle1 = blob_cache->Lookup(key1, statistics); + handle1 = blob_cache->BasicLookup(key1, statistics); ASSERT_NE(handle1, nullptr); ASSERT_NE(blob_cache->Value(handle1), nullptr); blob_cache->Release(handle1); diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index c7fbc332ab2..e6832a2ae44 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -779,7 +779,7 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { Options options = GetDefaultOptions(); LRUCacheOptions co; - co.capacity = 2 << 20; // 2MB + co.capacity = 2 << 20; // 2MB co.num_shard_bits = 2; co.metadata_charge_policy = kDontChargeCacheMetadata; auto backing_cache = NewLRUCache(co); diff --git a/db/builder.cc b/db/builder.cc index d4bb395b111..a84bd5a45f4 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -71,8 +71,9 @@ Status BuildTable( int job_id, const Env::IOPriority io_priority, TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, const std::string* full_history_ts_low, - BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries, - uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { + BlobFileCompletionCallback* blob_callback, Version* version, + uint64_t* num_input_entries, uint64_t* memtable_payload_bytes, + uint64_t* memtable_garbage_bytes) { assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == tboptions.column_family_name.empty()); @@ -175,10 +176,10 @@ Status BuildTable( builder = NewTableBuilder(tboptions, file_writer.get()); } + auto ucmp = tboptions.internal_comparator.user_comparator(); MergeHelper merge( - env, tboptions.internal_comparator.user_comparator(), - ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, - true /* internal key corruption is not ok */, + env, ucmp, ioptions.merge_operator.get(), compaction_filter.get(), + ioptions.logger, true /* internal key corruption is not ok */, snapshots.empty() ? 
0 : snapshots.back(), snapshot_checker); std::unique_ptr blob_file_builder( @@ -196,9 +197,8 @@ Status BuildTable( const std::atomic kManualCompactionCanceledFalse{false}; CompactionIterator c_iter( - iter, tboptions.internal_comparator.user_comparator(), &merge, - kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, - job_snapshot, snapshot_checker, env, + iter, ucmp, &merge, kMaxSequenceNumber, &snapshots, + earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats), true /* internal key corruption is not ok */, range_del_agg.get(), blob_file_builder.get(), ioptions.allow_data_in_errors, @@ -241,14 +241,28 @@ Status BuildTable( if (s.ok()) { auto range_del_it = range_del_agg->NewIterator(); + Slice last_tombstone_start_user_key{}; for (range_del_it->SeekToFirst(); range_del_it->Valid(); range_del_it->Next()) { auto tombstone = range_del_it->Tombstone(); auto kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); - meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), - tombstone.seq_, + InternalKey tombstone_end = tombstone.SerializeEndKey(); + meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_, tboptions.internal_comparator); + if (version) { + if (last_tombstone_start_user_key.empty() || + ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key, + range_del_it->start_key()) < 0) { + SizeApproximationOptions approx_opts; + approx_opts.files_size_error_margin = 0.1; + meta->compensated_range_deletion_size += versions->ApproximateSize( + approx_opts, version, kv.first.Encode(), tombstone_end.Encode(), + 0 /* start_level */, -1 /* end_level */, + TableReaderCaller::kFlush); + } + last_tombstone_start_user_key = range_del_it->start_key(); + } } } @@ -281,7 +295,8 @@ Status BuildTable( meta->fd.file_size = file_size; meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); - tp = builder->GetTableProperties(); // refresh now that builder is finished + tp = builder + ->GetTableProperties(); // refresh now that builder is finished if (memtable_payload_bytes != nullptr && memtable_garbage_bytes != nullptr) { const CompactionIterationStats& ci_stats = c_iter.iter_stats(); diff --git a/db/builder.h b/db/builder.h index a028fd2ba38..063da5ca9ed 100644 --- a/db/builder.h +++ b/db/builder.h @@ -13,6 +13,7 @@ #include "db/range_tombstone_fragmenter.h" #include "db/seqno_to_time_mapping.h" #include "db/table_properties_collector.h" +#include "db/version_set.h" #include "logging/event_logger.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" @@ -70,7 +71,7 @@ extern Status BuildTable( Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, const std::string* full_history_ts_low = nullptr, BlobFileCompletionCallback* blob_callback = nullptr, - uint64_t* num_input_entries = nullptr, + Version* version = nullptr, uint64_t* num_input_entries = nullptr, uint64_t* memtable_payload_bytes = nullptr, uint64_t* memtable_garbage_bytes = nullptr); diff --git a/db/c.cc b/db/c.cc index 3ce2780615e..9615791a83c 100644 --- a/db/c.cc +++ b/db/c.cc @@ -125,28 +125,48 @@ using ROCKSDB_NAMESPACE::WriteBatch; using ROCKSDB_NAMESPACE::WriteBatchWithIndex; using ROCKSDB_NAMESPACE::WriteOptions; -using std::vector; using std::unordered_set; +using std::vector; extern "C" { -struct rocksdb_t { DB* rep; }; -struct rocksdb_backup_engine_t { BackupEngine* rep; }; -struct rocksdb_backup_engine_info_t { std::vector rep; }; -struct rocksdb_restore_options_t { 
RestoreOptions rep; }; -struct rocksdb_iterator_t { Iterator* rep; }; -struct rocksdb_writebatch_t { WriteBatch rep; }; -struct rocksdb_writebatch_wi_t { WriteBatchWithIndex* rep; }; -struct rocksdb_snapshot_t { const Snapshot* rep; }; -struct rocksdb_flushoptions_t { FlushOptions rep; }; -struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; }; +struct rocksdb_t { + DB* rep; +}; +struct rocksdb_backup_engine_t { + BackupEngine* rep; +}; +struct rocksdb_backup_engine_info_t { + std::vector rep; +}; +struct rocksdb_restore_options_t { + RestoreOptions rep; +}; +struct rocksdb_iterator_t { + Iterator* rep; +}; +struct rocksdb_writebatch_t { + WriteBatch rep; +}; +struct rocksdb_writebatch_wi_t { + WriteBatchWithIndex* rep; +}; +struct rocksdb_snapshot_t { + const Snapshot* rep; +}; +struct rocksdb_flushoptions_t { + FlushOptions rep; +}; +struct rocksdb_fifo_compaction_options_t { + CompactionOptionsFIFO rep; +}; struct rocksdb_readoptions_t { - ReadOptions rep; - // stack variables to set pointers to in ReadOptions - Slice upper_bound; - Slice lower_bound; - Slice timestamp; - Slice iter_start_ts; + ReadOptions rep; + // stack variables to set pointers to in ReadOptions + Slice upper_bound; + Slice lower_bound; + Slice timestamp; + Slice iter_start_ts; }; struct rocksdb_writeoptions_t { WriteOptions rep; @@ -164,12 +184,24 @@ struct rocksdb_block_based_table_options_t { struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; }; -struct rocksdb_seqfile_t { SequentialFile* rep; }; -struct rocksdb_randomfile_t { RandomAccessFile* rep; }; -struct rocksdb_writablefile_t { WritableFile* rep; }; -struct rocksdb_wal_iterator_t { TransactionLogIterator* rep; }; -struct rocksdb_wal_readoptions_t { TransactionLogIterator::ReadOptions rep; }; -struct rocksdb_filelock_t { FileLock* rep; }; +struct rocksdb_seqfile_t { + SequentialFile* rep; +}; +struct rocksdb_randomfile_t { + RandomAccessFile* rep; +}; +struct rocksdb_writablefile_t { + WritableFile* rep; +}; +struct rocksdb_wal_iterator_t { + TransactionLogIterator* rep; +}; +struct rocksdb_wal_readoptions_t { + TransactionLogIterator::ReadOptions rep; +}; +struct rocksdb_filelock_t { + FileLock* rep; +}; struct rocksdb_logger_t { std::shared_ptr rep; }; @@ -182,8 +214,12 @@ struct rocksdb_memory_allocator_t { struct rocksdb_cache_t { std::shared_ptr rep; }; -struct rocksdb_livefiles_t { std::vector rep; }; -struct rocksdb_column_family_handle_t { ColumnFamilyHandle* rep; }; +struct rocksdb_livefiles_t { + std::vector rep; +}; +struct rocksdb_column_family_handle_t { + ColumnFamilyHandle* rep; +}; struct rocksdb_column_family_metadata_t { ColumnFamilyMetaData rep; }; @@ -193,13 +229,21 @@ struct rocksdb_level_metadata_t { struct rocksdb_sst_file_metadata_t { const SstFileMetaData* rep; }; -struct rocksdb_envoptions_t { EnvOptions rep; }; -struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; }; -struct rocksdb_sstfilewriter_t { SstFileWriter* rep; }; +struct rocksdb_envoptions_t { + EnvOptions rep; +}; +struct rocksdb_ingestexternalfileoptions_t { + IngestExternalFileOptions rep; +}; +struct rocksdb_sstfilewriter_t { + SstFileWriter* rep; +}; struct rocksdb_ratelimiter_t { std::shared_ptr rep; }; -struct rocksdb_perfcontext_t { PerfContext* rep; }; +struct rocksdb_perfcontext_t { + PerfContext* rep; +}; struct rocksdb_pinnableslice_t { PinnableSlice rep; }; @@ -235,13 +279,10 @@ struct rocksdb_compactionfiltercontext_t { struct rocksdb_compactionfilter_t : public CompactionFilter { void* state_; void 
(*destructor_)(void*); - unsigned char (*filter_)( - void*, - int level, - const char* key, size_t key_length, - const char* existing_value, size_t value_length, - char** new_value, size_t *new_value_length, - unsigned char* value_changed); + unsigned char (*filter_)(void*, int level, const char* key, size_t key_length, + const char* existing_value, size_t value_length, + char** new_value, size_t* new_value_length, + unsigned char* value_changed); const char* (*name_)(void*); unsigned char ignore_snapshots_; @@ -252,12 +293,10 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { char* c_new_value = nullptr; size_t new_value_length = 0; unsigned char c_value_changed = 0; - unsigned char result = (*filter_)( - state_, - level, - key.data(), key.size(), - existing_value.data(), existing_value.size(), - &c_new_value, &new_value_length, &c_value_changed); + unsigned char result = + (*filter_)(state_, level, key.data(), key.size(), existing_value.data(), + existing_value.size(), &c_new_value, &new_value_length, + &c_value_changed); if (c_value_changed) { new_value->assign(c_new_value, new_value_length); *value_changed = true; @@ -350,20 +389,16 @@ struct rocksdb_mergeoperator_t : public MergeOperator { void* state_; void (*destructor_)(void*); const char* (*name_)(void*); - char* (*full_merge_)( - void*, - const char* key, size_t key_length, - const char* existing_value, size_t existing_value_length, - const char* const* operands_list, const size_t* operands_list_length, - int num_operands, - unsigned char* success, size_t* new_value_length); + char* (*full_merge_)(void*, const char* key, size_t key_length, + const char* existing_value, size_t existing_value_length, + const char* const* operands_list, + const size_t* operands_list_length, int num_operands, + unsigned char* success, size_t* new_value_length); char* (*partial_merge_)(void*, const char* key, size_t key_length, const char* const* operands_list, const size_t* operands_list_length, int num_operands, unsigned char* success, size_t* new_value_length); - void (*delete_value_)( - void*, - const char* value, size_t value_length); + void (*delete_value_)(void*, const char* value, size_t value_length); ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); } @@ -447,16 +482,10 @@ struct rocksdb_slicetransform_t : public SliceTransform { void* state_; void (*destructor_)(void*); const char* (*name_)(void*); - char* (*transform_)( - void*, - const char* key, size_t length, - size_t* dst_length); - unsigned char (*in_domain_)( - void*, - const char* key, size_t length); - unsigned char (*in_range_)( - void*, - const char* key, size_t length); + char* (*transform_)(void*, const char* key, size_t length, + size_t* dst_length); + unsigned char (*in_domain_)(void*, const char* key, size_t length); + unsigned char (*in_range_)(void*, const char* key, size_t length); ~rocksdb_slicetransform_t() override { (*destructor_)(state_); } @@ -502,10 +531,8 @@ static char* CopyString(const std::string& str) { return result; } -rocksdb_t* rocksdb_open( - const rocksdb_options_t* options, - const char* name, - char** errptr) { +rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name, + char** errptr) { DB* db; if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { return nullptr; @@ -515,11 +542,8 @@ rocksdb_t* rocksdb_open( return result; } -rocksdb_t* rocksdb_open_with_ttl( - const rocksdb_options_t* options, - const char* name, - int ttl, - char** errptr) { +rocksdb_t* rocksdb_open_with_ttl(const 
rocksdb_options_t* options, + const char* name, int ttl, char** errptr) { ROCKSDB_NAMESPACE::DBWithTTL* db; if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open( options->rep, std::string(name), &db, ttl))) { @@ -587,15 +611,13 @@ rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( } void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, - rocksdb_t* db, - char** errptr) { + rocksdb_t* db, char** errptr) { SaveError(errptr, be->rep->CreateNewBackup(db->rep)); } -void rocksdb_backup_engine_create_new_backup_flush(rocksdb_backup_engine_t* be, - rocksdb_t* db, - unsigned char flush_before_backup, - char** errptr) { +void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, + unsigned char flush_before_backup, char** errptr) { SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup)); } @@ -618,9 +640,8 @@ void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt, opt->rep.keep_log_files = v; } - void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be, - uint32_t backup_id, char** errptr) { + uint32_t backup_id, char** errptr) { SaveError(errptr, be->rep->VerifyBackup(static_cast(backup_id))); } @@ -885,13 +906,14 @@ rocksdb_t* rocksdb_open_column_families( DB* db; std::vector handles; - if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), - std::string(name), column_families, &handles, &db))) { + if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name), + column_families, &handles, &db))) { return nullptr; } for (size_t i = 0; i < handles.size(); i++) { - rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t; + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; c_handle->rep = handles[i]; column_family_handles[i] = c_handle; } @@ -958,7 +980,8 @@ rocksdb_t* rocksdb_open_for_read_only_column_families( } for (size_t i = 0; i < handles.size(); i++) { - rocksdb_column_family_handle_t* c_handle = new rocksdb_column_family_handle_t; + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; c_handle->rep = handles[i]; column_family_handles[i] = c_handle; } @@ -998,18 +1021,16 @@ rocksdb_t* rocksdb_open_as_secondary_column_families( return result; } -char** rocksdb_list_column_families( - const rocksdb_options_t* options, - const char* name, - size_t* lencfs, - char** errptr) { +char** rocksdb_list_column_families(const rocksdb_options_t* options, + const char* name, size_t* lencfs, + char** errptr) { std::vector fams; - SaveError(errptr, - DB::ListColumnFamilies(DBOptions(options->rep), - std::string(name), &fams)); + SaveError(errptr, DB::ListColumnFamilies(DBOptions(options->rep), + std::string(name), &fams)); *lencfs = fams.size(); - char** column_families = static_cast(malloc(sizeof(char*) * fams.size())); + char** column_families = + static_cast(malloc(sizeof(char*) * fams.size())); for (size_t i = 0; i < fams.size(); i++) { column_families[i] = strdup(fams[i].c_str()); } @@ -1024,14 +1045,12 @@ void rocksdb_list_column_families_destroy(char** list, size_t len) { } rocksdb_column_family_handle_t* rocksdb_create_column_family( - rocksdb_t* db, - const rocksdb_options_t* column_family_options, - const char* column_family_name, - char** errptr) { + rocksdb_t* db, const rocksdb_options_t* column_family_options, + const char* column_family_name, char** errptr) { rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t; - SaveError(errptr, - 
db->rep->CreateColumnFamily(ColumnFamilyOptions(column_family_options->rep), - std::string(column_family_name), &(handle->rep))); + SaveError(errptr, db->rep->CreateColumnFamily( + ColumnFamilyOptions(column_family_options->rep), + std::string(column_family_name), &(handle->rep))); return handle; } @@ -1047,10 +1066,9 @@ rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( return handle; } -void rocksdb_drop_column_family( - rocksdb_t* db, - rocksdb_column_family_handle_t* handle, - char** errptr) { +void rocksdb_drop_column_family(rocksdb_t* db, + rocksdb_column_family_handle_t* handle, + char** errptr) { SaveError(errptr, db->rep->DropColumnFamily(handle->rep)); } @@ -1066,17 +1084,15 @@ char* rocksdb_column_family_handle_get_name( return CopyString(name); } -void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t* handle) { +void rocksdb_column_family_handle_destroy( + rocksdb_column_family_handle_t* handle) { delete handle->rep; delete handle; } -void rocksdb_put( - rocksdb_t* db, - const rocksdb_writeoptions_t* options, - const char* key, size_t keylen, - const char* val, size_t vallen, - char** errptr) { +void rocksdb_put(rocksdb_t* db, const rocksdb_writeoptions_t* options, + const char* key, size_t keylen, const char* val, size_t vallen, + char** errptr) { SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); } @@ -1113,12 +1129,9 @@ void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options, SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); } -void rocksdb_delete_cf( - rocksdb_t* db, - const rocksdb_writeoptions_t* options, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t keylen, - char** errptr) { +void rocksdb_delete_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, char** errptr) { SaveError(errptr, db->rep->Delete(options->rep, column_family->rep, Slice(key, keylen))); } @@ -1215,25 +1228,18 @@ void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t keylen, const char* val, size_t vallen, char** errptr) { - SaveError(errptr, - db->rep->Merge(options->rep, column_family->rep, - Slice(key, keylen), Slice(val, vallen))); + SaveError(errptr, db->rep->Merge(options->rep, column_family->rep, + Slice(key, keylen), Slice(val, vallen))); } -void rocksdb_write( - rocksdb_t* db, - const rocksdb_writeoptions_t* options, - rocksdb_writebatch_t* batch, - char** errptr) { +void rocksdb_write(rocksdb_t* db, const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); } -char* rocksdb_get( - rocksdb_t* db, - const rocksdb_readoptions_t* options, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { +char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options, + const char* key, size_t keylen, size_t* vallen, + char** errptr) { char* result = nullptr; std::string tmp; Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); @@ -1249,17 +1255,14 @@ char* rocksdb_get( return result; } -char* rocksdb_get_cf( - rocksdb_t* db, - const rocksdb_readoptions_t* options, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { +char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options, + 
rocksdb_column_family_handle_t* column_family, + const char* key, size_t keylen, size_t* vallen, + char** errptr) { char* result = nullptr; std::string tmp; - Status s = db->rep->Get(options->rep, column_family->rep, - Slice(key, keylen), &tmp); + Status s = + db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp); if (s.ok()) { *vallen = tmp.size(); result = CopyString(tmp); @@ -1539,21 +1542,19 @@ unsigned char rocksdb_key_may_exist_cf( } rocksdb_iterator_t* rocksdb_create_iterator( - rocksdb_t* db, - const rocksdb_readoptions_t* options) { + rocksdb_t* db, const rocksdb_readoptions_t* options) { rocksdb_iterator_t* result = new rocksdb_iterator_t; result->rep = db->rep->NewIterator(options->rep); return result; } rocksdb_wal_iterator_t* rocksdb_get_updates_since( - rocksdb_t* db, uint64_t seq_number, - const rocksdb_wal_readoptions_t* options, - char** errptr) { + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, char** errptr) { std::unique_ptr iter; TransactionLogIterator::ReadOptions ro; - if (options!=nullptr) { - ro = options->rep; + if (options != nullptr) { + ro = options->rep; } if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) { return nullptr; @@ -1563,24 +1564,24 @@ rocksdb_wal_iterator_t* rocksdb_get_updates_since( return result; } -void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { - iter->rep->Next(); -} +void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { iter->rep->Next(); } unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) { - return iter->rep->Valid(); + return iter->rep->Valid(); } -void rocksdb_wal_iter_status (const rocksdb_wal_iterator_t* iter, char** errptr) { - SaveError(errptr, iter->rep->status()); +void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter, + char** errptr) { + SaveError(errptr, iter->rep->status()); } -void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) { +void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) { delete iter->rep; delete iter; } -rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) { +rocksdb_writebatch_t* rocksdb_wal_iter_get_batch( + const rocksdb_wal_iterator_t* iter, uint64_t* seq) { rocksdb_writebatch_t* result = rocksdb_writebatch_create(); BatchResult wal_batch = iter->rep->GetBatch(); result->rep = std::move(*wal_batch.writeBatchPtr); @@ -1590,26 +1591,22 @@ rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* return result; } -uint64_t rocksdb_get_latest_sequence_number (rocksdb_t *db) { - return db->rep->GetLatestSequenceNumber(); +uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) { + return db->rep->GetLatestSequenceNumber(); } rocksdb_iterator_t* rocksdb_create_iterator_cf( - rocksdb_t* db, - const rocksdb_readoptions_t* options, + rocksdb_t* db, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family) { rocksdb_iterator_t* result = new rocksdb_iterator_t; result->rep = db->rep->NewIterator(options->rep, column_family->rep); return result; } -void rocksdb_create_iterators( - rocksdb_t *db, - rocksdb_readoptions_t* opts, - rocksdb_column_family_handle_t** column_families, - rocksdb_iterator_t** iterators, - size_t size, - char** errptr) { +void rocksdb_create_iterators(rocksdb_t* db, rocksdb_readoptions_t* opts, + rocksdb_column_family_handle_t** column_families, + rocksdb_iterator_t** iterators, size_t size, + char** errptr) { std::vector column_families_vec; for 
(size_t i = 0; i < size; i++) { column_families_vec.push_back(column_families[i]->rep); @@ -1628,23 +1625,19 @@ void rocksdb_create_iterators( } } -const rocksdb_snapshot_t* rocksdb_create_snapshot( - rocksdb_t* db) { +const rocksdb_snapshot_t* rocksdb_create_snapshot(rocksdb_t* db) { rocksdb_snapshot_t* result = new rocksdb_snapshot_t; result->rep = db->rep->GetSnapshot(); return result; } -void rocksdb_release_snapshot( - rocksdb_t* db, - const rocksdb_snapshot_t* snapshot) { +void rocksdb_release_snapshot(rocksdb_t* db, + const rocksdb_snapshot_t* snapshot) { db->rep->ReleaseSnapshot(snapshot->rep); delete snapshot; } -char* rocksdb_property_value( - rocksdb_t* db, - const char* propname) { +char* rocksdb_property_value(rocksdb_t* db, const char* propname) { std::string tmp; if (db->rep->GetProperty(Slice(propname), &tmp)) { // We use strdup() since we expect human readable output. @@ -1654,10 +1647,8 @@ char* rocksdb_property_value( } } -int rocksdb_property_int( - rocksdb_t* db, - const char* propname, - uint64_t *out_val) { +int rocksdb_property_int(rocksdb_t* db, const char* propname, + uint64_t* out_val) { if (db->rep->GetIntProperty(Slice(propname), out_val)) { return 0; } else { @@ -1665,11 +1656,9 @@ int rocksdb_property_int( } } -int rocksdb_property_int_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - const char* propname, - uint64_t *out_val) { +int rocksdb_property_int_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t* out_val) { if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) { return 0; } else { @@ -1677,10 +1666,9 @@ int rocksdb_property_int_cf( } } -char* rocksdb_property_value_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - const char* propname) { +char* rocksdb_property_value_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* propname) { std::string tmp; if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) { // We use strdup() since we expect human readable output. @@ -1726,23 +1714,19 @@ void rocksdb_approximate_sizes_cf( delete[] ranges; } -void rocksdb_delete_file( - rocksdb_t* db, - const char* name) { +void rocksdb_delete_file(rocksdb_t* db, const char* name) { db->rep->DeleteFile(name); } -const rocksdb_livefiles_t* rocksdb_livefiles( - rocksdb_t* db) { +const rocksdb_livefiles_t* rocksdb_livefiles(rocksdb_t* db) { rocksdb_livefiles_t* result = new rocksdb_livefiles_t; db->rep->GetLiveFilesMetaData(&result->rep); return result; } -void rocksdb_compact_range( - rocksdb_t* db, - const char* start_key, size_t start_key_len, - const char* limit_key, size_t limit_key_len) { +void rocksdb_compact_range(rocksdb_t* db, const char* start_key, + size_t start_key_len, const char* limit_key, + size_t limit_key_len) { Slice a, b; db->rep->CompactRange( CompactRangeOptions(), @@ -1751,11 +1735,10 @@ void rocksdb_compact_range( (limit_key ? 
(b = Slice(limit_key, limit_key_len), &b) : nullptr)); } -void rocksdb_compact_range_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - const char* start_key, size_t start_key_len, - const char* limit_key, size_t limit_key_len) { +void rocksdb_compact_range_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* column_family, + const char* start_key, size_t start_key_len, + const char* limit_key, size_t limit_key_len) { Slice a, b; db->rep->CompactRange( CompactRangeOptions(), column_family->rep, @@ -1811,18 +1794,14 @@ void rocksdb_compact_range_cf_opt(rocksdb_t* db, (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)); } -void rocksdb_flush( - rocksdb_t* db, - const rocksdb_flushoptions_t* options, - char** errptr) { +void rocksdb_flush(rocksdb_t* db, const rocksdb_flushoptions_t* options, + char** errptr) { SaveError(errptr, db->rep->Flush(options->rep)); } -void rocksdb_flush_cf( - rocksdb_t* db, - const rocksdb_flushoptions_t* options, - rocksdb_column_family_handle_t* column_family, - char** errptr) { +void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t* column_family, + char** errptr) { SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); } @@ -1830,30 +1809,22 @@ void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { SaveError(errptr, db->rep->FlushWAL(sync)); } -void rocksdb_disable_file_deletions( - rocksdb_t* db, - char** errptr) { +void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) { SaveError(errptr, db->rep->DisableFileDeletions()); } -void rocksdb_enable_file_deletions( - rocksdb_t* db, - unsigned char force, - char** errptr) { +void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force, + char** errptr) { SaveError(errptr, db->rep->EnableFileDeletions(force)); } -void rocksdb_destroy_db( - const rocksdb_options_t* options, - const char* name, - char** errptr) { +void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name, + char** errptr) { SaveError(errptr, DestroyDB(name, options->rep)); } -void rocksdb_repair_db( - const rocksdb_options_t* options, - const char* name, - char** errptr) { +void rocksdb_repair_db(const rocksdb_options_t* options, const char* name, + char** errptr) { SaveError(errptr, RepairDB(name, options->rep)); } @@ -1883,13 +1854,9 @@ void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k, iter->rep->SeekForPrev(Slice(k, klen)); } -void rocksdb_iter_next(rocksdb_iterator_t* iter) { - iter->rep->Next(); -} +void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); } -void rocksdb_iter_prev(rocksdb_iterator_t* iter) { - iter->rep->Prev(); -} +void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); } const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) { Slice s = iter->rep->key(); @@ -1991,20 +1958,18 @@ void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key, b->rep.Merge(Slice(key, klen), Slice(val, vlen)); } -void rocksdb_writebatch_merge_cf( - rocksdb_writebatch_t* b, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t klen, - const char* val, size_t vlen) { +void rocksdb_writebatch_merge_cf(rocksdb_writebatch_t* b, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen) { b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen)); } -void rocksdb_writebatch_mergev( - rocksdb_writebatch_t* b, - int num_keys, const char* 
const* keys_list, - const size_t* keys_list_sizes, - int num_values, const char* const* values_list, - const size_t* values_list_sizes) { +void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, + const size_t* values_list_sizes) { std::vector key_slices(num_keys); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); @@ -2017,13 +1982,12 @@ void rocksdb_writebatch_mergev( SliceParts(value_slices.data(), num_values)); } -void rocksdb_writebatch_mergev_cf( - rocksdb_writebatch_t* b, - rocksdb_column_family_handle_t* column_family, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes, - int num_values, const char* const* values_list, - const size_t* values_list_sizes) { +void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b, + rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, + const size_t* values_list_sizes) { std::vector key_slices(num_keys); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); @@ -2036,9 +2000,8 @@ void rocksdb_writebatch_mergev_cf( SliceParts(value_slices.data(), num_values)); } -void rocksdb_writebatch_delete( - rocksdb_writebatch_t* b, - const char* key, size_t klen) { +void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key, + size_t klen) { b->rep.Delete(Slice(key, klen)); } @@ -2047,10 +2010,9 @@ void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key, b->rep.SingleDelete(Slice(key, klen)); } -void rocksdb_writebatch_delete_cf( - rocksdb_writebatch_t* b, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t klen) { +void rocksdb_writebatch_delete_cf(rocksdb_writebatch_t* b, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen) { b->rep.Delete(column_family->rep, Slice(key, klen)); } @@ -2139,9 +2101,8 @@ void rocksdb_writebatch_delete_rangev_cf( SliceParts(end_key_slices.data(), num_keys)); } -void rocksdb_writebatch_put_log_data( - rocksdb_writebatch_t* b, - const char* blob, size_t len) { +void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob, + size_t len) { b->rep.PutLogData(Slice(blob, len)); } @@ -2158,11 +2119,11 @@ class H : public WriteBatch::Handler { } }; -void rocksdb_writebatch_iterate( - rocksdb_writebatch_t* b, - void* state, - void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), - void (*deleted)(void*, const char* k, size_t klen)) { +void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state, + void (*put)(void*, const char* k, size_t klen, + const char* v, size_t vlen), + void (*deleted)(void*, const char* k, + size_t klen)) { H handler; handler.state_ = state; handler.put_ = put; @@ -2188,9 +2149,11 @@ void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) { SaveError(errptr, b->rep.PopSavePoint()); } -rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(size_t reserved_bytes, unsigned char overwrite_key) { +rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create( + size_t reserved_bytes, unsigned char overwrite_key) { rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t; - b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes, overwrite_key); + b->rep = new WriteBatchWithIndex(BytewiseComparator(), 
reserved_bytes, + overwrite_key); return b; } @@ -2209,27 +2172,23 @@ int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) { return b->rep->GetWriteBatch()->Count(); } -void rocksdb_writebatch_wi_put( - rocksdb_writebatch_wi_t* b, - const char* key, size_t klen, - const char* val, size_t vlen) { +void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t* b, const char* key, + size_t klen, const char* val, size_t vlen) { b->rep->Put(Slice(key, klen), Slice(val, vlen)); } -void rocksdb_writebatch_wi_put_cf( - rocksdb_writebatch_wi_t* b, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t klen, - const char* val, size_t vlen) { +void rocksdb_writebatch_wi_put_cf(rocksdb_writebatch_wi_t* b, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen) { b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen)); } -void rocksdb_writebatch_wi_putv( - rocksdb_writebatch_wi_t* b, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes, - int num_values, const char* const* values_list, - const size_t* values_list_sizes) { +void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, + const size_t* values_list_sizes) { std::vector key_slices(num_keys); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); @@ -2239,14 +2198,12 @@ void rocksdb_writebatch_wi_putv( value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } b->rep->Put(SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + SliceParts(value_slices.data(), num_values)); } void rocksdb_writebatch_wi_putv_cf( - rocksdb_writebatch_wi_t* b, - rocksdb_column_family_handle_t* column_family, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes, + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { std::vector key_slices(num_keys); @@ -2258,30 +2215,25 @@ void rocksdb_writebatch_wi_putv_cf( value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + SliceParts(value_slices.data(), num_values)); } -void rocksdb_writebatch_wi_merge( - rocksdb_writebatch_wi_t* b, - const char* key, size_t klen, - const char* val, size_t vlen) { +void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t* b, const char* key, + size_t klen, const char* val, size_t vlen) { b->rep->Merge(Slice(key, klen), Slice(val, vlen)); } void rocksdb_writebatch_wi_merge_cf( - rocksdb_writebatch_wi_t* b, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t klen, - const char* val, size_t vlen) { + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen) { b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen)); } -void rocksdb_writebatch_wi_mergev( - rocksdb_writebatch_wi_t* b, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes, - int num_values, const char* const* values_list, - const size_t* values_list_sizes) { +void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys, + const 
char* const* keys_list, + const size_t* keys_list_sizes, int num_values, + const char* const* values_list, + const size_t* values_list_sizes) { std::vector key_slices(num_keys); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); @@ -2291,14 +2243,12 @@ void rocksdb_writebatch_wi_mergev( value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } b->rep->Merge(SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + SliceParts(value_slices.data(), num_values)); } void rocksdb_writebatch_wi_mergev_cf( - rocksdb_writebatch_wi_t* b, - rocksdb_column_family_handle_t* column_family, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes, + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes, int num_values, const char* const* values_list, const size_t* values_list_sizes) { std::vector key_slices(num_keys); @@ -2310,12 +2260,11 @@ void rocksdb_writebatch_wi_mergev_cf( value_slices[i] = Slice(values_list[i], values_list_sizes[i]); } b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys), - SliceParts(value_slices.data(), num_values)); + SliceParts(value_slices.data(), num_values)); } -void rocksdb_writebatch_wi_delete( - rocksdb_writebatch_wi_t* b, - const char* key, size_t klen) { +void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key, + size_t klen) { b->rep->Delete(Slice(key, klen)); } @@ -2325,8 +2274,7 @@ void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b, } void rocksdb_writebatch_wi_delete_cf( - rocksdb_writebatch_wi_t* b, - rocksdb_column_family_handle_t* column_family, + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen) { b->rep->Delete(column_family->rep, Slice(key, klen)); } @@ -2337,10 +2285,9 @@ void rocksdb_writebatch_wi_singledelete_cf( b->rep->SingleDelete(column_family->rep, Slice(key, klen)); } -void rocksdb_writebatch_wi_deletev( - rocksdb_writebatch_wi_t* b, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes) { +void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys, + const char* const* keys_list, + const size_t* keys_list_sizes) { std::vector key_slices(num_keys); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); @@ -2349,10 +2296,8 @@ void rocksdb_writebatch_wi_deletev( } void rocksdb_writebatch_wi_deletev_cf( - rocksdb_writebatch_wi_t* b, - rocksdb_column_family_handle_t* column_family, - int num_keys, const char* const* keys_list, - const size_t* keys_list_sizes) { + rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family, + int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) { std::vector key_slices(num_keys); for (int i = 0; i < num_keys; i++) { key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]); @@ -2361,11 +2306,12 @@ void rocksdb_writebatch_wi_deletev_cf( } void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b, - const char* start_key, - size_t start_key_len, const char* end_key, - size_t end_key_len) { + const char* start_key, + size_t start_key_len, + const char* end_key, + size_t end_key_len) { b->rep->DeleteRange(Slice(start_key, start_key_len), - Slice(end_key, end_key_len)); + Slice(end_key, end_key_len)); } void rocksdb_writebatch_wi_delete_range_cf( @@ -2373,14 +2319,15 @@ void 
rocksdb_writebatch_wi_delete_range_cf( const char* start_key, size_t start_key_len, const char* end_key, size_t end_key_len) { b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len), - Slice(end_key, end_key_len)); + Slice(end_key, end_key_len)); } -void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, int num_keys, - const char* const* start_keys_list, - const size_t* start_keys_list_sizes, - const char* const* end_keys_list, - const size_t* end_keys_list_sizes) { +void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, + int num_keys, + const char* const* start_keys_list, + const size_t* start_keys_list_sizes, + const char* const* end_keys_list, + const size_t* end_keys_list_sizes) { std::vector start_key_slices(num_keys); std::vector end_key_slices(num_keys); for (int i = 0; i < num_keys; i++) { @@ -2388,7 +2335,7 @@ void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b, int num_key end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]); } b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys), - SliceParts(end_key_slices.data(), num_keys)); + SliceParts(end_key_slices.data(), num_keys)); } void rocksdb_writebatch_wi_delete_rangev_cf( @@ -2403,19 +2350,17 @@ void rocksdb_writebatch_wi_delete_rangev_cf( end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]); } b->rep->DeleteRange(column_family->rep, - SliceParts(start_key_slices.data(), num_keys), - SliceParts(end_key_slices.data(), num_keys)); + SliceParts(start_key_slices.data(), num_keys), + SliceParts(end_key_slices.data(), num_keys)); } -void rocksdb_writebatch_wi_put_log_data( - rocksdb_writebatch_wi_t* b, - const char* blob, size_t len) { +void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b, + const char* blob, size_t len) { b->rep->PutLogData(Slice(blob, len)); } void rocksdb_writebatch_wi_iterate( - rocksdb_writebatch_wi_t* b, - void* state, + rocksdb_writebatch_wi_t* b, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)) { H handler; @@ -2425,7 +2370,8 @@ void rocksdb_writebatch_wi_iterate( b->rep->GetWriteBatch()->Iterate(&handler); } -const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b, size_t* size) { +const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b, + size_t* size) { WriteBatch* wb = b->rep->GetWriteBatch(); *size = wb->GetDataSize(); return wb->Data().c_str(); @@ -2436,13 +2382,12 @@ void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) { } void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b, - char** errptr) { + char** errptr) { SaveError(errptr, b->rep->RollbackToSavePoint()); } rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base( - rocksdb_writebatch_wi_t* wbwi, - rocksdb_iterator_t* base_iterator) { + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator) { rocksdb_iterator_t* result = new rocksdb_iterator_t; result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep); delete base_iterator; @@ -2459,12 +2404,10 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( return result; } -char* rocksdb_writebatch_wi_get_from_batch( - rocksdb_writebatch_wi_t* wbwi, - const rocksdb_options_t* options, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { +char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi, + const rocksdb_options_t* options, + const char* key, size_t 
keylen, + size_t* vallen, char** errptr) { char* result = nullptr; std::string tmp; Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp); @@ -2481,16 +2424,13 @@ char* rocksdb_writebatch_wi_get_from_batch( } char* rocksdb_writebatch_wi_get_from_batch_cf( - rocksdb_writebatch_wi_t* wbwi, - const rocksdb_options_t* options, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { + rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; std::string tmp; Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep, - Slice(key, keylen), &tmp); + Slice(key, keylen), &tmp); if (s.ok()) { *vallen = tmp.size(); result = CopyString(tmp); @@ -2504,15 +2444,13 @@ char* rocksdb_writebatch_wi_get_from_batch_cf( } char* rocksdb_writebatch_wi_get_from_batch_and_db( - rocksdb_writebatch_wi_t* wbwi, - rocksdb_t* db, - const rocksdb_readoptions_t* options, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, + const rocksdb_readoptions_t* options, const char* key, size_t keylen, + size_t* vallen, char** errptr) { char* result = nullptr; std::string tmp; - Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, Slice(key, keylen), &tmp); + Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, + Slice(key, keylen), &tmp); if (s.ok()) { *vallen = tmp.size(); result = CopyString(tmp); @@ -2526,17 +2464,14 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db( } char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( - rocksdb_writebatch_wi_t* wbwi, - rocksdb_t* db, + rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db, const rocksdb_readoptions_t* options, - rocksdb_column_family_handle_t* column_family, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { + rocksdb_column_family_handle_t* column_family, const char* key, + size_t keylen, size_t* vallen, char** errptr) { char* result = nullptr; std::string tmp; - Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep, - Slice(key, keylen), &tmp); + Status s = wbwi->rep->GetFromBatchAndDB( + db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp); if (s.ok()) { *vallen = tmp.size(); result = CopyString(tmp); @@ -2549,11 +2484,9 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db_cf( return result; } -void rocksdb_write_writebatch_wi( - rocksdb_t* db, - const rocksdb_writeoptions_t* options, - rocksdb_writebatch_wi_t* wbwi, - char** errptr) { +void rocksdb_write_writebatch_wi(rocksdb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_wi_t* wbwi, char** errptr) { WriteBatch* wb = wbwi->rep->GetWriteBatch(); SaveError(errptr, db->rep->Write(options->rep, wb)); } @@ -2608,8 +2541,7 @@ void rocksdb_load_latest_options_destroy( } } -rocksdb_block_based_table_options_t* -rocksdb_block_based_options_create() { +rocksdb_block_based_table_options_t* rocksdb_block_based_options_create() { return new rocksdb_block_based_table_options_t; } @@ -2639,22 +2571,32 @@ void rocksdb_block_based_options_set_block_restart_interval( } void rocksdb_block_based_options_set_index_block_restart_interval( - rocksdb_block_based_table_options_t* options, int index_block_restart_interval) { + rocksdb_block_based_table_options_t* options, + int index_block_restart_interval) { 
options->rep.index_block_restart_interval = index_block_restart_interval; } void rocksdb_block_based_options_set_metadata_block_size( - rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size) { + rocksdb_block_based_table_options_t* options, + uint64_t metadata_block_size) { options->rep.metadata_block_size = metadata_block_size; } void rocksdb_block_based_options_set_partition_filters( - rocksdb_block_based_table_options_t* options, unsigned char partition_filters) { + rocksdb_block_based_table_options_t* options, + unsigned char partition_filters) { options->rep.partition_filters = partition_filters; } +void rocksdb_block_based_options_set_optimize_filters_for_memory( + rocksdb_block_based_table_options_t* options, + unsigned char optimize_filters_for_memory) { + options->rep.optimize_filters_for_memory = optimize_filters_for_memory; +} + void rocksdb_block_based_options_set_use_delta_encoding( - rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding) { + rocksdb_block_based_table_options_t* options, + unsigned char use_delta_encoding) { options->rep.use_delta_encoding = use_delta_encoding; } @@ -2704,7 +2646,7 @@ void rocksdb_block_based_options_set_index_type( void rocksdb_block_based_options_set_data_block_index_type( rocksdb_block_based_table_options_t* options, int v) { options->rep.data_block_index_type = - static_cast<BlockBasedTableOptions::DataBlockIndexType>(v); + static_cast<BlockBasedTableOptions::DataBlockIndexType>(v); } void rocksdb_block_based_options_set_data_block_hash_ratio( @@ -2733,7 +2675,7 @@ void rocksdb_block_based_options_set_pin_top_level_index_and_filter( } void rocksdb_options_set_block_based_table_factory( - rocksdb_options_t *opt, + rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options) { if (table_options) { opt->rep.table_factory.reset( @@ -2741,13 +2683,11 @@ void rocksdb_options_set_block_based_table_factory( } } -rocksdb_cuckoo_table_options_t* -rocksdb_cuckoo_options_create() { +rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() { return new rocksdb_cuckoo_table_options_t; } -void rocksdb_cuckoo_options_destroy( - rocksdb_cuckoo_table_options_t* options) { +void rocksdb_cuckoo_options_destroy(rocksdb_cuckoo_table_options_t* options) { delete options; } @@ -2777,51 +2717,44 @@ void rocksdb_cuckoo_options_set_use_module_hash( } void rocksdb_options_set_cuckoo_table_factory( - rocksdb_options_t *opt, - rocksdb_cuckoo_table_options_t* table_options) { + rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options) { if (table_options) { opt->rep.table_factory.reset( ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep)); } } -void rocksdb_set_options( - rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr) { - std::unordered_map<std::string, std::string> options_map; - for (int i=0; i<count; i++) - options_map[keys[i]] = values[i]; - SaveError(errptr, db->rep->SetOptions(options_map)); - } - -void rocksdb_set_options_cf( - rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr) { - std::unordered_map<std::string, std::string> options_map; - for (int i=0; i<count; i++) - options_map[keys[i]] = values[i]; - SaveError(errptr, db->rep->SetOptions(handle->rep, options_map)); - } - -rocksdb_options_t* rocksdb_options_create() { - return new rocksdb_options_t; +void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[], + const char* const values[], char** errptr) { + std::unordered_map<std::string, std::string> options_map; + for (int i = 0; i < count; i++) options_map[keys[i]] = values[i]; + SaveError(errptr, db->rep->SetOptions(options_map)); } -void rocksdb_options_destroy(rocksdb_options_t* options) { - delete options; +void 
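/*
 * The optimize_filters_for_memory setter above is new in this diff; a
 * minimal hook-up sketch (the surrounding object names are assumptions):
 *
 *   rocksdb_block_based_table_options_t* bbto =
 *       rocksdb_block_based_options_create();
 *   rocksdb_block_based_options_set_optimize_filters_for_memory(bbto, 1);
 *   rocksdb_options_set_block_based_table_factory(opt, bbto);
 *   rocksdb_block_based_options_destroy(bbto);
 */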
rocksdb_set_options_cf(rocksdb_t* db, + rocksdb_column_family_handle_t* handle, int count, + const char* const keys[], + const char* const values[], char** errptr) { + std::unordered_map<std::string, std::string> options_map; + for (int i = 0; i < count; i++) options_map[keys[i]] = values[i]; + SaveError(errptr, db->rep->SetOptions(handle->rep, options_map)); } +rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; } + +void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; } + rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) { return new rocksdb_options_t(*options); } -void rocksdb_options_increase_parallelism( - rocksdb_options_t* opt, int total_threads) { +void rocksdb_options_increase_parallelism(rocksdb_options_t* opt, + int total_threads) { opt->rep.IncreaseParallelism(total_threads); } -void rocksdb_options_optimize_for_point_lookup( - rocksdb_options_t* opt, uint64_t block_cache_size_mb) { +void rocksdb_options_optimize_for_point_lookup(rocksdb_options_t* opt, + uint64_t block_cache_size_mb) { opt->rep.OptimizeForPointLookup(block_cache_size_mb); } @@ -2835,8 +2768,8 @@ void rocksdb_options_optimize_universal_style_compaction( opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget); } -void rocksdb_options_set_allow_ingest_behind( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t* opt, + unsigned char v) { opt->rep.allow_ingest_behind = v; } @@ -2844,9 +2777,8 @@ unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) { return opt->rep.allow_ingest_behind; } -void rocksdb_options_set_compaction_filter( - rocksdb_options_t* opt, - rocksdb_compactionfilter_t* filter) { +void rocksdb_options_set_compaction_filter(rocksdb_options_t* opt, + rocksdb_compactionfilter_t* filter) { opt->rep.compaction_filter = filter; } @@ -2856,8 +2788,8 @@ void rocksdb_options_set_compaction_filter_factory( std::shared_ptr<CompactionFilterFactory>(factory); } -void rocksdb_options_compaction_readahead_size( - rocksdb_options_t* opt, size_t s) { +void rocksdb_options_compaction_readahead_size(rocksdb_options_t* opt, + size_t s) { opt->rep.compaction_readahead_size = s; } @@ -2865,20 +2797,18 @@ size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) { return opt->rep.compaction_readahead_size; } -void rocksdb_options_set_comparator( - rocksdb_options_t* opt, - rocksdb_comparator_t* cmp) { +void rocksdb_options_set_comparator(rocksdb_options_t* opt, + rocksdb_comparator_t* cmp) { opt->rep.comparator = cmp; } void rocksdb_options_set_merge_operator( - rocksdb_options_t* opt, - rocksdb_mergeoperator_t* merge_operator) { + rocksdb_options_t* opt, rocksdb_mergeoperator_t* merge_operator) { opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator); } -void rocksdb_options_set_create_if_missing( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_create_if_missing(rocksdb_options_t* opt, + unsigned char v) { opt->rep.create_if_missing = v; } @@ -2886,8 +2816,8 @@ unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) { return opt->rep.create_if_missing; } -void rocksdb_options_set_create_missing_column_families( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_create_missing_column_families(rocksdb_options_t* opt, + unsigned char v) { opt->rep.create_missing_column_families = v; } @@ -2896,8 +2826,8 @@ unsigned char rocksdb_options_get_create_missing_column_families( return opt->rep.create_missing_column_families; } -void 
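/*
 * Usage sketch for the reformatted rocksdb_set_options() above: option
 * names and values travel as parallel string arrays; the values shown are
 * hypothetical.
 *
 *   const char* keys[] = {"disable_auto_compactions",
 *                         "level0_stop_writes_trigger"};
 *   const char* vals[] = {"true", "64"};
 *   char* err = NULL;
 *   rocksdb_set_options(db, 2, keys, vals, &err);
 */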
rocksdb_options_set_error_if_exists( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_error_if_exists(rocksdb_options_t* opt, + unsigned char v) { opt->rep.error_if_exists = v; } @@ -2905,8 +2835,8 @@ unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) { return opt->rep.error_if_exists; } -void rocksdb_options_set_paranoid_checks( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_paranoid_checks(rocksdb_options_t* opt, + unsigned char v) { opt->rep.paranoid_checks = v; } @@ -2934,8 +2864,7 @@ void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) { } } -void rocksdb_options_set_info_log_level( - rocksdb_options_t* opt, int v) { +void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) { opt->rep.info_log_level = static_cast(v); } @@ -2968,7 +2897,8 @@ int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) { return opt->rep.max_open_files; } -void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, int n) { +void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt, + int n) { opt->rep.max_file_opening_threads = n; } @@ -2976,7 +2906,8 @@ int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) { return opt->rep.max_file_opening_threads; } -void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) { +void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, + uint64_t n) { opt->rep.max_total_wal_size = n; } @@ -2984,8 +2915,8 @@ uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) { return opt->rep.max_total_wal_size; } -void rocksdb_options_set_target_file_size_base( - rocksdb_options_t* opt, uint64_t n) { +void rocksdb_options_set_target_file_size_base(rocksdb_options_t* opt, + uint64_t n) { opt->rep.target_file_size_base = n; } @@ -2993,8 +2924,8 @@ uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) { return opt->rep.target_file_size_base; } -void rocksdb_options_set_target_file_size_multiplier( - rocksdb_options_t* opt, int n) { +void rocksdb_options_set_target_file_size_multiplier(rocksdb_options_t* opt, + int n) { opt->rep.target_file_size_multiplier = n; } @@ -3002,8 +2933,8 @@ int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) { return opt->rep.target_file_size_multiplier; } -void rocksdb_options_set_max_bytes_for_level_base( - rocksdb_options_t* opt, uint64_t n) { +void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, + uint64_t n) { opt->rep.max_bytes_for_level_base = n; } @@ -3184,8 +3115,8 @@ int rocksdb_options_get_level0_file_num_compaction_trigger( return opt->rep.level0_file_num_compaction_trigger; } -void rocksdb_options_set_level0_slowdown_writes_trigger( - rocksdb_options_t* opt, int n) { +void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t* opt, + int n) { opt->rep.level0_slowdown_writes_trigger = n; } @@ -3193,8 +3124,8 @@ int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) { return opt->rep.level0_slowdown_writes_trigger; } -void rocksdb_options_set_level0_stop_writes_trigger( - rocksdb_options_t* opt, int n) { +void rocksdb_options_set_level0_stop_writes_trigger(rocksdb_options_t* opt, + int n) { opt->rep.level0_stop_writes_trigger = n; } @@ -3202,7 +3133,7 @@ int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) { return opt->rep.level0_stop_writes_trigger; } -void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* 
opt,int mode) { +void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) { opt->rep.wal_recovery_mode = static_cast(mode); } @@ -3232,7 +3163,7 @@ void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, opt->rep.compression_per_level.resize(num_levels); for (size_t i = 0; i < num_levels; ++i) { opt->rep.compression_per_level[i] = - static_cast(level_values[i]); + static_cast(level_values[i]); } } @@ -3331,8 +3262,7 @@ void rocksdb_options_set_prefix_extractor( opt->rep.prefix_extractor.reset(prefix_extractor); } -void rocksdb_options_set_use_fsync( - rocksdb_options_t* opt, int use_fsync) { +void rocksdb_options_set_use_fsync(rocksdb_options_t* opt, int use_fsync) { opt->rep.use_fsync = use_fsync; } @@ -3340,13 +3270,12 @@ int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) { return opt->rep.use_fsync; } -void rocksdb_options_set_db_log_dir( - rocksdb_options_t* opt, const char* db_log_dir) { +void rocksdb_options_set_db_log_dir(rocksdb_options_t* opt, + const char* db_log_dir) { opt->rep.db_log_dir = db_log_dir; } -void rocksdb_options_set_wal_dir( - rocksdb_options_t* opt, const char* v) { +void rocksdb_options_set_wal_dir(rocksdb_options_t* opt, const char* v) { opt->rep.wal_dir = v; } @@ -3358,8 +3287,8 @@ uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) { return opt->rep.WAL_ttl_seconds; } -void rocksdb_options_set_WAL_size_limit_MB( - rocksdb_options_t* opt, uint64_t limit) { +void rocksdb_options_set_WAL_size_limit_MB(rocksdb_options_t* opt, + uint64_t limit) { opt->rep.WAL_size_limit_MB = limit; } @@ -3367,8 +3296,8 @@ uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) { return opt->rep.WAL_size_limit_MB; } -void rocksdb_options_set_manifest_preallocation_size( - rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_manifest_preallocation_size(rocksdb_options_t* opt, + size_t v) { opt->rep.manifest_preallocation_size = v; } @@ -3395,8 +3324,8 @@ unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction( return opt->rep.use_direct_io_for_flush_and_compaction; } -void rocksdb_options_set_allow_mmap_reads( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt, + unsigned char v) { opt->rep.allow_mmap_reads = v; } @@ -3404,8 +3333,8 @@ unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) { return opt->rep.allow_mmap_reads; } -void rocksdb_options_set_allow_mmap_writes( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_allow_mmap_writes(rocksdb_options_t* opt, + unsigned char v) { opt->rep.allow_mmap_writes = v; } @@ -3413,8 +3342,8 @@ unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) { return opt->rep.allow_mmap_writes; } -void rocksdb_options_set_is_fd_close_on_exec( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_is_fd_close_on_exec(rocksdb_options_t* opt, + unsigned char v) { opt->rep.is_fd_close_on_exec = v; } @@ -3422,8 +3351,8 @@ unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) { return opt->rep.is_fd_close_on_exec; } -void rocksdb_options_set_stats_dump_period_sec( - rocksdb_options_t* opt, unsigned int v) { +void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, + unsigned int v) { opt->rep.stats_dump_period_sec = v; } @@ -3441,8 +3370,8 @@ unsigned int rocksdb_options_get_stats_persist_period_sec( return opt->rep.stats_persist_period_sec; } -void 
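/*
 * Sketch for rocksdb_options_set_compression_per_level() above; the
 * rocksdb_no_compression/rocksdb_snappy_compression constants come from
 * the C header, everything else is illustrative.
 *
 *   int levels[3] = {rocksdb_no_compression, rocksdb_no_compression,
 *                    rocksdb_snappy_compression};
 *   rocksdb_options_set_compression_per_level(opt, levels, 3);
 */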
rocksdb_options_set_advise_random_on_open( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_advise_random_on_open(rocksdb_options_t* opt, + unsigned char v) { opt->rep.advise_random_on_open = v; } @@ -3451,9 +3380,9 @@ unsigned char rocksdb_options_get_advise_random_on_open( return opt->rep.advise_random_on_open; } -void rocksdb_options_set_access_hint_on_compaction_start( - rocksdb_options_t* opt, int v) { - switch(v) { +void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt, + int v) { + switch (v) { case 0: opt->rep.access_hint_on_compaction_start = ROCKSDB_NAMESPACE::Options::NONE; @@ -3480,8 +3409,8 @@ int rocksdb_options_get_access_hint_on_compaction_start( return opt->rep.access_hint_on_compaction_start; } -void rocksdb_options_set_use_adaptive_mutex( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt, + unsigned char v) { opt->rep.use_adaptive_mutex = v; } @@ -3489,8 +3418,8 @@ unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) { return opt->rep.use_adaptive_mutex; } -void rocksdb_options_set_wal_bytes_per_sync( - rocksdb_options_t* opt, uint64_t v) { +void rocksdb_options_set_wal_bytes_per_sync(rocksdb_options_t* opt, + uint64_t v) { opt->rep.wal_bytes_per_sync = v; } @@ -3498,8 +3427,7 @@ uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) { return opt->rep.wal_bytes_per_sync; } -void rocksdb_options_set_bytes_per_sync( - rocksdb_options_t* opt, uint64_t v) { +void rocksdb_options_set_bytes_per_sync(rocksdb_options_t* opt, uint64_t v) { opt->rep.bytes_per_sync = v; } @@ -3547,7 +3475,8 @@ uint64_t rocksdb_options_get_max_sequential_skip_in_iterations( return opt->rep.max_sequential_skip_in_iterations; } -void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, int n) { +void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt, + int n) { opt->rep.max_write_buffer_number = n; } @@ -3555,7 +3484,8 @@ int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) { return opt->rep.max_write_buffer_number; } -void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt, int n) { +void rocksdb_options_set_min_write_buffer_number_to_merge( + rocksdb_options_t* opt, int n) { opt->rep.min_write_buffer_number_to_merge = n; } @@ -3620,7 +3550,8 @@ int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) { return opt->rep.max_background_jobs; } -void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { +void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, + int n) { opt->rep.max_background_compactions = n; } @@ -3654,7 +3585,8 @@ size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) { return opt->rep.max_log_file_size; } -void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt, + size_t v) { opt->rep.log_file_time_to_roll = v; } @@ -3679,7 +3611,8 @@ size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) { return opt->rep.recycle_log_file_num; } -void rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_soft_pending_compaction_bytes_limit( + rocksdb_options_t* opt, size_t v) { opt->rep.soft_pending_compaction_bytes_limit = v; } @@ -3688,7 +3621,8 @@ size_t rocksdb_options_get_soft_pending_compaction_bytes_limit( return 
opt->rep.soft_pending_compaction_bytes_limit; } -void rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_hard_pending_compaction_bytes_limit( + rocksdb_options_t* opt, size_t v) { opt->rep.hard_pending_compaction_bytes_limit = v; } @@ -3697,8 +3631,8 @@ size_t rocksdb_options_get_hard_pending_compaction_bytes_limit( return opt->rep.hard_pending_compaction_bytes_limit; } -void rocksdb_options_set_max_manifest_file_size( - rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_max_manifest_file_size(rocksdb_options_t* opt, + size_t v) { opt->rep.max_manifest_file_size = v; } @@ -3706,8 +3640,8 @@ size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) { return opt->rep.max_manifest_file_size; } -void rocksdb_options_set_table_cache_numshardbits( - rocksdb_options_t* opt, int v) { +void rocksdb_options_set_table_cache_numshardbits(rocksdb_options_t* opt, + int v) { opt->rep.table_cache_numshardbits = v; } @@ -3715,8 +3649,7 @@ int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) { return opt->rep.table_cache_numshardbits; } -void rocksdb_options_set_arena_block_size( - rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_arena_block_size(rocksdb_options_t* opt, size_t v) { opt->rep.arena_block_size = v; } @@ -3724,7 +3657,8 @@ size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) { return opt->rep.arena_block_size; } -void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int disable) { +void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, + int disable) { opt->rep.disable_auto_compactions = disable; } @@ -3733,7 +3667,8 @@ unsigned char rocksdb_options_get_disable_auto_compactions( return opt->rep.disable_auto_compactions; } -void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, int v) { +void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt, + int v) { opt->rep.optimize_filters_for_hits = v; } @@ -3756,7 +3691,7 @@ void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) { opt->rep.PrepareForBulkLoad(); } -void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t *opt) { +void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t* opt) { opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); } @@ -3779,24 +3714,27 @@ size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) { return opt->rep.memtable_huge_page_size; } -void rocksdb_options_set_hash_skip_list_rep( - rocksdb_options_t *opt, size_t bucket_count, - int32_t skiplist_height, int32_t skiplist_branching_factor) { +void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t* opt, + size_t bucket_count, + int32_t skiplist_height, + int32_t skiplist_branching_factor) { ROCKSDB_NAMESPACE::MemTableRepFactory* factory = ROCKSDB_NAMESPACE::NewHashSkipListRepFactory( bucket_count, skiplist_height, skiplist_branching_factor); opt->rep.memtable_factory.reset(factory); } -void rocksdb_options_set_hash_link_list_rep( - rocksdb_options_t *opt, size_t bucket_count) { +void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt, + size_t bucket_count) { opt->rep.memtable_factory.reset( ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count)); } -void rocksdb_options_set_plain_table_factory( - rocksdb_options_t *opt, uint32_t user_key_len, int bloom_bits_per_key, - double hash_table_ratio, size_t index_sparseness) { +void 
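/*
 * The memtable-factory setters above are typically paired with a prefix
 * extractor, as db/c_test.c does further down; a condensed sketch with the
 * same parameters the test uses:
 *
 *   rocksdb_options_set_prefix_extractor(
 *       opt, rocksdb_slicetransform_create_fixed_prefix(3));
 *   rocksdb_options_set_hash_skip_list_rep(opt, 5000, 4, 4);
 */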
rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt, + uint32_t user_key_len, + int bloom_bits_per_key, + double hash_table_ratio, + size_t index_sparseness) { ROCKSDB_NAMESPACE::PlainTableOptions options; options.user_key_len = user_key_len; options.bloom_bits_per_key = bloom_bits_per_key; @@ -3808,8 +3746,8 @@ void rocksdb_options_set_plain_table_factory( opt->rep.table_factory.reset(factory); } -void rocksdb_options_set_max_successive_merges( - rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_max_successive_merges(rocksdb_options_t* opt, + size_t v) { opt->rep.max_successive_merges = v; } @@ -3817,8 +3755,7 @@ size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) { return opt->rep.max_successive_merges; } -void rocksdb_options_set_bloom_locality( - rocksdb_options_t* opt, uint32_t v) { +void rocksdb_options_set_bloom_locality(rocksdb_options_t* opt, uint32_t v) { opt->rep.bloom_locality = v; } @@ -3826,8 +3763,8 @@ uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) { return opt->rep.bloom_locality; } -void rocksdb_options_set_inplace_update_support( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_inplace_update_support(rocksdb_options_t* opt, + unsigned char v) { opt->rep.inplace_update_support = v; } @@ -3836,8 +3773,8 @@ unsigned char rocksdb_options_get_inplace_update_support( return opt->rep.inplace_update_support; } -void rocksdb_options_set_inplace_update_num_locks( - rocksdb_options_t* opt, size_t v) { +void rocksdb_options_set_inplace_update_num_locks(rocksdb_options_t* opt, + size_t v) { opt->rep.inplace_update_num_locks = v; } @@ -3845,8 +3782,7 @@ size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) { return opt->rep.inplace_update_num_locks; } -void rocksdb_options_set_report_bg_io_stats( - rocksdb_options_t* opt, int v) { +void rocksdb_options_set_report_bg_io_stats(rocksdb_options_t* opt, int v) { opt->rep.report_bg_io_stats = v; } @@ -3854,7 +3790,7 @@ unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) { return opt->rep.report_bg_io_stats; } -void rocksdb_options_set_compaction_style(rocksdb_options_t *opt, int style) { +void rocksdb_options_set_compaction_style(rocksdb_options_t* opt, int style) { opt->rep.compaction_style = static_cast(style); } @@ -3863,17 +3799,17 @@ int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) { return opt->rep.compaction_style; } -void rocksdb_options_set_universal_compaction_options(rocksdb_options_t *opt, rocksdb_universal_compaction_options_t *uco) { +void rocksdb_options_set_universal_compaction_options( + rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) { opt->rep.compaction_options_universal = *(uco->rep); } void rocksdb_options_set_fifo_compaction_options( - rocksdb_options_t* opt, - rocksdb_fifo_compaction_options_t* fifo) { + rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo) { opt->rep.compaction_options_fifo = fifo->rep; } -char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) { +char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) { ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get(); if (statistics) { return strdup(statistics->ToString().c_str()); @@ -3881,7 +3817,8 @@ char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) { return nullptr; } -void rocksdb_options_set_ratelimiter(rocksdb_options_t *opt, rocksdb_ratelimiter_t *limiter) { +void rocksdb_options_set_ratelimiter(rocksdb_options_t* 
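/*
 * Sketch for the plain-table setter above, with the parameters the C test
 * uses (user_key_len=4, bloom_bits_per_key=10, hash_table_ratio=0.75,
 * index_sparseness=16); plain table needs mmap reads and a prefix
 * extractor to be useful.
 *
 *   rocksdb_options_set_allow_mmap_reads(opt, 1);
 *   rocksdb_options_set_plain_table_factory(opt, 4, 10, 0.75, 16);
 */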
opt, + rocksdb_ratelimiter_t* limiter) { if (limiter) { opt->rep.rate_limiter = limiter->rep; } @@ -3913,23 +3850,22 @@ int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) { return opt->rep.wal_compression; } -rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( - int64_t rate_bytes_per_sec, - int64_t refill_period_us, - int32_t fairness) { +rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(int64_t rate_bytes_per_sec, + int64_t refill_period_us, + int32_t fairness) { rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; rate_limiter->rep.reset( - NewGenericRateLimiter(rate_bytes_per_sec, - refill_period_us, fairness)); + NewGenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness)); return rate_limiter; } -void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) { +void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) { delete limiter; } -void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cache) { - if(cache) { +void rocksdb_options_set_row_cache(rocksdb_options_t* opt, + rocksdb_cache_t* cache) { + if (cache) { opt->rep.row_cache = cache->rep; } } @@ -3958,12 +3894,12 @@ void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) { } char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context, - unsigned char exclude_zero_counters) { + unsigned char exclude_zero_counters) { return strdup(context->rep->ToString(exclude_zero_counters).c_str()); } uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, - int metric) { + int metric) { PerfContext* rep = context->rep; switch (metric) { case rocksdb_user_key_comparison_count: @@ -4143,15 +4079,12 @@ table_properties_collectors */ rocksdb_compactionfilter_t* rocksdb_compactionfilter_create( - void* state, - void (*destructor)(void*), - unsigned char (*filter)( - void*, - int level, - const char* key, size_t key_length, - const char* existing_value, size_t value_length, - char** new_value, size_t *new_value_length, - unsigned char* value_changed), + void* state, void (*destructor)(void*), + unsigned char (*filter)(void*, int level, const char* key, + size_t key_length, const char* existing_value, + size_t value_length, char** new_value, + size_t* new_value_length, + unsigned char* value_changed), const char* (*name)(void*)) { rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t; result->state_ = state; @@ -4163,8 +4096,7 @@ rocksdb_compactionfilter_t* rocksdb_compactionfilter_create( } void rocksdb_compactionfilter_set_ignore_snapshots( - rocksdb_compactionfilter_t* filter, - unsigned char whether_ignore) { + rocksdb_compactionfilter_t* filter, unsigned char whether_ignore) { filter->ignore_snapshots_ = whether_ignore; } @@ -4202,12 +4134,9 @@ void rocksdb_compactionfilterfactory_destroy( } rocksdb_comparator_t* rocksdb_comparator_create( - void* state, - void (*destructor)(void*), - int (*compare)( - void*, - const char* a, size_t alen, - const char* b, size_t blen), + void* state, void (*destructor)(void*), + int (*compare)(void*, const char* a, size_t alen, const char* b, + size_t blen), const char* (*name)(void*)) { rocksdb_comparator_t* result = new rocksdb_comparator_t; result->state_ = state; @@ -4360,13 +4289,10 @@ rocksdb_readoptions_t* rocksdb_readoptions_create() { return new rocksdb_readoptions_t; } -void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { - delete opt; -} +void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; } -void rocksdb_readoptions_set_verify_checksums( - rocksdb_readoptions_t* 
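/*
 * Hook-up sketch for the rate-limiter API reformatted above: 1 MiB/s with
 * a 100 ms refill period and fairness 10 (values illustrative). The
 * options object keeps its own shared reference (rep is a shared_ptr), so
 * the handle can be destroyed right after it is attached.
 *
 *   rocksdb_ratelimiter_t* rl =
 *       rocksdb_ratelimiter_create(1024 * 1024, 100 * 1000, 10);
 *   rocksdb_options_set_ratelimiter(opt, rl);
 *   rocksdb_ratelimiter_destroy(rl);
 */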
opt, - unsigned char v) { +void rocksdb_readoptions_set_verify_checksums(rocksdb_readoptions_t* opt, + unsigned char v) { opt->rep.verify_checksums = v; } @@ -4375,8 +4301,8 @@ unsigned char rocksdb_readoptions_get_verify_checksums( return opt->rep.verify_checksums; } -void rocksdb_readoptions_set_fill_cache( - rocksdb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_fill_cache(rocksdb_readoptions_t* opt, + unsigned char v) { opt->rep.fill_cache = v; } @@ -4384,15 +4310,14 @@ unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) { return opt->rep.fill_cache; } -void rocksdb_readoptions_set_snapshot( - rocksdb_readoptions_t* opt, - const rocksdb_snapshot_t* snap) { +void rocksdb_readoptions_set_snapshot(rocksdb_readoptions_t* opt, + const rocksdb_snapshot_t* snap) { opt->rep.snapshot = (snap ? snap->rep : nullptr); } -void rocksdb_readoptions_set_iterate_upper_bound( - rocksdb_readoptions_t* opt, - const char* key, size_t keylen) { +void rocksdb_readoptions_set_iterate_upper_bound(rocksdb_readoptions_t* opt, + const char* key, + size_t keylen) { if (key == nullptr) { opt->upper_bound = Slice(); opt->rep.iterate_upper_bound = nullptr; @@ -4403,9 +4328,9 @@ void rocksdb_readoptions_set_iterate_upper_bound( } } -void rocksdb_readoptions_set_iterate_lower_bound( - rocksdb_readoptions_t *opt, - const char* key, size_t keylen) { +void rocksdb_readoptions_set_iterate_lower_bound(rocksdb_readoptions_t* opt, + const char* key, + size_t keylen) { if (key == nullptr) { opt->lower_bound = Slice(); opt->rep.iterate_lower_bound = nullptr; @@ -4415,8 +4340,7 @@ void rocksdb_readoptions_set_iterate_lower_bound( } } -void rocksdb_readoptions_set_read_tier( - rocksdb_readoptions_t* opt, int v) { +void rocksdb_readoptions_set_read_tier(rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); } @@ -4424,8 +4348,8 @@ int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) { return static_cast(opt->rep.read_tier); } -void rocksdb_readoptions_set_tailing( - rocksdb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_tailing(rocksdb_readoptions_t* opt, + unsigned char v) { opt->rep.tailing = v; } @@ -4433,13 +4357,13 @@ unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) { return opt->rep.tailing; } -void rocksdb_readoptions_set_managed( - rocksdb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt, + unsigned char v) { opt->rep.managed = v; } -void rocksdb_readoptions_set_readahead_size( - rocksdb_readoptions_t* opt, size_t v) { +void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt, + size_t v) { opt->rep.readahead_size = v; } @@ -4447,8 +4371,8 @@ size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) { return opt->rep.readahead_size; } -void rocksdb_readoptions_set_prefix_same_as_start( - rocksdb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_prefix_same_as_start(rocksdb_readoptions_t* opt, + unsigned char v) { opt->rep.prefix_same_as_start = v; } @@ -4477,8 +4401,7 @@ unsigned char rocksdb_readoptions_get_total_order_seek( } void rocksdb_readoptions_set_max_skippable_internal_keys( - rocksdb_readoptions_t* opt, - uint64_t v) { + rocksdb_readoptions_t* opt, uint64_t v) { opt->rep.max_skippable_internal_keys = v; } @@ -4497,8 +4420,8 @@ unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup( return opt->rep.background_purge_on_iterator_cleanup; } -void 
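/*
 * Usage sketch for the iterator-bound setters above, mirroring the bound
 * test near the end of db/c_test.c; the bound is exclusive, and the key
 * buffer should stay alive while the iterator is in use since the shim
 * stores a slice referring to it.
 *
 *   rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
 *   rocksdb_iterator_t* it = rocksdb_create_iterator(db, roptions);
 *   rocksdb_iter_seek(it, "foo", 3);
 *   // forward iteration stops before "foo2"
 *   rocksdb_iter_destroy(it);
 */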
rocksdb_readoptions_set_ignore_range_deletions( - rocksdb_readoptions_t* opt, unsigned char v) { +void rocksdb_readoptions_set_ignore_range_deletions(rocksdb_readoptions_t* opt, + unsigned char v) { opt->rep.ignore_range_deletions = v; } @@ -4526,6 +4449,15 @@ rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { return opt->rep.io_timeout.count(); } +void rocksdb_readoptions_set_async_io(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.async_io = v; +} + +unsigned char rocksdb_readoptions_get_async_io(rocksdb_readoptions_t* opt) { + return opt->rep.async_io; +} + void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt, const char* ts, size_t tslen) { if (ts == nullptr) { @@ -4552,12 +4484,10 @@ rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } -void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { - delete opt; -} +void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; } -void rocksdb_writeoptions_set_sync( - rocksdb_writeoptions_t* opt, unsigned char v) { +void rocksdb_writeoptions_set_sync(rocksdb_writeoptions_t* opt, + unsigned char v) { opt->rep.sync = v; } @@ -4624,7 +4554,8 @@ void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) { void rocksdb_compactoptions_set_bottommost_level_compaction( rocksdb_compactoptions_t* opt, unsigned char v) { - opt->rep.bottommost_level_compaction = static_cast(v); + opt->rep.bottommost_level_compaction = + static_cast(v); } unsigned char rocksdb_compactoptions_get_bottommost_level_compaction( @@ -4676,12 +4607,10 @@ rocksdb_flushoptions_t* rocksdb_flushoptions_create() { return new rocksdb_flushoptions_t; } -void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { - delete opt; -} +void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { delete opt; } -void rocksdb_flushoptions_set_wait( - rocksdb_flushoptions_t* opt, unsigned char v) { +void rocksdb_flushoptions_set_wait(rocksdb_flushoptions_t* opt, + unsigned char v) { opt->rep.wait = v; } @@ -4746,9 +4675,7 @@ rocksdb_cache_t* rocksdb_cache_create_lru_opts( return c; } -void rocksdb_cache_destroy(rocksdb_cache_t* cache) { - delete cache; -} +void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { cache->rep->DisownData(); @@ -4770,16 +4697,15 @@ size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) { return cache->rep->GetPinnedUsage(); } -rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) { +rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, + uint64_t target_size) { rocksdb_dbpath_t* result = new rocksdb_dbpath_t; result->rep.path = std::string(path); result->rep.target_size = target_size; return result; } -void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { - delete dbpath; -} +void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { delete dbpath; } rocksdb_env_t* rocksdb_create_default_env() { rocksdb_env_t* result = new rocksdb_env_t; @@ -4812,7 +4738,8 @@ int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { return env->rep->GetBackgroundThreads(Env::BOTTOM); } -void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { +void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, + int n) { env->rep->SetBackgroundThreads(n, Env::HIGH); } @@ -4837,7 +4764,8 @@ void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) { env->rep->LowerThreadPoolIOPriority(); } -void 
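/*
 * The async_io getter/setter pair added above is new in this diff and maps
 * straight onto ReadOptions::async_io. Minimal sketch:
 *
 *   rocksdb_readoptions_t* ro = rocksdb_readoptions_create();
 *   rocksdb_readoptions_set_async_io(ro, 1);
 *   // scans via rocksdb_create_iterator(db, ro) may now prefetch
 *   // asynchronously where the underlying Env/FS supports it
 *   rocksdb_readoptions_destroy(ro);
 */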
rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env) { +void rocksdb_env_lower_high_priority_thread_pool_io_priority( + rocksdb_env_t* env) { env->rep->LowerThreadPoolIOPriority(Env::HIGH); } @@ -4845,7 +4773,8 @@ void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) { env->rep->LowerThreadPoolCPUPriority(); } -void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env) { +void rocksdb_env_lower_high_priority_thread_pool_cpu_priority( + rocksdb_env_t* env) { env->rep->LowerThreadPoolCPUPriority(Env::HIGH); } @@ -5017,18 +4946,11 @@ void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) { } rocksdb_slicetransform_t* rocksdb_slicetransform_create( - void* state, - void (*destructor)(void*), - char* (*transform)( - void*, - const char* key, size_t length, - size_t* dst_length), - unsigned char (*in_domain)( - void*, - const char* key, size_t length), - unsigned char (*in_range)( - void*, - const char* key, size_t length), + void* state, void (*destructor)(void*), + char* (*transform)(void*, const char* key, size_t length, + size_t* dst_length), + unsigned char (*in_domain)(void*, const char* key, size_t length), + unsigned char (*in_range)(void*, const char* key, size_t length), const char* (*name)(void*)) { rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t; result->state_ = state; @@ -5040,9 +4962,7 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create( return result; } -void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { - delete st; -} +void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; } struct SliceTransformWrapper : public rocksdb_slicetransform_t { const SliceTransform* rep_; @@ -5052,14 +4972,13 @@ struct SliceTransformWrapper : public rocksdb_slicetransform_t { Slice Transform(const Slice& src) const override { return rep_->Transform(src); } - bool InDomain(const Slice& src) const override { - return rep_->InDomain(src); - } + bool InDomain(const Slice& src) const override { return rep_->InDomain(src); } bool InRange(const Slice& src) const override { return rep_->InRange(src); } - static void DoNothing(void*) { } + static void DoNothing(void*) {} }; -rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) { +rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix( + size_t prefixLen) { SliceTransformWrapper* wrapper = new SliceTransformWrapper; wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen); wrapper->state_ = nullptr; @@ -5075,14 +4994,16 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() { return wrapper; } -rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() { - rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t; +rocksdb_universal_compaction_options_t* +rocksdb_universal_compaction_options_create() { + rocksdb_universal_compaction_options_t* result = + new rocksdb_universal_compaction_options_t; result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal; return result; } void rocksdb_universal_compaction_options_set_size_ratio( - rocksdb_universal_compaction_options_t* uco, int ratio) { + rocksdb_universal_compaction_options_t* uco, int ratio) { uco->rep->size_ratio = ratio; } @@ -5092,7 +5013,7 @@ int rocksdb_universal_compaction_options_get_size_ratio( } void rocksdb_universal_compaction_options_set_min_merge_width( - rocksdb_universal_compaction_options_t* uco, int w) { + 
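/*
 * Note on the slice-transform helpers above: rocksdb_options_set_
 * prefix_extractor() resets options.prefix_extractor with the raw pointer,
 * i.e. the options object takes ownership, so the caller must not destroy
 * the handle afterwards. Sketch:
 *
 *   rocksdb_slicetransform_t* st =
 *       rocksdb_slicetransform_create_fixed_prefix(3);
 *   rocksdb_options_set_prefix_extractor(opt, st);  // opt owns st now
 */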
rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->min_merge_width = w; } @@ -5102,7 +5023,7 @@ int rocksdb_universal_compaction_options_get_min_merge_width( } void rocksdb_universal_compaction_options_set_max_merge_width( - rocksdb_universal_compaction_options_t* uco, int w) { + rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->max_merge_width = w; } @@ -5112,7 +5033,7 @@ int rocksdb_universal_compaction_options_get_max_merge_width( } void rocksdb_universal_compaction_options_set_max_size_amplification_percent( - rocksdb_universal_compaction_options_t* uco, int p) { + rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->max_size_amplification_percent = p; } @@ -5122,7 +5043,7 @@ int rocksdb_universal_compaction_options_get_max_size_amplification_percent( } void rocksdb_universal_compaction_options_set_compression_size_percent( - rocksdb_universal_compaction_options_t* uco, int p) { + rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->compression_size_percent = p; } @@ -5132,7 +5053,7 @@ int rocksdb_universal_compaction_options_get_compression_size_percent( } void rocksdb_universal_compaction_options_set_stop_style( - rocksdb_universal_compaction_options_t* uco, int style) { + rocksdb_universal_compaction_options_t* uco, int style) { uco->rep->stop_style = static_cast(style); } @@ -5143,14 +5064,15 @@ int rocksdb_universal_compaction_options_get_stop_style( } void rocksdb_universal_compaction_options_destroy( - rocksdb_universal_compaction_options_t* uco) { + rocksdb_universal_compaction_options_t* uco) { delete uco->rep; delete uco; } rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() { - rocksdb_fifo_compaction_options_t* result = new rocksdb_fifo_compaction_options_t; - result->rep = CompactionOptionsFIFO(); + rocksdb_fifo_compaction_options_t* result = + new rocksdb_fifo_compaction_options_t; + result->rep = CompactionOptionsFIFO(); return result; } @@ -5169,7 +5091,8 @@ void rocksdb_fifo_compaction_options_destroy( delete fifo_opts; } -void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level) { +void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, + int level) { if (level >= 0) { assert(level <= opt->rep.num_levels); opt->rep.compression_per_level.resize(opt->rep.num_levels); @@ -5182,8 +5105,7 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level } } -int rocksdb_livefiles_count( - const rocksdb_livefiles_t* lf) { +int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) { return static_cast(lf->rep.size()); } @@ -5192,54 +5114,39 @@ const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf, return lf->rep[index].column_family_name.c_str(); } -const char* rocksdb_livefiles_name( - const rocksdb_livefiles_t* lf, - int index) { +const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].name.c_str(); } -int rocksdb_livefiles_level( - const rocksdb_livefiles_t* lf, - int index) { +int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].level; } -size_t rocksdb_livefiles_size( - const rocksdb_livefiles_t* lf, - int index) { +size_t rocksdb_livefiles_size(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].size; } -const char* rocksdb_livefiles_smallestkey( - const rocksdb_livefiles_t* lf, - int index, - size_t* size) { +const char* rocksdb_livefiles_smallestkey(const rocksdb_livefiles_t* lf, + int index, size_t* size) { *size = 
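/*
 * Sketch combining the universal-compaction setters above; the set call
 * copies *uco->rep, so destroying the handle afterwards is safe. Values
 * are illustrative.
 *
 *   rocksdb_universal_compaction_options_t* uco =
 *       rocksdb_universal_compaction_options_create();
 *   rocksdb_universal_compaction_options_set_size_ratio(uco, 1);
 *   rocksdb_universal_compaction_options_set_max_merge_width(uco, 10);
 *   rocksdb_options_set_compaction_style(opt, rocksdb_universal_compaction);
 *   rocksdb_options_set_universal_compaction_options(opt, uco);
 *   rocksdb_universal_compaction_options_destroy(uco);
 */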
lf->rep[index].smallestkey.size(); return lf->rep[index].smallestkey.data(); } -const char* rocksdb_livefiles_largestkey( - const rocksdb_livefiles_t* lf, - int index, - size_t* size) { +const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf, + int index, size_t* size) { *size = lf->rep[index].largestkey.size(); return lf->rep[index].largestkey.data(); } -uint64_t rocksdb_livefiles_entries( - const rocksdb_livefiles_t* lf, - int index) { +uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].num_entries; } -uint64_t rocksdb_livefiles_deletions( - const rocksdb_livefiles_t* lf, - int index) { +uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) { return lf->rep[index].num_deletions; } -extern void rocksdb_livefiles_destroy( - const rocksdb_livefiles_t* lf) { +extern void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { delete lf; } @@ -5394,7 +5301,8 @@ rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() { return new rocksdb_transactiondb_options_t; } -void rocksdb_transactiondb_options_destroy(rocksdb_transactiondb_options_t* opt){ +void rocksdb_transactiondb_options_destroy( + rocksdb_transactiondb_options_t* opt) { delete opt; } @@ -5600,7 +5508,7 @@ rocksdb_transaction_t* rocksdb_transaction_begin( return result; } old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep, - txn_options->rep, old_txn->rep); + txn_options->rep, old_txn->rep); return old_txn; } @@ -5672,7 +5580,8 @@ void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) { txn->rep->SetSavePoint(); } -void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, char** errptr) { +void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, + char** errptr) { SaveError(errptr, txn->rep->RollbackToSavePoint()); } @@ -5904,12 +5813,10 @@ void rocksdb_transaction_multi_get_cf( } // Read a key outside a transaction -char* rocksdb_transactiondb_get( - rocksdb_transactiondb_t* txn_db, - const rocksdb_readoptions_t* options, - const char* key, size_t klen, - size_t* vlen, - char** errptr){ +char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, + const char* key, size_t klen, size_t* vlen, + char** errptr) { char* result = nullptr; std::string tmp; Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp); @@ -6086,11 +5993,9 @@ void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db, } // Write batch into transaction db -void rocksdb_transactiondb_write( - rocksdb_transactiondb_t* db, - const rocksdb_writeoptions_t* options, - rocksdb_writebatch_t* batch, - char** errptr) { +void rocksdb_transactiondb_write(rocksdb_transactiondb_t* db, + const rocksdb_writeoptions_t* options, + rocksdb_writebatch_t* batch, char** errptr) { SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); } @@ -6411,7 +6316,6 @@ struct rocksdb_memory_usage_t { // estimates amount of memory occupied by consumers (dbs and caches) rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( rocksdb_memory_consumers_t* consumers, char** errptr) { - vector dbs; for (auto db : consumers->dbs) { dbs.push_back(db->rep); @@ -6433,7 +6337,8 @@ rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( auto result = new rocksdb_memory_usage_t; result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal]; result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed]; - result->mem_table_readers_total = 
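/*
 * Usage sketch for the memory-usage plumbing above, following the pattern
 * in db/c_test.c: register consumers, snapshot, read counters, destroy.
 *
 *   rocksdb_memory_consumers_t* mc = rocksdb_memory_consumers_create();
 *   rocksdb_memory_consumers_add_db(mc, db);
 *   rocksdb_memory_consumers_add_cache(mc, cache);
 *   char* err = NULL;
 *   rocksdb_memory_usage_t* mu =
 *       rocksdb_approximate_memory_usage_create(mc, &err);
 *   uint64_t mt = rocksdb_approximate_memory_usage_get_mem_table_total(mu);
 *   rocksdb_approximate_memory_usage_destroy(mu);
 *   rocksdb_memory_consumers_destroy(mc);
 */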
usage_by_type[MemoryUtil::kTableReadersTotal]; + result->mem_table_readers_total = + usage_by_type[MemoryUtil::kTableReadersTotal]; result->cache_total = usage_by_type[MemoryUtil::kCacheTotal]; return result; } diff --git a/db/c_test.c b/db/c_test.c index 12d6fd14313..b6877d46a99 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -49,32 +49,32 @@ static void StartPhase(const char* name) { } #ifdef _MSC_VER #pragma warning(push) -#pragma warning (disable: 4996) // getenv security warning +#pragma warning(disable : 4996) // getenv security warning #endif static const char* GetTempDir(void) { - const char* ret = getenv("TEST_TMPDIR"); - if (ret == NULL || ret[0] == '\0') + const char* ret = getenv("TEST_TMPDIR"); + if (ret == NULL || ret[0] == '\0') #ifdef OS_WIN - ret = getenv("TEMP"); + ret = getenv("TEMP"); #else - ret = "/tmp"; + ret = "/tmp"; #endif - return ret; + return ret; } #ifdef _MSC_VER #pragma warning(pop) #endif -#define CheckNoError(err) \ - if ((err) != NULL) { \ +#define CheckNoError(err) \ + if ((err) != NULL) { \ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \ - abort(); \ + abort(); \ } -#define CheckCondition(cond) \ - if (!(cond)) { \ +#define CheckCondition(cond) \ + if (!(cond)) { \ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \ - abort(); \ + abort(); \ } static void CheckEqual(const char* expected, const char* v, size_t n) { @@ -98,21 +98,15 @@ static void Free(char** ptr) { } } -static void CheckValue( - char* err, - const char* expected, - char** actual, - size_t actual_length) { +static void CheckValue(char* err, const char* expected, char** actual, + size_t actual_length) { CheckNoError(err); CheckEqual(expected, *actual, actual_length); Free(actual); } -static void CheckGet( - rocksdb_t* db, - const rocksdb_readoptions_t* options, - const char* key, - const char* expected) { +static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options, + const char* key, const char* expected) { char* err = NULL; size_t val_len; char* val; @@ -122,12 +116,9 @@ static void CheckGet( Free(&val); } -static void CheckGetCF( - rocksdb_t* db, - const rocksdb_readoptions_t* options, - rocksdb_column_family_handle_t* handle, - const char* key, - const char* expected) { +static void CheckGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* handle, const char* key, + const char* expected) { char* err = NULL; size_t val_len; char* val; @@ -174,8 +165,8 @@ static void CheckMultiGetValues(size_t num_keys, char** values, } } -static void CheckIter(rocksdb_iterator_t* iter, - const char* key, const char* val) { +static void CheckIter(rocksdb_iterator_t* iter, const char* key, + const char* val) { size_t len; const char* str; str = rocksdb_iter_key(iter, &len); @@ -185,10 +176,9 @@ static void CheckIter(rocksdb_iterator_t* iter, } // Callback from rocksdb_writebatch_iterate() -static void CheckPut(void* ptr, - const char* k, size_t klen, - const char* v, size_t vlen) { - int* state = (int*) ptr; +static void CheckPut(void* ptr, const char* k, size_t klen, const char* v, + size_t vlen) { + int* state = (int*)ptr; CheckCondition(*state < 2); switch (*state) { case 0: @@ -205,7 +195,7 @@ static void CheckPut(void* ptr, // Callback from rocksdb_writebatch_iterate() static void CheckDel(void* ptr, const char* k, size_t klen) { - int* state = (int*) ptr; + int* state = (int*)ptr; CheckCondition(*state == 2); CheckEqual("bar", k, klen); (*state)++; @@ -213,14 +203,16 @@ static void CheckDel(void* ptr, 
const char* k, size_t klen) { static void CmpDestroy(void* arg) { (void)arg; } -static int CmpCompare(void* arg, const char* a, size_t alen, - const char* b, size_t blen) { +static int CmpCompare(void* arg, const char* a, size_t alen, const char* b, + size_t blen) { (void)arg; size_t n = (alen < blen) ? alen : blen; int r = memcmp(a, b, n); if (r == 0) { - if (alen < blen) r = -1; - else if (alen > blen) r = +1; + if (alen < blen) + r = -1; + else if (alen > blen) + r = +1; } return r; } @@ -405,11 +397,9 @@ static const char* MergeOperatorName(void* arg) { return "TestMergeOperator"; } static char* MergeOperatorFullMerge( - void* arg, - const char* key, size_t key_length, - const char* existing_value, size_t existing_value_length, - const char* const* operands_list, const size_t* operands_list_length, - int num_operands, + void* arg, const char* key, size_t key_length, const char* existing_value, + size_t existing_value_length, const char* const* operands_list, + const size_t* operands_list_length, int num_operands, unsigned char* success, size_t* new_value_length) { (void)arg; (void)key; @@ -425,12 +415,12 @@ static char* MergeOperatorFullMerge( memcpy(result, "fake", 4); return result; } -static char* MergeOperatorPartialMerge( - void* arg, - const char* key, size_t key_length, - const char* const* operands_list, const size_t* operands_list_length, - int num_operands, - unsigned char* success, size_t* new_value_length) { +static char* MergeOperatorPartialMerge(void* arg, const char* key, + size_t key_length, + const char* const* operands_list, + const size_t* operands_list_length, + int num_operands, unsigned char* success, + size_t* new_value_length) { (void)arg; (void)key; (void)key_length; @@ -444,18 +434,16 @@ static char* MergeOperatorPartialMerge( return result; } -static void CheckTxnGet( - rocksdb_transaction_t* txn, - const rocksdb_readoptions_t* options, - const char* key, - const char* expected) { - char* err = NULL; - size_t val_len; - char* val; - val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err); - CheckNoError(err); - CheckEqual(expected, val, val_len); - Free(&val); +static void CheckTxnGet(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); } static void CheckTxnGetCF(rocksdb_transaction_t* txn, @@ -502,11 +490,9 @@ static void CheckTxnPinGetCF(rocksdb_transaction_t* txn, rocksdb_pinnableslice_destroy(p); } -static void CheckTxnDBGet( - rocksdb_transactiondb_t* txn_db, - const rocksdb_readoptions_t* options, - const char* key, - const char* expected) { +static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db, + const rocksdb_readoptions_t* options, const char* key, + const char* expected) { char* err = NULL; size_t val_len; char* val; @@ -632,7 +618,7 @@ int main(int argc, char** argv) { rocksdb_t* db; rocksdb_comparator_t* cmp; rocksdb_cache_t* cache; - rocksdb_dbpath_t *dbpath; + rocksdb_dbpath_t* dbpath; rocksdb_env_t* env; rocksdb_options_t* options; rocksdb_compactoptions_t* coptions; @@ -649,30 +635,20 @@ int main(int argc, char** argv) { char* err = NULL; int run = -1; - snprintf(dbname, sizeof(dbname), - "%s/rocksdb_c_test-%d", - GetTempDir(), - ((int) geteuid())); + snprintf(dbname, sizeof(dbname), "%s/rocksdb_c_test-%d", GetTempDir(), + ((int)geteuid())); - 
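/*
 * The merge-operator callbacks above plug into the C shim like this; the
 * sketch reuses the no-op CmpDestroy defined earlier as the destructor
 * hook, and assumes a NULL delete_value callback is acceptable.
 *
 *   rocksdb_mergeoperator_t* mo = rocksdb_mergeoperator_create(
 *       NULL, CmpDestroy, MergeOperatorFullMerge,
 *       MergeOperatorPartialMerge, NULL, MergeOperatorName);
 *   rocksdb_options_set_merge_operator(options, mo);
 */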
snprintf(dbbackupname, sizeof(dbbackupname), - "%s/rocksdb_c_test-%d-backup", - GetTempDir(), - ((int) geteuid())); + snprintf(dbbackupname, sizeof(dbbackupname), "%s/rocksdb_c_test-%d-backup", + GetTempDir(), ((int)geteuid())); snprintf(dbcheckpointname, sizeof(dbcheckpointname), - "%s/rocksdb_c_test-%d-checkpoint", - GetTempDir(), - ((int) geteuid())); + "%s/rocksdb_c_test-%d-checkpoint", GetTempDir(), ((int)geteuid())); - snprintf(sstfilename, sizeof(sstfilename), - "%s/rocksdb_c_test-%d-sst", - GetTempDir(), - ((int)geteuid())); + snprintf(sstfilename, sizeof(sstfilename), "%s/rocksdb_c_test-%d-sst", + GetTempDir(), ((int)geteuid())); - snprintf(dbpathname, sizeof(dbpathname), - "%s/rocksdb_c_test-%d-dbpath", - GetTempDir(), - ((int) geteuid())); + snprintf(dbpathname, sizeof(dbpathname), "%s/rocksdb_c_test-%d-dbpath", + GetTempDir(), ((int)geteuid())); StartPhase("create_objects"); cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); @@ -746,7 +722,8 @@ int main(int argc, char** argv) { rocksdb_destroy_db(options, dbbackupname, &err); CheckNoError(err); - rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err); + rocksdb_backup_engine_t* be = + rocksdb_backup_engine_open(options, dbbackupname, &err); CheckNoError(err); rocksdb_backup_engine_create_new_backup(be, db, &err); @@ -759,7 +736,8 @@ int main(int argc, char** argv) { rocksdb_backup_engine_create_new_backup(be, db, &err); CheckNoError(err); - const rocksdb_backup_engine_info_t* bei = rocksdb_backup_engine_get_backup_info(be); + const rocksdb_backup_engine_info_t* bei = + rocksdb_backup_engine_get_backup_info(be); CheckCondition(rocksdb_backup_engine_info_count(bei) > 1); rocksdb_backup_engine_info_destroy(bei); @@ -778,9 +756,11 @@ int main(int argc, char** argv) { rocksdb_destroy_db(options, dbname, &err); CheckNoError(err); - rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); + rocksdb_restore_options_t* restore_options = + rocksdb_restore_options_create(); rocksdb_restore_options_set_keep_log_files(restore_options, 0); - rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err); + rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, + restore_options, &err); CheckNoError(err); rocksdb_restore_options_destroy(restore_options); @@ -799,7 +779,8 @@ int main(int argc, char** argv) { rocksdb_destroy_db(options, dbcheckpointname, &err); CheckNoError(err); - rocksdb_checkpoint_t* checkpoint = rocksdb_checkpoint_object_create(db, &err); + rocksdb_checkpoint_t* checkpoint = + rocksdb_checkpoint_object_create(db, &err); CheckNoError(err); rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err); @@ -976,10 +957,10 @@ int main(int argc, char** argv) { StartPhase("writebatch_vectors"); { rocksdb_writebatch_t* wb = rocksdb_writebatch_create(); - const char* k_list[2] = { "z", "ap" }; - const size_t k_sizes[2] = { 1, 2 }; - const char* v_list[3] = { "x", "y", "z" }; - const size_t v_sizes[3] = { 1, 1, 1 }; + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); rocksdb_write(db, woptions, wb, &err); CheckNoError(err); @@ -1041,13 +1022,17 @@ int main(int argc, char** argv) { CheckCondition(count == 3); size_t size; char* value; - value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, &err); + value = 
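/*
 * Reminder implicit in the checks below: every value returned by
 * rocksdb_get() and the rocksdb_writebatch_wi_get_from_batch* calls is
 * malloc'd by the C shim (CopyString), so it must be released with free(),
 * which is what Free(&val) does in this test. Minimal sketch:
 *
 *   size_t len;
 *   char* err = NULL;
 *   char* val = rocksdb_get(db, roptions, "foo", 3, &len, &err);
 *   // val == NULL with err == NULL means the key was not found
 *   free(val);
 */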
rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size, + &err); CheckValue(err, "c", &value, size); - value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size, &err); + value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size, + &err); CheckValue(err, NULL, &value, size); - value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "foo", 3, &size, &err); + value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, + "foo", 3, &size, &err); CheckValue(err, "hello", &value, size); - value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, "box", 3, &size, &err); + value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions, + "box", 3, &size, &err); CheckValue(err, "c", &value, size); rocksdb_write_writebatch_wi(db, woptions, wbi, &err); CheckNoError(err); @@ -1064,10 +1049,10 @@ int main(int argc, char** argv) { StartPhase("writebatch_wi_vectors"); { rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1); - const char* k_list[2] = { "z", "ap" }; - const size_t k_sizes[2] = { 1, 2 }; - const char* v_list[3] = { "x", "y", "z" }; - const size_t v_sizes[3] = { 1, 1, 1 }; + const char* k_list[2] = {"z", "ap"}; + const size_t k_sizes[2] = {1, 2}; + const char* v_list[3] = {"x", "y", "z"}; + const size_t v_sizes[3] = {1, 1, 1}; rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes); rocksdb_write_writebatch_wi(db, woptions, wb, &err); CheckNoError(err); @@ -1156,13 +1141,14 @@ int main(int argc, char** argv) { StartPhase("multiget"); { - const char* keys[3] = { "box", "foo", "notfound" }; - const size_t keys_sizes[3] = { 3, 3, 8 }; + const char* keys[3] = {"box", "foo", "notfound"}; + const size_t keys_sizes[3] = {3, 3, 8}; char* vals[3]; size_t vals_sizes[3]; char* errs[3]; const char* expected[3] = {"c", "hello", NULL}; - rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, errs); + rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, + errs); CheckMultiGetValues(3, vals, vals_sizes, errs, expected); } @@ -1180,10 +1166,10 @@ int main(int argc, char** argv) { char keybuf[100]; char valbuf[100]; uint64_t sizes[2]; - const char* start[2] = { "a", "k00000000000000010000" }; - size_t start_len[2] = { 1, 21 }; - const char* limit[2] = { "k00000000000000010000", "z" }; - size_t limit_len[2] = { 21, 1 }; + const char* start[2] = {"a", "k00000000000000010000"}; + size_t start_len[2] = {1, 21}; + const char* limit[2] = {"k00000000000000010000", "z"}; + size_t limit_len[2] = {21, 1}; rocksdb_writeoptions_set_sync(woptions, 0); for (i = 0; i < n; i++) { snprintf(keybuf, sizeof(keybuf), "k%020d", i); @@ -1393,8 +1379,8 @@ int main(int argc, char** argv) { factory); db = CheckCompaction(db, options_with_filter_factory, roptions, woptions); - rocksdb_options_set_compaction_filter_factory( - options_with_filter_factory, NULL); + rocksdb_options_set_compaction_filter_factory(options_with_filter_factory, + NULL); rocksdb_options_destroy(options_with_filter_factory); } @@ -1449,7 +1435,8 @@ int main(int argc, char** argv) { rocksdb_close(db); size_t cflen; - char** column_fams = rocksdb_list_column_families(db_options, dbname, &cflen, &err); + char** column_fams = + rocksdb_list_column_families(db_options, dbname, &cflen, &err); CheckNoError(err); CheckEqual("default", column_fams[0], 7); CheckEqual("cf1", column_fams[1], 3); @@ -1465,7 +1452,8 @@ int main(int argc, char** argv) { LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2, cf_names, 
NULL); - db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, handles, &err); + db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts, + handles, &err); CheckNoError(err); rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err); @@ -1483,11 +1471,10 @@ int main(int argc, char** argv) { &err); CheckNoError(err); - rocksdb_flushoptions_t *flush_options = rocksdb_flushoptions_create(); + rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); rocksdb_flushoptions_set_wait(flush_options, 1); rocksdb_flush_cf(db, flush_options, handles[1], &err); - CheckNoError(err) - rocksdb_flushoptions_destroy(flush_options); + CheckNoError(err) rocksdb_flushoptions_destroy(flush_options); CheckGetCF(db, roptions, handles[1], "foo", "hello"); CheckPinGetCF(db, roptions, handles[1], "foo", "hello"); @@ -1524,27 +1511,29 @@ int main(int argc, char** argv) { rocksdb_flush_wal(db, 1, &err); CheckNoError(err); - const char* keys[3] = { "box", "box", "barfooxx" }; - const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] }; - const size_t keys_sizes[3] = { 3, 3, 8 }; + const char* keys[3] = {"box", "box", "barfooxx"}; + const rocksdb_column_family_handle_t* get_handles[3] = { + handles[0], handles[1], handles[1]}; + const size_t keys_sizes[3] = {3, 3, 8}; char* vals[3]; size_t vals_sizes[3]; char* errs[3]; - rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, vals_sizes, errs); + rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, + vals_sizes, errs); int i; for (i = 0; i < 3; i++) { CheckEqual(NULL, errs[i], 0); switch (i) { - case 0: - CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf - break; - case 1: - CheckEqual("c", vals[i], vals_sizes[i]); // bingo - break; - case 2: - CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found - break; + case 0: + CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf + break; + case 1: + CheckEqual("c", vals[i], vals_sizes[i]); // bingo + break; + case 2: + CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found + break; } Free(&vals[i]); } @@ -1592,7 +1581,8 @@ int main(int argc, char** argv) { } } - rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]); + rocksdb_iterator_t* iter = + rocksdb_create_iterator_cf(db, roptions, handles[1]); CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_seek_to_first(iter); CheckCondition(rocksdb_iter_valid(iter)); @@ -1605,9 +1595,11 @@ int main(int argc, char** argv) { CheckNoError(err); rocksdb_iter_destroy(iter); - rocksdb_column_family_handle_t* iters_cf_handles[2] = { handles[0], handles[1] }; + rocksdb_column_family_handle_t* iters_cf_handles[2] = {handles[0], + handles[1]}; rocksdb_iterator_t* iters_handles[2]; - rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, &err); + rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2, + &err); CheckNoError(err); iter = iters_handles[0]; @@ -1652,7 +1644,8 @@ int main(int argc, char** argv) { { // Create new database rocksdb_options_set_allow_mmap_reads(options, 1); - rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); + rocksdb_options_set_prefix_extractor( + options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); rocksdb_options_set_allow_concurrent_memtable_write(options, 0); @@ 
-1747,8 +1740,9 @@ int main(int argc, char** argv) { // amount of memory used within memtables should grow CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >= rocksdb_approximate_memory_usage_get_mem_table_total(mu1)); - CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= - rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); + CheckCondition( + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); rocksdb_memory_consumers_destroy(consumers); rocksdb_approximate_memory_usage_destroy(mu1); @@ -2578,6 +2572,9 @@ int main(int argc, char** argv) { rocksdb_readoptions_set_io_timeout(ro, 400); CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro)); + rocksdb_readoptions_set_async_io(ro, 1); + CheckCondition(1 == rocksdb_readoptions_get_async_io(ro)); + rocksdb_readoptions_destroy(ro); } @@ -2839,53 +2836,57 @@ int main(int argc, char** argv) { db = rocksdb_open(options, dbname, &err); CheckNoError(err); - rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err); - rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err); - rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err); - rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err); + rocksdb_put(db, woptions, "a", 1, "0", 1, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); + CheckNoError(err); + rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); + CheckNoError(err); // testing basic case with no iterate_upper_bound and no prefix_extractor { - rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); - rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); - rocksdb_iter_seek(iter, "foo", 3); - CheckCondition(rocksdb_iter_valid(iter)); - CheckIter(iter, "foo", "bar"); + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); - rocksdb_iter_next(iter); - CheckCondition(rocksdb_iter_valid(iter)); - CheckIter(iter, "foo1", "bar1"); + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); - rocksdb_iter_next(iter); - CheckCondition(rocksdb_iter_valid(iter)); - CheckIter(iter, "g1", "0"); + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "g1", "0"); - rocksdb_iter_destroy(iter); + rocksdb_iter_destroy(iter); } // testing iterate_upper_bound and forward iterator // to make sure it stops at bound { - // iterate_upper_bound points beyond the last expected entry - rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4); + // iterate_upper_bound points beyond the last expected entry + rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4); - rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); - rocksdb_iter_seek(iter, "foo", 3); - CheckCondition(rocksdb_iter_valid(iter)); - CheckIter(iter, "foo", "bar"); + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); - rocksdb_iter_next(iter); - CheckCondition(rocksdb_iter_valid(iter)); - CheckIter(iter, "foo1", "bar1"); + rocksdb_iter_next(iter); + 
CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); - rocksdb_iter_next(iter); - // should stop here... - CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_next(iter); + // should stop here... + CheckCondition(!rocksdb_iter_valid(iter)); - rocksdb_iter_destroy(iter); - rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + rocksdb_iter_destroy(iter); + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); } } @@ -3009,7 +3010,7 @@ int main(int argc, char** argv) { snapshot = rocksdb_transactiondb_create_snapshot(txn_db); rocksdb_readoptions_set_snapshot(roptions, snapshot); - rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err); + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err); CheckNoError(err); CheckTxnDBGet(txn_db, roptions, "foo", "hello"); @@ -3021,7 +3022,8 @@ int main(int argc, char** argv) { // iterate rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err); - rocksdb_iterator_t* iter = rocksdb_transaction_create_iterator(txn, roptions); + rocksdb_iterator_t* iter = + rocksdb_transaction_create_iterator(txn, roptions); CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_seek_to_first(iter); CheckCondition(rocksdb_iter_valid(iter)); diff --git a/db/column_family.cc b/db/column_family.cc index be085a64295..fe143e9d2db 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -192,8 +192,7 @@ Status CheckCFPathsSupported(const DBOptions& db_options, return Status::NotSupported( "More than one CF paths are only supported in " "universal and level compaction styles. "); - } else if (cf_options.cf_paths.empty() && - db_options.db_paths.size() > 1) { + } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) { return Status::NotSupported( "More than one DB paths are only supported in " "universal and level compaction styles. "); @@ -205,7 +204,7 @@ Status CheckCFPathsSupported(const DBOptions& db_options, namespace { const uint64_t kDefaultTtl = 0xfffffffffffffffe; const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; -} // namespace +} // anonymous namespace ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { @@ -353,7 +352,8 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, // were not deleted yet, when we open the DB we will find these .trash files // and schedule them to be deleted (or delete immediately if SstFileManager // was not used) - auto sfm = static_cast(db_options.sst_file_manager.get()); + auto sfm = + static_cast(db_options.sst_file_manager.get()); for (size_t i = 0; i < result.cf_paths.size(); i++) { DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path) @@ -557,7 +557,6 @@ ColumnFamilyData::ColumnFamilyData( next_(nullptr), prev_(nullptr), log_number_(0), - flush_reason_(FlushReason::kOthers), column_family_set_(column_family_set), queued_for_flush_(false), queued_for_compaction_(false), @@ -565,7 +564,8 @@ ColumnFamilyData::ColumnFamilyData( allow_2pc_(db_options.allow_2pc), last_memtable_id_(0), db_paths_registered_(false), - mempurge_used_(false) { + mempurge_used_(false), + next_epoch_number_(1) { if (id_ != kDummyColumnFamilyDataId) { // TODO(cc): RegisterDbPaths can be expensive, considering moving it // outside of this constructor which might be called with db mutex held. 
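The constructor hunk above seeds a per-column-family epoch counter (`next_epoch_number_(1)`), which later hunks expose through `NewEpochNumber()`, `GetNextEpochNumber()`, `SetNextEpochNumber()`, `ResetNextEpochNumber()`, and `RecoverEpochNumbers()`. The standalone sketch below mirrors those semantics; it is illustrative only and not the RocksDB member, which is an `std::atomic` inside `ColumnFamilyData`.

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>

class EpochCounter {
 public:
  // Mirrors ColumnFamilyData::NewEpochNumber(): hand out the current value
  // and advance, so each newly created SST file gets a distinct,
  // monotonically increasing epoch number.
  uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); }

  // Mirrors Get/SetNextEpochNumber(): used during recovery to restore the
  // counter past the largest epoch found in existing files.
  uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); }
  void SetNextEpochNumber(uint64_t n) { next_epoch_number_.store(n); }

  // Mirrors ResetNextEpochNumber(): epochs restart from 1.
  void ResetNextEpochNumber() { next_epoch_number_.store(1); }

 private:
  std::atomic<uint64_t> next_epoch_number_{1};  // starts at 1, as in the ctor
};

int main() {
  EpochCounter c;
  std::cout << c.NewEpochNumber() << "\n";  // 1
  std::cout << c.NewEpochNumber() << "\n";  // 2
  c.SetNextEpochNumber(100);                // e.g. after recovery
  std::cout << c.NewEpochNumber() << "\n";  // 100
}
```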
@@ -610,8 +610,8 @@ ColumnFamilyData::ColumnFamilyData( compaction_picker_.reset( new FIFOCompactionPicker(ioptions_, &internal_comparator_)); } else if (ioptions_.compaction_style == kCompactionStyleNone) { - compaction_picker_.reset(new NullCompactionPicker( - ioptions_, &internal_comparator_)); + compaction_picker_.reset( + new NullCompactionPicker(ioptions_, &internal_comparator_)); ROCKS_LOG_WARN(ioptions_.logger, "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", @@ -878,7 +878,7 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, return static_cast(res); } } -} // namespace +} // anonymous namespace std::pair ColumnFamilyData::GetWriteStallConditionAndCause( @@ -923,7 +923,7 @@ ColumnFamilyData::GetWriteStallConditionAndCause( } WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options) { auto write_stall_condition = WriteStallCondition::kNormal; if (current_ != nullptr) { auto* vstorage = current_->storage_info(); @@ -1034,7 +1034,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && (compaction_needed_bytes - mutable_cf_options.soft_pending_compaction_bytes_limit) > - 3 * (mutable_cf_options.hard_pending_compaction_bytes_limit - + 3 * + (mutable_cf_options.hard_pending_compaction_bytes_limit - mutable_cf_options.soft_pending_compaction_bytes_limit) / 4; @@ -1149,12 +1150,9 @@ bool ColumnFamilyData::NeedsCompaction() const { Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { - SequenceNumber earliest_mem_seqno = - std::min(mem_->GetEarliestSequenceNumber(), - imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( GetName(), mutable_options, mutable_db_options, current_->storage_info(), - log_buffer, earliest_mem_seqno); + log_buffer); if (result != nullptr) { result->SetInputVersion(current_); } @@ -1241,6 +1239,7 @@ Compaction* ColumnFamilyData::CompactRange( if (result != nullptr) { result->SetInputVersion(current_); } + TEST_SYNC_POINT("ColumnFamilyData::CompactRange:Return"); return result; } @@ -1324,8 +1323,8 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { return false; } -void ColumnFamilyData::InstallSuperVersion( - SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { +void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context, + InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); return InstallSuperVersion(sv_context, mutable_cf_options_); } @@ -1545,8 +1544,8 @@ Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { // than base_level. 
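The `RecalculateWriteStallConditions` hunk above only reformats the pending-compaction-bytes check, but the condition is worth spelling out: stalling escalates toward "near hard limit" once the estimated compaction debt has crossed three quarters of the gap between the soft and hard limits. A minimal sketch of that predicate, with hypothetical numbers and a free function that does not exist in RocksDB:

```cpp
#include <cstdint>
#include <iostream>

bool NearHardPendingCompactionLimit(uint64_t compaction_needed_bytes,
                                    uint64_t soft_limit, uint64_t hard_limit) {
  if (hard_limit == 0 || compaction_needed_bytes <= soft_limit) return false;
  // Same expression the hunk reformats: past 3/4 of the soft-to-hard gap.
  return (compaction_needed_bytes - soft_limit) >
         3 * (hard_limit - soft_limit) / 4;
}

int main() {
  const uint64_t soft = 64ull << 30, hard = 256ull << 30;  // 64 GB / 256 GB
  // 3/4 of the 192 GB gap is 144 GB past soft, i.e. a 208 GB threshold.
  std::cout << NearHardPendingCompactionLimit(200ull << 30, soft, hard)  // 0
            << NearHardPendingCompactionLimit(210ull << 30, soft, hard)  // 1
            << "\n";
}
```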
return Env::WLTH_MEDIUM; } - return static_cast(level - base_level + - static_cast(Env::WLTH_MEDIUM)); + return static_cast( + level - base_level + static_cast(Env::WLTH_MEDIUM)); } Status ColumnFamilyData::AddDirectories( @@ -1584,6 +1583,13 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const { return data_dirs_[path_id].get(); } +void ColumnFamilyData::RecoverEpochNumbers() { + assert(current_); + auto* vstorage = current_->storage_info(); + assert(vstorage); + vstorage->RecoverEpochNumbers(this); +} + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const ImmutableDBOptions* db_options, const FileOptions& file_options, @@ -1642,8 +1648,8 @@ ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const { } } -ColumnFamilyData* ColumnFamilySet::GetColumnFamily(const std::string& name) - const { +ColumnFamilyData* ColumnFamilySet::GetColumnFamily( + const std::string& name) const { auto cfd_iter = column_families_.find(name); if (cfd_iter != column_families_.end()) { auto cfd = GetColumnFamily(cfd_iter->second); diff --git a/db/column_family.h b/db/column_family.h index 91a8253742e..0c696ed4e2b 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -163,8 +163,8 @@ extern const double kIncSlowdownRatio; class ColumnFamilyHandleImpl : public ColumnFamilyHandle { public: // create while holding the mutex - ColumnFamilyHandleImpl( - ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex); + ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, + InstrumentedMutex* mutex); // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } @@ -189,7 +189,8 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { public: ColumnFamilyHandleInternal() - : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {} + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + internal_cfd_(nullptr) {} void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } @@ -309,10 +310,6 @@ class ColumnFamilyData { void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } - void SetFlushReason(FlushReason flush_reason) { - flush_reason_ = flush_reason; - } - FlushReason GetFlushReason() const { return flush_reason_; } // thread-safe const FileOptions* soptions() const; const ImmutableOptions* ioptions() const { return &ioptions_; } @@ -357,7 +354,7 @@ class ColumnFamilyData { Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } void SetCurrent(Version* _current); - uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held + uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held @@ -532,6 +529,24 @@ class ColumnFamilyData { void SetMempurgeUsed() { mempurge_used_ = true; } bool GetMempurgeUsed() { return mempurge_used_; } + // Allocate and return a new epoch number + uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); } + + // Get the next epoch number to be assigned + uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); } + + // Set the next epoch number to be assigned + void SetNextEpochNumber(uint64_t 
next_epoch_number) { + next_epoch_number_.store(next_epoch_number); + } + + // Reset the next epoch number to be assigned + void ResetNextEpochNumber() { next_epoch_number_.store(1); } + + // Recover the next epoch number of this CF and epoch number + // of its files (if missing) + void RecoverEpochNumbers(); + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, @@ -552,7 +567,7 @@ class ColumnFamilyData { Version* dummy_versions_; // Head of circular doubly-linked list of versions. Version* current_; // == dummy_versions->prev_ - std::atomic refs_; // outstanding references to ColumnFamilyData + std::atomic refs_; // outstanding references to ColumnFamilyData std::atomic initialized_; std::atomic dropped_; // true if client dropped it @@ -597,8 +612,6 @@ class ColumnFamilyData { // recovered from uint64_t log_number_; - std::atomic flush_reason_; - // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr compaction_picker_; @@ -633,6 +646,8 @@ class ColumnFamilyData { // a Version associated with this CFD std::shared_ptr file_metadata_cache_res_mgr_; bool mempurge_used_; + + std::atomic next_epoch_number_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -656,8 +671,7 @@ class ColumnFamilySet { // ColumnFamilySet supports iteration class iterator { public: - explicit iterator(ColumnFamilyData* cfd) - : current_(cfd) {} + explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} // NOTE: minimum operators for for-loop iteration iterator& operator++() { current_ = current_->next_; diff --git a/db/column_family_test.cc b/db/column_family_test.cc index aa8f73f636e..d33cbe50a77 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -39,9 +39,7 @@ class EnvCounter : public SpecialEnv { public: explicit EnvCounter(Env* base) : SpecialEnv(base), num_new_writable_file_(0) {} - int GetNumberOfNewWritableFileCalls() { - return num_new_writable_file_; - } + int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& soptions) override { ++num_new_writable_file_; @@ -187,7 +185,7 @@ class ColumnFamilyTestBase : public testing::Test { } Status OpenReadOnly(std::vector cf, - std::vector options = {}) { + std::vector options = {}) { std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { @@ -201,20 +199,17 @@ class ColumnFamilyTestBase : public testing::Test { #ifndef ROCKSDB_LITE // ReadOnlyDB is not supported void AssertOpenReadOnly(std::vector cf, - std::vector options = {}) { + std::vector options = {}) { ASSERT_OK(OpenReadOnly(cf, options)); } #endif // !ROCKSDB_LITE - void Open(std::vector cf, std::vector options = {}) { ASSERT_OK(TryOpen(cf, options)); } - void Open() { - Open({"default"}); - } + void Open() { Open({"default"}); } DBImpl* dbfull() { return static_cast_with_check(db_); } @@ -253,7 +248,7 @@ class ColumnFamilyTestBase : public testing::Test { } void Destroy(const std::vector& column_families = - std::vector()) { + std::vector()) { Close(); ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_), column_families)); @@ -335,9 +330,7 @@ class ColumnFamilyTestBase : public testing::Test { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } - void WaitForCompaction() { - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - } + void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); } uint64_t 
MaxTotalInMemoryState() { return dbfull()->TEST_MaxTotalInMemoryState(); @@ -354,9 +347,7 @@ class ColumnFamilyTestBase : public testing::Test { Status Merge(int cf, const std::string& key, const std::string& value) { return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value)); } - Status Flush(int cf) { - return db_->Flush(FlushOptions(), handles_[cf]); - } + Status Flush(int cf) { return db_->Flush(FlushOptions(), handles_[cf]); } std::string Get(int cf, const std::string& key) { ReadOptions options; @@ -409,8 +400,8 @@ class ColumnFamilyTestBase : public testing::Test { #ifndef ROCKSDB_LITE ASSERT_EQ(value, FilesPerLevel(cf)); #else - (void) value; - (void) cf; + (void)value; + (void)cf; #endif } @@ -426,7 +417,7 @@ class ColumnFamilyTestBase : public testing::Test { #ifndef ROCKSDB_LITE ASSERT_EQ(expected_value, CountLiveFiles()); #else - (void) expected_value; + (void)expected_value; #endif } @@ -476,7 +467,7 @@ class ColumnFamilyTestBase : public testing::Test { #ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported ASSERT_EQ(value, CountLiveLogFiles()); #else - (void) value; + (void)value; #endif // !ROCKSDB_LITE } @@ -521,14 +512,14 @@ class ColumnFamilyTestBase : public testing::Test { return static_cast(files.size()); } - void RecalculateWriteStallConditions(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options) { + void RecalculateWriteStallConditions( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options) { // add lock to avoid race condition between // `RecalculateWriteStallConditions` which writes to CFStats and // background `DBImpl::DumpStats()` threads which read CFStats dbfull()->TEST_LockMutex(); cfd->RecalculateWriteStallConditions(mutable_cf_options); - dbfull()-> TEST_UnlockMutex(); + dbfull()->TEST_UnlockMutex(); } std::vector handles_; @@ -970,8 +961,7 @@ TEST_P(ColumnFamilyTest, FlushTest) { } for (int i = 0; i < 3; ++i) { - uint64_t max_total_in_memory_state = - MaxTotalInMemoryState(); + uint64_t max_total_in_memory_state = MaxTotalInMemoryState(); ASSERT_OK(Flush(i)); AssertMaxTotalInMemoryState(max_total_in_memory_state); } @@ -1209,7 +1199,7 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { WaitForFlush(2); AssertNumberOfImmutableMemtables({0, 0, 0, 0}); AssertCountLiveLogFiles(12); - PutRandomData(1, 2*200, 1000); + PutRandomData(1, 2 * 200, 1000); WaitForFlush(1); AssertNumberOfImmutableMemtables({0, 0, 0, 0}); AssertCountLiveLogFiles(7); @@ -2123,7 +2113,6 @@ TEST_P(ColumnFamilyTest, ReadOnlyDBTest) { ASSERT_EQ("bla", Get(1, "foo")); ASSERT_EQ("blablablabla", Get(2, "foo")); - // test newiterators { std::vector iterators; @@ -2488,7 +2477,7 @@ void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id, } test_stage = kChildThreadFinishDroppingColumnFamily; } -} // namespace +} // anonymous namespace TEST_P(ColumnFamilyTest, CreateAndDropRace) { const int kCfCount = 5; diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 499220d7f47..ef38946f7e2 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -348,9 +348,7 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { return true; } - void SetDB(DB* db) { - db_ = db; - } + void SetDB(DB* db) { db_ = db; } const char* Name() const override { return "FilterWithGet"; } @@ -358,7 +356,6 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { DB* db_; }; - std::shared_ptr cf(new FilterWithGet()); Options options; @@ -385,7 +382,6 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { 
db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0)); } - delete db; } @@ -400,10 +396,9 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { } // Check that passing `CompressionType::kDisableCompressionOption` to // `CompactFiles` causes it to use the column family compression options. - for (auto compaction_style : - {CompactionStyle::kCompactionStyleLevel, - CompactionStyle::kCompactionStyleUniversal, - CompactionStyle::kCompactionStyleNone}) { + for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel, + CompactionStyle::kCompactionStyleUniversal, + CompactionStyle::kCompactionStyleNone}) { ASSERT_OK(DestroyDB(db_name_, Options())); Options options; options.compaction_style = compaction_style; diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index a32b529f743..d7d57bbf519 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -238,12 +238,19 @@ Compaction::Compaction( inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), score_(_score), - bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), + bottommost_level_( + // For simplicity, we don't support the concept of "bottommost level" + // with + // `CompactionReason::kExternalSstIngestion` and + // `CompactionReason::kRefitLevel` + (_compaction_reason == CompactionReason::kExternalSstIngestion || + _compaction_reason == CompactionReason::kRefitLevel) + ? false + : IsBottommostLevel(output_level_, vstorage, inputs_)), is_full_compaction_(IsFullCompaction(vstorage, inputs_)), is_manual_compaction_(_manual_compaction), trim_ts_(_trim_ts), is_trivial_move_(false), - compaction_reason_(_compaction_reason), notify_on_compaction_completion_(false), enable_blob_garbage_collection_( @@ -258,8 +265,15 @@ Compaction::Compaction( _blob_garbage_collection_age_cutoff > 1 ? mutable_cf_options()->blob_garbage_collection_age_cutoff : _blob_garbage_collection_age_cutoff), - penultimate_level_(EvaluatePenultimateLevel( - vstorage, immutable_options_, start_level_, output_level_)) { + penultimate_level_( + // For simplicity, we don't support the concept of "penultimate level" + // with `CompactionReason::kExternalSstIngestion` and + // `CompactionReason::kRefitLevel` + _compaction_reason == CompactionReason::kExternalSstIngestion || + _compaction_reason == CompactionReason::kRefitLevel + ? Compaction::kInvalidLevel + : EvaluatePenultimateLevel(vstorage, immutable_options_, + start_level_, output_level_)) { MarkFilesBeingCompacted(true); if (is_manual_compaction_) { compaction_reason_ = CompactionReason::kManualCompaction; @@ -332,6 +346,7 @@ void Compaction::PopulatePenultimateLevelOutputRange() { // the case that the penultimate level is empty). 
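The `Compaction` constructor hunks above guard both derived fields behind the compaction reason: for `kExternalSstIngestion` and `kRefitLevel`, "bottommost level" is forced to false and "penultimate level" to `kInvalidLevel` instead of being computed from the input version. A simplified illustration of that pattern (the compute functions are stand-ins for `IsBottommostLevel()`/`EvaluatePenultimateLevel()`, which in the real code inspect `VersionStorageInfo`):

```cpp
#include <iostream>

enum class CompactionReason { kManualCompaction, kExternalSstIngestion,
                              kRefitLevel };
constexpr int kInvalidLevel = -1;

bool ComputeBottommost() { return true; }     // stand-in
int ComputePenultimateLevel() { return 5; }   // stand-in

struct CompactionBits {
  bool bottommost_level;
  int penultimate_level;
};

CompactionBits Make(CompactionReason reason) {
  const bool special = reason == CompactionReason::kExternalSstIngestion ||
                       reason == CompactionReason::kRefitLevel;
  return {special ? false : ComputeBottommost(),
          special ? kInvalidLevel : ComputePenultimateLevel()};
}

int main() {
  auto a = Make(CompactionReason::kManualCompaction);
  auto b = Make(CompactionReason::kRefitLevel);
  std::cout << a.bottommost_level << " " << a.penultimate_level << "\n";  // 1 5
  std::cout << b.bottommost_level << " " << b.penultimate_level << "\n";  // 0 -1
}
```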
if (immutable_options_.compaction_style == kCompactionStyleUniversal) { exclude_level = kInvalidLevel; + penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange; std::set penultimate_inputs; for (const auto& input_lvl : inputs_) { if (input_lvl.level == penultimate_level_) { @@ -345,7 +360,8 @@ void Compaction::PopulatePenultimateLevelOutputRange() { if (penultimate_inputs.find(file->fd.GetNumber()) == penultimate_inputs.end()) { exclude_level = number_levels_ - 1; - penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange; + penultimate_output_range_type_ = + PenultimateOutputRangeType::kNonLastRange; break; } } @@ -354,35 +370,6 @@ void Compaction::PopulatePenultimateLevelOutputRange() { GetBoundaryKeys(input_vstorage_, inputs_, &penultimate_level_smallest_user_key_, &penultimate_level_largest_user_key_, exclude_level); - - // If there's a case that the penultimate level output range is overlapping - // with the existing files, disable the penultimate level output by setting - // the range to empty. One example is the range delete could have overlap - // boundary with the next file. (which is actually a false overlap) - // TODO: Exclude such false overlap, so it won't disable the penultimate - // output. - std::set penultimate_inputs; - for (const auto& input_lvl : inputs_) { - if (input_lvl.level == penultimate_level_) { - for (const auto& file : input_lvl.files) { - penultimate_inputs.emplace(file->fd.GetNumber()); - } - } - } - - auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_); - for (const auto& file : penultimate_files) { - if (penultimate_inputs.find(file->fd.GetNumber()) == - penultimate_inputs.end() && - OverlapPenultimateLevelOutputRange(file->smallest.user_key(), - file->largest.user_key())) { - // basically disable the penultimate range output. which should be rare - // or a false overlap caused by range del - penultimate_level_smallest_user_key_ = ""; - penultimate_level_largest_user_key_ = ""; - penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled; - } - } } Compaction::~Compaction() { @@ -807,6 +794,16 @@ uint64_t Compaction::MinInputFileOldestAncesterTime( return min_oldest_ancester_time; } +uint64_t Compaction::MinInputFileEpochNumber() const { + uint64_t min_epoch_number = std::numeric_limits::max(); + for (const auto& inputs_per_level : inputs_) { + for (const auto& file : inputs_per_level.files) { + min_epoch_number = std::min(min_epoch_number, file->epoch_number); + } + } + return min_epoch_number; +} + int Compaction::EvaluatePenultimateLevel( const VersionStorageInfo* vstorage, const ImmutableOptions& immutable_options, const int start_level, diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 21d1190ac3a..ee863960146 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -378,6 +378,9 @@ class Compaction { // This is used to filter out some input files' ancester's time range. uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, const InternalKey* end) const; + // Return the minimum epoch number among + // input files' associated with this compaction + uint64_t MinInputFileEpochNumber() const; // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of // compaction begin and compaction completion callbacks match. 
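The new `Compaction::MinInputFileEpochNumber()` above takes the minimum epoch number across all input files, so a compaction's output inherits the age of its oldest input. A self-contained sketch with simplified types (the real code walks `inputs_` per level over `FileMetaData*`):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

struct FileMetaData { uint64_t epoch_number; };

uint64_t MinInputFileEpochNumber(
    const std::vector<std::vector<FileMetaData>>& inputs) {
  uint64_t min_epoch = std::numeric_limits<uint64_t>::max();
  for (const auto& level : inputs) {
    for (const auto& file : level) {
      min_epoch = std::min(min_epoch, file.epoch_number);
    }
  }
  return min_epoch;
}

int main() {
  // Two input levels: L0 files with epochs {7, 9}, L1 files with {3, 4}.
  std::vector<std::vector<FileMetaData>> inputs = {{{7}, {9}}, {{3}, {4}}};
  std::cout << MinInputFileEpochNumber(inputs) << "\n";  // 3
}
```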
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index c37e4f6ed50..e1bdddcb750 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -124,6 +124,9 @@ CompactionIterator::CompactionIterator( timestamp_size_ == full_history_ts_low_->size()); #endif input_.SetPinnedItersMgr(&pinned_iters_mgr_); + // The default `merge_until_status_` does not need to be checked since it is + // overwritten as soon as `MergeUntil()` is called + merge_until_status_.PermitUncheckedError(); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } @@ -168,11 +171,30 @@ void CompactionIterator::Next() { } // Keep current_key_ in sync. - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + if (0 == timestamp_size_) { + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } else { + Slice ts = ikey_.GetTimestamp(timestamp_size_); + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts); + } key_ = current_key_.GetInternalKey(); ikey_.user_key = current_key_.GetUserKey(); validity_info_.SetValid(ValidContext::kMerge1); } else { + if (merge_until_status_.IsMergeInProgress()) { + // `Status::MergeInProgress()` tells us that the previous `MergeUntil()` + // produced only merge operands. Those merge operands were accessed and + // written out using `merge_out_iter_`. Since `merge_out_iter_` is + // exhausted at this point, all merge operands have been written out. + // + // Still, there may be a base value (PUT, DELETE, SINGLEDEL, etc.) that + // needs to be written out. Normally, `CompactionIterator` would skip it + // on the basis that it has already output something in the same + // snapshot stripe. To prevent this, we reset `has_current_user_key_` to + // trick the future iteration from finding out the snapshot stripe is + // unchanged. + has_current_user_key_ = false; + } // We consumed all pinned merge operands, release pinned iterators pinned_iters_mgr_.ReleasePinnedData(); // MergeHelper moves the iterator to the first record after the merged @@ -825,8 +847,8 @@ void CompactionIterator::NextFromInput() { cmp_with_history_ts_low_ < 0)) && bottommost_level_) { // Handle the case where we have a delete key at the bottom most level - // We can skip outputting the key iff there are no subsequent puts for this - // key + // We can skip outputting the key iff there are no subsequent puts for + // this key assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( ikey_.user_key, &level_ptrs_)); ParsedInternalKey next_ikey; @@ -853,8 +875,8 @@ void CompactionIterator::NextFromInput() { DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) { AdvanceInputIter(); } - // If you find you still need to output a row with this key, we need to output the - // delete too + // If you find you still need to output a row with this key, we need to + // output the delete too if (input_.Valid() && (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) .ok()) && @@ -875,14 +897,15 @@ void CompactionIterator::NextFromInput() { // have hit (A) // We encapsulate the merge related state machine in a different // object to minimize change to the existing flow. 
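The `Next()` hunk above makes the in-place key update timestamp-aware: a RocksDB internal key is the user key, followed by the user-defined timestamp when enabled, followed by an 8-byte trailer packing `(sequence << 8) | type`, so rewriting the key must touch the timestamp bytes as well as the trailer. The helper below mimics that layout for illustration; it is not the real encoder and assumes a little-endian platform.

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

void UpdateInternalKey(std::string* ikey, uint64_t seq, uint8_t type,
                       const std::string* ts /* null if no timestamp */) {
  size_t trailer_pos = ikey->size() - 8;
  if (ts != nullptr) {
    // The timestamp sits immediately before the 8-byte trailer.
    ikey->replace(trailer_pos - ts->size(), ts->size(), *ts);
  }
  uint64_t trailer = (seq << 8) | type;  // packed sequence number and type
  std::memcpy(&(*ikey)[trailer_pos], &trailer, 8);
}

int main() {
  std::string ts(8, '\0');  // hypothetical 8-byte timestamp, zeroed
  std::string ikey = "key" + ts + std::string(8, '\0');
  UpdateInternalKey(&ikey, /*seq=*/42, /*type=*/1, &ts);
  std::cout << ikey.size() << "\n";  // 19 = 3 user key + 8 ts + 8 trailer
}
```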
- Status s = merge_helper_->MergeUntil( + merge_until_status_ = merge_helper_->MergeUntil( &input_, range_del_agg_, prev_snapshot, bottommost_level_, - allow_data_in_errors_, blob_fetcher_.get(), prefetch_buffers_.get(), - &iter_stats_); + allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_, + prefetch_buffers_.get(), &iter_stats_); merge_out_iter_.SeekToFirst(); - if (!s.ok() && !s.IsMergeInProgress()) { - status_ = s; + if (!merge_until_status_.ok() && + !merge_until_status_.IsMergeInProgress()) { + status_ = merge_until_status_; return; } else if (merge_out_iter_.Valid()) { // NOTE: key, value, and ikey_ refer to old entries. @@ -1214,8 +1237,8 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( ROCKS_LOG_FATAL(info_log_, "No snapshot left in findEarliestVisibleSnapshot"); } - auto snapshots_iter = std::lower_bound( - snapshots_->begin(), snapshots_->end(), in); + auto snapshots_iter = + std::lower_bound(snapshots_->begin(), snapshots_->end(), in); assert(prev_snapshot != nullptr); if (snapshots_iter == snapshots_->begin()) { *prev_snapshot = 0; @@ -1230,8 +1253,8 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( } } if (snapshot_checker_ == nullptr) { - return snapshots_iter != snapshots_->end() - ? *snapshots_iter : kMaxSequenceNumber; + return snapshots_iter != snapshots_->end() ? *snapshots_iter + : kMaxSequenceNumber; } bool has_released_snapshot = !released_snapshots_.empty(); for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index c215d2bbbd0..a224a8e0e29 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -432,6 +432,7 @@ class CompactionIterator { bool clear_and_output_next_key_ = false; MergeOutputIterator merge_out_iter_; + Status merge_until_status_; // PinnedIteratorsManager used to pin input_ Iterator blocks while reading // merge operands and then releasing them after consuming them. PinnedIteratorsManager pinned_iters_mgr_; diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 0bc3cd7134e..81362d79215 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -203,7 +203,8 @@ class TestSnapshotChecker : public SnapshotChecker { public: explicit TestSnapshotChecker( SequenceNumber last_committed_sequence, - const std::unordered_map& snapshots = {{}}) + const std::unordered_map& snapshots = + {{}}) : last_committed_sequence_(last_committed_sequence), snapshots_(snapshots) {} @@ -811,6 +812,8 @@ TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) { c_iter_->Next(); ASSERT_OK(c_iter_->status()); ASSERT_FALSE(c_iter_->Valid()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) { @@ -876,6 +879,8 @@ TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) { // output_to_penultimate_level. 
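`findEarliestVisibleSnapshot`, reformatted above, binary-searches the sorted snapshot list: `std::lower_bound` yields the earliest snapshot at or above the key's sequence number, and the element before it is the previous snapshot boundary. A stripped-down sketch of that lookup, without the released-snapshot and snapshot-checker handling of the real function:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

using SequenceNumber = uint64_t;
constexpr SequenceNumber kMaxSequenceNumber = UINT64_MAX;

SequenceNumber FindEarliestVisibleSnapshot(
    const std::vector<SequenceNumber>& snapshots, SequenceNumber in,
    SequenceNumber* prev_snapshot) {
  auto it = std::lower_bound(snapshots.begin(), snapshots.end(), in);
  *prev_snapshot = (it == snapshots.begin()) ? 0 : *(it - 1);
  return it != snapshots.end() ? *it : kMaxSequenceNumber;
}

int main() {
  std::vector<SequenceNumber> snaps = {10, 20, 30};
  SequenceNumber prev = 0;
  // A key written at sequence 15 is first visible to snapshot 20; the
  // previous snapshot boundary is 10.
  std::cout << FindEarliestVisibleSnapshot(snaps, 15, &prev) << " " << prev
            << "\n";  // 20 10
}
```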
c_iter_->Next(); ASSERT_TRUE(c_iter_->status().IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); } INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest, @@ -1038,7 +1043,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveDeletionIfValuePresentToEarlierSnapshot) { - AddSnapshot(2,1); + AddSnapshot(2, 1); RunTest({test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, {"", "", ""}, @@ -1250,6 +1255,31 @@ TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) { } } +TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(10002, user_key, 102, kTypeMerge), + test::KeyStr(10001, user_key, 101, kTypeMerge), + test::KeyStr(10000, user_key, 100, kTypeValue)}; + const std::vector input_values = {"2", "1", "a0"}; + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendTESTOperator(); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const auto& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(), + /*compaction_filter=*/nullptr, bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, + /*full_history_ts_low=*/nullptr); + } +} + TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; const std::vector input_keys = { @@ -1303,6 +1333,91 @@ TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { } } +TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) { + constexpr char user_key[][2] = {"a", "f"}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge), + test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge), + test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge), + test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue), + test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge), + test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge), + test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600, + kTypeDeletionWithTimestamp)}; + const std::vector input_values = {"25", "19", "18", "16", + "19", "17", ""}; + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendTESTOperator(); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 20000); + + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + + { + AddSnapshot(1600); + AddSnapshot(1900); + const std::vector expected_keys = { + test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge), + test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge), + test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge), + test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue), + test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge), + test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge), + test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600, + kTypeDeletionWithTimestamp)}; + const std::vector expected_values 
= {"25", "19", "18", "16", + "19", "17", ""}; + for (const auto& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + auto expected_keys_copy = expected_keys; + auto expected_values_copy = expected_values; + if (bottommost_level || key_not_exists_beyond_output_level) { + // the kTypeDeletionWithTimestamp will be dropped + expected_keys_copy.pop_back(); + expected_values_copy.pop_back(); + if (bottommost_level) { + // seq zero + expected_keys_copy[3] = + test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue); + } + } + RunTest(input_keys, input_values, expected_keys_copy, + expected_values_copy, + /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(), + /*compaction_filter=*/nullptr, bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + ClearSnapshots(); + } + + // No snapshots + { + const std::vector expected_keys = { + test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue), + test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)}; + const std::vector expected_values = {"16,18,19,25", "17,19"}; + for (const auto& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + auto expected_keys_copy = expected_keys; + auto expected_values_copy = expected_values; + if (bottommost_level) { + expected_keys_copy[1] = + test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue); + } + RunTest(input_keys, input_values, expected_keys_copy, + expected_values_copy, + /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(), + /*compaction_filter=*/nullptr, bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } + } +} + TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) { constexpr char user_key[] = "a"; const std::vector input_keys = { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index c239f1c2092..a19b67c0c57 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -99,6 +99,8 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) { return "ForcedBlobGC"; case CompactionReason::kRoundRobinTtl: return "RoundRobinTtl"; + case CompactionReason::kRefitLevel: + return "RefitLevel"; case CompactionReason::kNumOfReasons: // fall through default: @@ -714,11 +716,12 @@ Status CompactionJob::Run() { break; } // Verify that the table is usable - // We set for_compaction to false and don't OptimizeForCompactionTableRead - // here because this is a special case after we finish the table building - // No matter whether use_direct_io_for_flush_and_compaction is true, - // we will regard this verification as user reads since the goal is - // to cache it here for further user reads + // We set for_compaction to false and don't + // OptimizeForCompactionTableRead here because this is a special case + // after we finish the table building No matter whether + // use_direct_io_for_flush_and_compaction is true, we will regard this + // verification as user reads since the goal is to cache it here for + // further user reads ReadOptions read_options; InternalIterator* iter = cfd->table_cache()->NewIterator( read_options, file_options_, cfd->internal_comparator(), @@ -764,8 +767,8 @@ Status CompactionJob::Run() { } }; for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { - 
thread_pool.emplace_back(verify_table, - std::ref(compact_->sub_compact_states[i].status)); + thread_pool.emplace_back( + verify_table, std::ref(compact_->sub_compact_states[i].status)); } verify_table(compact_->sub_compact_states[0].status); for (auto& thread : thread_pool) { @@ -1834,12 +1837,14 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, } // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber(); { FileMetaData meta; meta.fd = FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); meta.oldest_ancester_time = oldest_ancester_time; meta.file_creation_time = current_time; + meta.epoch_number = epoch_number; meta.temperature = temperature; assert(!db_id_.empty()); assert(!db_session_id_.empty()); @@ -2158,6 +2163,9 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { // set smallest and largest keys in FileMetaData meta.smallest.DecodeFrom(result_file.smallest_internal_key); meta.largest.DecodeFrom(result_file.largest_internal_key); + meta.unique_id[0] = result_file.unique_id_lo; + meta.unique_id[1] = result_file.unique_id_hi; + meta.epoch_number = result_file.epoch_number; ColumnFamilyData* cfd = compact_->compaction->column_family_data(); sub->Current().AddOutput(std::move(meta), cfd->internal_comparator(), @@ -2224,6 +2232,9 @@ void CompactionJob::RetrieveResultsAndCleanup( file.largest_internal_key = out.meta.largest.Encode().ToString(); file.smallest_seqno = out.meta.fd.smallest_seqno; file.largest_seqno = out.meta.fd.smallest_seqno; + file.unique_id_lo = out.meta.unique_id[0]; + file.unique_id_hi = out.meta.unique_id[1]; + file.epoch_number = out.meta.epoch_number; result->output_files.push_back(file); } diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 31230b68c5f..d299885afd1 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -268,12 +268,12 @@ class CompactionJob { Status OpenCompactionOutputFile(SubcompactionState* sub_compact, CompactionOutputs& outputs); void UpdateCompactionJobStats( - const InternalStats::CompactionStats& stats) const; + const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); - void UpdateCompactionInputStatsHelper( - int* num_files, uint64_t* bytes_read, int input_level); + void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read, + int input_level); void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact); @@ -409,6 +409,7 @@ struct CompactionServiceOutputFile { std::string largest_internal_key; uint64_t oldest_ancester_time; uint64_t file_creation_time; + uint64_t epoch_number; uint64_t paranoid_hash; bool marked_for_compaction; UniqueId64x2 unique_id; @@ -418,8 +419,8 @@ struct CompactionServiceOutputFile { const std::string& name, SequenceNumber smallest, SequenceNumber largest, std::string _smallest_internal_key, std::string _largest_internal_key, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - uint64_t _paranoid_hash, bool _marked_for_compaction, - UniqueId64x2 _unique_id) + uint64_t _epoch_number, uint64_t _paranoid_hash, + bool _marked_for_compaction, UniqueId64x2 _unique_id) : file_name(name), smallest_seqno(smallest), largest_seqno(largest), @@ -427,6 +428,7 @@ struct CompactionServiceOutputFile { largest_internal_key(std::move(_largest_internal_key)), 
oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), + epoch_number(_epoch_number), paranoid_hash(_paranoid_hash), marked_for_compaction(_marked_for_compaction), unique_id(std::move(_unique_id)) {} diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index b25191f2264..9302707780b 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -155,9 +155,8 @@ class CompactionJobStatsTest : public testing::Test, ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } - Status TryReopenWithColumnFamilies( - const std::vector& cfs, - const std::vector& options) { + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const std::vector& options) { Close(); EXPECT_EQ(cfs.size(), options.size()); std::vector column_families; @@ -175,9 +174,7 @@ class CompactionJobStatsTest : public testing::Test, return TryReopenWithColumnFamilies(cfs, v_opts); } - void Reopen(const Options& options) { - ASSERT_OK(TryReopen(options)); - } + void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); } void Close() { for (auto h : handles_) { @@ -226,9 +223,7 @@ class CompactionJobStatsTest : public testing::Test, return db_->Put(wo, handles_[cf], k, v); } - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } Status Delete(int cf, const std::string& k) { return db_->Delete(WriteOptions(), handles_[cf], k); @@ -338,21 +333,21 @@ class CompactionJobStatsTest : public testing::Test, } } - static void SetDeletionCompactionStats( - CompactionJobStats *stats, uint64_t input_deletions, - uint64_t expired_deletions, uint64_t records_replaced) { + static void SetDeletionCompactionStats(CompactionJobStats* stats, + uint64_t input_deletions, + uint64_t expired_deletions, + uint64_t records_replaced) { stats->num_input_deletion_records = input_deletions; stats->num_expired_deletion_records = expired_deletions; stats->num_records_replaced = records_replaced; } - void MakeTableWithKeyValues( - Random* rnd, uint64_t smallest, uint64_t largest, - int key_size, int value_size, uint64_t interval, - double ratio, int cf = 0) { + void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest, + int key_size, int value_size, uint64_t interval, + double ratio, int cf = 0) { for (auto key = smallest; key < largest; key += interval) { ASSERT_OK(Put(cf, Slice(Key(key, key_size)), - Slice(RandomString(rnd, value_size, ratio)))); + Slice(RandomString(rnd, value_size, ratio)))); } ASSERT_OK(Flush(cf)); } @@ -361,9 +356,9 @@ class CompactionJobStatsTest : public testing::Test, // rounds of keys are inserted into the database, as per the behavior // of the DeletionStatsTest. 
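`SelectivelyDeleteKeys()`, whose reflowed body follows, drives the expected `CompactionJobStats`: it writes a tombstone for every `deletion_interval`-th existing key (counted as replaced records up to a cutoff) plus a few deletes for keys that never existed, two of which fall below the key range and count as expired. A rough model of that arithmetic, with made-up numbers and none of the cutoff handling of the real helper:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t smallest = 100, largest = 1000, interval = 3;
  const int deletion_interval = 3;
  uint64_t deletions_made = 0, num_deleted = 0;
  int i = 0;
  for (uint64_t key = smallest; key <= largest; key += interval, ++i) {
    if (i % deletion_interval == 0) {
      ++deletions_made;  // one tombstone written
      ++num_deleted;     // it covers an existing key: a "replaced" record
    }
  }
  // Plus deletions of keys that never existed; those below the range are
  // counted as expired rather than replaced.
  deletions_made += 3;
  const uint64_t num_expired = 2;
  std::cout << deletions_made << " " << num_expired << " " << num_deleted
            << "\n";
}
```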
void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest, - uint64_t interval, int deletion_interval, int key_size, - uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) { - + uint64_t interval, int deletion_interval, + int key_size, uint64_t cutoff_key_num, + CompactionJobStats* stats, int cf = 0) { // interval needs to be >= 2 so that deletion entries can be inserted // that are intended to not result in an actual key deletion by using // an offset of 1 from another existing key @@ -387,20 +382,19 @@ class CompactionJobStatsTest : public testing::Test, // Insert some deletions for keys that don't exist that // are both in and out of the key range - ASSERT_OK(Delete(cf, Key(smallest+1, key_size))); + ASSERT_OK(Delete(cf, Key(smallest + 1, key_size))); deletions_made++; - ASSERT_OK(Delete(cf, Key(smallest-1, key_size))); + ASSERT_OK(Delete(cf, Key(smallest - 1, key_size))); deletions_made++; num_expired++; - ASSERT_OK(Delete(cf, Key(smallest-9, key_size))); + ASSERT_OK(Delete(cf, Key(smallest - 9, key_size))); deletions_made++; num_expired++; ASSERT_OK(Flush(cf)); - SetDeletionCompactionStats(stats, deletions_made, num_expired, - num_deleted); + SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted); } }; @@ -440,25 +434,20 @@ class CompactionJobStatsChecker : public EventListener { // use ASSERT_GE and ASSERT_LE with a reasonable bias --- // 10% in uncompressed case and 20% when compression is used. virtual void Verify(const CompactionJobStats& current_stats, - const CompactionJobStats& stats) { + const CompactionJobStats& stats) { // time ASSERT_GT(current_stats.elapsed_micros, 0U); - ASSERT_EQ(current_stats.num_input_records, - stats.num_input_records); - ASSERT_EQ(current_stats.num_input_files, - stats.num_input_files); + ASSERT_EQ(current_stats.num_input_records, stats.num_input_records); + ASSERT_EQ(current_stats.num_input_files, stats.num_input_files); ASSERT_EQ(current_stats.num_input_files_at_output_level, - stats.num_input_files_at_output_level); + stats.num_input_files_at_output_level); - ASSERT_EQ(current_stats.num_output_records, - stats.num_output_records); - ASSERT_EQ(current_stats.num_output_files, - stats.num_output_files); + ASSERT_EQ(current_stats.num_output_records, stats.num_output_records); + ASSERT_EQ(current_stats.num_output_files, stats.num_output_files); ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction); - ASSERT_EQ(current_stats.is_manual_compaction, - stats.is_manual_compaction); + ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction); // file size double kFileSizeBias = compression_enabled_ ? 
0.20 : 0.10; @@ -475,18 +464,14 @@ class CompactionJobStatsChecker : public EventListener { ASSERT_EQ(current_stats.total_input_raw_value_bytes, stats.total_input_raw_value_bytes); - ASSERT_EQ(current_stats.num_records_replaced, - stats.num_records_replaced); + ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced); - ASSERT_EQ(current_stats.num_corrupt_keys, - stats.num_corrupt_keys); + ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys); - ASSERT_EQ( - std::string(current_stats.smallest_output_key_prefix), - std::string(stats.smallest_output_key_prefix)); - ASSERT_EQ( - std::string(current_stats.largest_output_key_prefix), - std::string(stats.largest_output_key_prefix)); + ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix), + std::string(stats.smallest_output_key_prefix)); + ASSERT_EQ(std::string(current_stats.largest_output_key_prefix), + std::string(stats.largest_output_key_prefix)); } // Add an expected compaction stats, which will be used to @@ -497,9 +482,7 @@ class CompactionJobStatsChecker : public EventListener { expected_stats_.push(stats); } - void EnableCompression(bool flag) { - compression_enabled_ = flag; - } + void EnableCompression(bool flag) { compression_enabled_ = flag; } bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; } @@ -517,45 +500,37 @@ class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker { // Verifies whether two CompactionJobStats match. void Verify(const CompactionJobStats& current_stats, const CompactionJobStats& stats) override { - ASSERT_EQ( - current_stats.num_input_deletion_records, - stats.num_input_deletion_records); - ASSERT_EQ( - current_stats.num_expired_deletion_records, - stats.num_expired_deletion_records); - ASSERT_EQ( - current_stats.num_records_replaced, - stats.num_records_replaced); - - ASSERT_EQ(current_stats.num_corrupt_keys, - stats.num_corrupt_keys); + ASSERT_EQ(current_stats.num_input_deletion_records, + stats.num_input_deletion_records); + ASSERT_EQ(current_stats.num_expired_deletion_records, + stats.num_expired_deletion_records); + ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced); + + ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys); } }; namespace { -uint64_t EstimatedFileSize( - uint64_t num_records, size_t key_size, size_t value_size, - double compression_ratio = 1.0, - size_t block_size = 4096, - int bloom_bits_per_key = 10) { +uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size, + size_t value_size, double compression_ratio = 1.0, + size_t block_size = 4096, + int bloom_bits_per_key = 10) { const size_t kPerKeyOverhead = 8; const size_t kFooterSize = 512; - uint64_t data_size = - static_cast( - num_records * (key_size + value_size * compression_ratio + - kPerKeyOverhead)); + uint64_t data_size = static_cast( + num_records * + (key_size + value_size * compression_ratio + kPerKeyOverhead)); - return data_size + kFooterSize - + num_records * bloom_bits_per_key / 8 // filter block + return data_size + kFooterSize + + num_records * bloom_bits_per_key / 8 // filter block + data_size * (key_size + 8) / block_size; // index block } namespace { -void CopyPrefix( - const Slice& src, size_t prefix_length, std::string* dst) { +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { assert(prefix_length > 0); size_t length = src.size() > prefix_length ? 
prefix_length : src.size(); dst->assign(src.data(), length); @@ -581,28 +556,24 @@ CompactionJobStats NewManualCompactionJobStats( stats.num_output_files = num_output_files; stats.total_input_bytes = - EstimatedFileSize( - num_input_records / num_input_files, - key_size, value_size, compression_ratio) * num_input_files; + EstimatedFileSize(num_input_records / num_input_files, key_size, + value_size, compression_ratio) * + num_input_files; stats.total_output_bytes = - EstimatedFileSize( - num_output_records / num_output_files, - key_size, value_size, compression_ratio) * num_output_files; - stats.total_input_raw_key_bytes = - num_input_records * (key_size + 8); - stats.total_input_raw_value_bytes = - num_input_records * value_size; + EstimatedFileSize(num_output_records / num_output_files, key_size, + value_size, compression_ratio) * + num_output_files; + stats.total_input_raw_key_bytes = num_input_records * (key_size + 8); + stats.total_input_raw_value_bytes = num_input_records * value_size; stats.is_full_compaction = is_full; stats.is_manual_compaction = is_manual; stats.num_records_replaced = num_records_replaced; - CopyPrefix(smallest_key, - CompactionJobStats::kMaxPrefixLength, + CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength, &stats.smallest_output_key_prefix); - CopyPrefix(largest_key, - CompactionJobStats::kMaxPrefixLength, + CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength, &stats.largest_output_key_prefix); return stats; @@ -662,13 +633,11 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { // 1st Phase: generate "num_L0_files" L0 files. int num_L0_files = 0; - for (uint64_t start_key = key_base; - start_key <= key_base * kTestScale; - start_key += key_base) { - MakeTableWithKeyValues( - &rnd, start_key, start_key + key_base - 1, - kKeySize, kValueSize, key_interval, - compression_ratio, 1); + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, + kKeySize, kValueSize, key_interval, + compression_ratio, 1); snprintf(buf, kBufSize, "%d", ++num_L0_files); ASSERT_EQ(std::string(buf), FilesPerLevel(1)); } @@ -684,13 +653,9 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { start_key += key_base, count++) { smallest_key = Key(start_key, 10); largest_key = Key(start_key + key_base - key_interval, 10); - stats_checker->AddExpectedStats( - NewManualCompactionJobStats( - smallest_key, largest_key, - 1, 0, num_keys_per_L0_file, - kKeySize, kValueSize, - 1, num_keys_per_L0_file, - compression_ratio, 0)); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize, + kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0)); ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); TEST_Compact(0, 1, smallest_key, largest_key); snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count); @@ -701,14 +666,10 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { int num_remaining_L0 = num_L0_files - L0_compaction_count; smallest_key = Key(key_base * (L0_compaction_count + 1), 10); largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); - stats_checker->AddExpectedStats( - NewManualCompactionJobStats( - smallest_key, largest_key, - num_remaining_L0, - 0, num_keys_per_L0_file * num_remaining_L0, - kKeySize, kValueSize, - 1, num_keys_per_L0_file * num_remaining_L0, - compression_ratio, 0)); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + 
smallest_key, largest_key, num_remaining_L0, 0, + num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1, + num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0)); ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); TEST_Compact(0, 1, smallest_key, largest_key); @@ -719,13 +680,11 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys) int sparseness = 2; - for (uint64_t start_key = key_base; - start_key <= key_base * kTestScale; - start_key += key_base * sparseness) { + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base * sparseness) { MakeTableWithKeyValues( - &rnd, start_key, start_key + key_base * sparseness - 1, - kKeySize, kValueSize, - key_base * sparseness / num_keys_per_L0_file, + &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize, + kValueSize, key_base * sparseness / num_keys_per_L0_file, compression_ratio, 1); snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files); ASSERT_EQ(std::string(buf), FilesPerLevel(1)); @@ -737,21 +696,15 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { // output files without coordinating to see if the output could fit into // a smaller number of files like it does when it runs sequentially int num_output_files = options.max_subcompactions > 1 ? 2 : 1; - for (uint64_t start_key = key_base; - num_L0_files > 1; + for (uint64_t start_key = key_base; num_L0_files > 1; start_key += key_base * sparseness) { smallest_key = Key(start_key, 10); - largest_key = - Key(start_key + key_base * sparseness - key_interval, 10); - stats_checker->AddExpectedStats( - NewManualCompactionJobStats( - smallest_key, largest_key, - 3, 2, num_keys_per_L0_file * 3, - kKeySize, kValueSize, - num_output_files, - num_keys_per_L0_file * 2, // 1/3 of the data will be updated. - compression_ratio, - num_keys_per_L0_file)); + largest_key = Key(start_key + key_base * sparseness - key_interval, 10); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize, + kValueSize, num_output_files, + num_keys_per_L0_file * 2, // 1/3 of the data will be updated. + compression_ratio, num_keys_per_L0_file)); ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); Compact(1, smallest_key, largest_key); if (options.max_subcompactions == 1) { @@ -766,14 +719,10 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { // In the first sub-compaction, we expect L0 compaction. 
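The expected stats pushed in these phases are computed from the EstimatedFileSize() helper defined near the top of this file. As a rough, self-contained illustration of that arithmetic (this restates the helper outside the test harness; the sample numbers are invented for illustration):

#include <cstdint>
#include <cstdio>

// Data blocks plus footer, filter-block, and index-block overhead, with the
// same defaults as the test helper above.
uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size,
                           size_t value_size, double compression_ratio = 1.0,
                           size_t block_size = 4096,
                           int bloom_bits_per_key = 10) {
  const size_t kPerKeyOverhead = 8;
  const size_t kFooterSize = 512;
  uint64_t data_size = static_cast<uint64_t>(
      num_records *
      (key_size + value_size * compression_ratio + kPerKeyOverhead));
  return data_size + kFooterSize +
         num_records * bloom_bits_per_key / 8 +    // filter block
         data_size * (key_size + 8) / block_size;  // index block
}

int main() {
  // 100 records, 10-byte keys, 100-byte values, 0.2 compression ratio:
  // 3800 data bytes + 512 footer + 125 filter + 16 index = 4453.
  std::printf("%llu\n", static_cast<unsigned long long>(
                            EstimatedFileSize(100, 10, 100, 0.2)));
  return 0;
}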
smallest_key = Key(key_base, 10); largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10); - stats_checker->AddExpectedStats( - NewManualCompactionJobStats( - Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, - 2, 1, num_keys_per_L0_file * 3, - kKeySize, kValueSize, - 1, num_keys_per_L0_file * 2, - compression_ratio, - num_keys_per_L0_file)); + stats_checker->AddExpectedStats(NewManualCompactionJobStats( + Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1, + num_keys_per_L0_file * 3, kKeySize, kValueSize, 1, + num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file)); ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U); Compact(1, smallest_key, largest_key); @@ -869,7 +818,7 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) { Options options; options.listeners.emplace_back(stats_checker); options.create_if_missing = true; - options.level0_file_num_compaction_trigger = kTestScale+1; + options.level0_file_num_compaction_trigger = kTestScale + 1; options.num_levels = 3; options.compression = kNoCompression; options.max_bytes_for_level_multiplier = 2; @@ -881,13 +830,10 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) { // Stage 1: Generate several L0 files and then send them to L2 by // using CompactRangeOptions and CompactRange(). These files will // have a strict subset of the keys from the full key-range - for (uint64_t start_key = key_base; - start_key <= key_base * kTestScale / 2; - start_key += key_base) { - MakeTableWithKeyValues( - &rnd, start_key, start_key + key_base - 1, - kKeySize, kValueSize, key_interval, - compression_ratio, 1); + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); } CompactRangeOptions cr_options; @@ -897,13 +843,10 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) { ASSERT_GT(NumTableFilesAtLevel(2, 1), 0); // Stage 2: Generate files including keys from the entire key range - for (uint64_t start_key = key_base; - start_key <= key_base * kTestScale; - start_key += key_base) { - MakeTableWithKeyValues( - &rnd, start_key, start_key + key_base - 1, - kKeySize, kValueSize, key_interval, - compression_ratio, 1); + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); } // Send these L0 files to L1 @@ -912,16 +855,16 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) { // Add a new record and flush so now there is a L0 file // with a value too (not just deletions from the next step) - ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test")); + ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test")); ASSERT_OK(Flush(1)); // Stage 3: Generate L0 files with some deletions so now // there are files with the same key range in L0, L1, and L2 int deletion_interval = 3; CompactionJobStats first_compaction_stats; - SelectivelyDeleteKeys(key_base, largest_key_num, - key_interval, deletion_interval, kKeySize, cutoff_key_num, - &first_compaction_stats, 1); + SelectivelyDeleteKeys(key_base, largest_key_num, key_interval, + deletion_interval, kKeySize, cutoff_key_num, + &first_compaction_stats, 1); stats_checker->AddExpectedStats(first_compaction_stats); @@ -932,8 +875,7 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) { namespace { int 
GetUniversalCompactionInputUnits(uint32_t num_flushes) { uint32_t compaction_input_units; - for (compaction_input_units = 1; - num_flushes >= compaction_input_units; + for (compaction_input_units = 1; num_flushes >= compaction_input_units; compaction_input_units *= 2) { if ((num_flushes & compaction_input_units) != 0) { return compaction_input_units > 1 ? compaction_input_units : 0; @@ -998,13 +940,10 @@ TEST_P(CompactionJobStatsTest, UniversalCompactionTest) { } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U); - for (uint64_t start_key = key_base; - start_key <= key_base * kTestScale; - start_key += key_base) { - MakeTableWithKeyValues( - &rnd, start_key, start_key + key_base - 1, - kKeySize, kValueSize, key_interval, - compression_ratio, 1); + for (uint64_t start_key = key_base; start_key <= key_base * kTestScale; + start_key += key_base) { + MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize, + kValueSize, key_interval, compression_ratio, 1); ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact()); } ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U); diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index de80e1a4ae3..0f0c5daf776 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -40,7 +40,7 @@ namespace ROCKSDB_NAMESPACE { namespace { void VerifyInitializationOfCompactionJobStats( - const CompactionJobStats& compaction_job_stats) { + const CompactionJobStats& compaction_job_stats) { #if !defined(IOS_CROSS_COMPILE) ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U); @@ -380,11 +380,14 @@ class CompactionJobTestBase : public testing::Test { } VersionEdit edit; - edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key, - smallest_seqno, largest_seqno, false, Temperature::kUnknown, - oldest_blob_file_number, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + edit.AddFile( + level, file_number, 0, file_size, smallest_key, largest_key, + smallest_seqno, largest_seqno, false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, + versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(), + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, + 0); mutex_.Lock(); EXPECT_OK( @@ -499,8 +502,7 @@ class CompactionJobTestBase : public testing::Test { // This is how the key will look like once it's written in bottommost // file - InternalKey bottommost_internal_key( - key, 0, kTypeValue); + InternalKey bottommost_internal_key(key, 0, kTypeValue); if (corrupt_id(k)) { test::CorruptKeyType(&internal_key); @@ -620,7 +622,7 @@ class CompactionJobTestBase : public testing::Test { CompactionInputFiles compaction_level; compaction_level.level = input_levels[i]; compaction_level.files.insert(compaction_level.files.end(), - level_files.begin(), level_files.end()); + level_files.begin(), level_files.end()); compaction_input_files.push_back(compaction_level); num_input_files += level_files.size(); } @@ -1656,7 +1658,7 @@ TEST_F(CompactionJobTest, ResultSerialization) { rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), - rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); } result.output_level = rnd.Uniform(10); result.output_path =
rnd.RandomString(rnd.Uniform(kStrMaxLen)); diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index e74378e2a9c..598bffb242f 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -76,6 +76,46 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, return io_s; } +bool CompactionOutputs::UpdateFilesToCutForTTLStates( + const Slice& internal_key) { + if (!files_to_cut_for_ttl_.empty()) { + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + if (cur_files_to_cut_for_ttl_ != -1) { + // Previous key is inside the range of a file + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_] + ->largest.Encode()) > 0) { + next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1; + cur_files_to_cut_for_ttl_ = -1; + return true; + } + } else { + // Look for the key position + while (next_files_to_cut_for_ttl_ < + static_cast<int>(files_to_cut_for_ttl_.size())) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] + ->smallest.Encode()) >= 0) { + if (icmp->Compare(internal_key, + files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] + ->largest.Encode()) <= 0) { + // Within the current file + cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_; + return true; + } + // Beyond the current file + next_files_to_cut_for_ttl_++; + } else { + // Still fall into the gap + break; + } + } + } + } + return false; +} + size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( const Slice& internal_key) { size_t curr_key_boundary_switched_num = 0; @@ -185,18 +225,30 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes( bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { assert(c_iter.Valid()); - - // always update grandparent information like overlapped file number, size - // etc. const Slice& internal_key = c_iter.key(); const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_; - size_t num_grandparent_boundaries_crossed = - UpdateGrandparentBoundaryInfo(internal_key); + const InternalKeyComparator* icmp = + &compaction_->column_family_data()->internal_comparator(); + size_t num_grandparent_boundaries_crossed = 0; + bool should_stop_for_ttl = false; + // Always update grandparent information like overlapped file number, size + // etc., and TTL states. + // If compaction_->output_level() == 0, there is no need to update grandparent + // info, and `grandparents` should be empty.
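The UpdateFilesToCutForTTLStates() function added above is a two-cursor walk over files_to_cut_for_ttl_: cur_files_to_cut_for_ttl_ remembers which marked file the previous key fell into (-1 while in a gap), and next_files_to_cut_for_ttl_ marks where the scan resumes. A minimal standalone sketch of the same state machine, assuming plain integer keys and inclusive ranges in place of InternalKeys (all names below are invented for illustration):

#include <utility>
#include <vector>

struct FileRange {
  int smallest;
  int largest;
};

class TtlCutTracker {
 public:
  explicit TtlCutTracker(std::vector<FileRange> files)
      : files_(std::move(files)) {}

  // Returns true if the output should be cut before `key`, mirroring the
  // return value of UpdateFilesToCutForTTLStates().
  bool Update(int key) {
    if (files_.empty()) {
      return false;
    }
    if (cur_ != -1) {
      // Previous key was inside files_[cur_]; cut once we walk past it.
      if (key > files_[cur_].largest) {
        next_ = cur_ + 1;
        cur_ = -1;
        return true;
      }
    } else {
      // Previous key was in a gap; scan forward for the file holding `key`.
      while (next_ < static_cast<int>(files_.size())) {
        if (key >= files_[next_].smallest) {
          if (key <= files_[next_].largest) {
            cur_ = next_;  // entered a marked file
            return true;
          }
          next_++;  // already beyond this file
        } else {
          break;  // still in the gap before files_[next_]
        }
      }
    }
    return false;
  }

 private:
  std::vector<FileRange> files_;
  int cur_ = -1;
  int next_ = 0;
};

With marked files [10, 20] and [40, 50], the keys 5, 15, 25, 45 request cuts at 15 (entering the first file), 25 (leaving it), and 45 (entering the second), which is when the compaction output has to be cut for the boundaries of TTL-marked files to stay aligned.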
+ if (compaction_->output_level() > 0) { + num_grandparent_boundaries_crossed = + UpdateGrandparentBoundaryInfo(internal_key); + should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key); + } if (!HasBuilder()) { return false; } + if (should_stop_for_ttl) { + return true; + } + // If there's user defined partitioner, check that first if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest( last_key_for_partitioner_, c_iter.user_key(), @@ -214,9 +266,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { return true; } - const InternalKeyComparator* icmp = - &compaction_->column_family_data()->internal_comparator(); - // Check if it needs to split for RoundRobin // Invalid local_output_split_key indicates that we do not need to split if (local_output_split_key_ != nullptr && !is_split_) { @@ -290,41 +339,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { } } - // check ttl file boundaries if there's any - if (!files_to_cut_for_ttl_.empty()) { - if (cur_files_to_cut_for_ttl_ != -1) { - // Previous key is inside the range of a file - if (icmp->Compare(internal_key, - files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_] - ->largest.Encode()) > 0) { - next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1; - cur_files_to_cut_for_ttl_ = -1; - return true; - } - } else { - // Look for the key position - while (next_files_to_cut_for_ttl_ < - static_cast<int>(files_to_cut_for_ttl_.size())) { - if (icmp->Compare(internal_key, - files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] - ->smallest.Encode()) >= 0) { - if (icmp->Compare(internal_key, - files_to_cut_for_ttl_[next_files_to_cut_for_ttl_] - ->largest.Encode()) <= 0) { - // With in the current file - cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_; - return true; - } - // Beyond the current file - next_files_to_cut_for_ttl_++; - } else { - // Still fall into the gap - break; - } - } - } - } - return false; } @@ -399,6 +413,17 @@ Status CompactionOutputs::AddRangeDels( const Slice *lower_bound, *upper_bound; bool lower_bound_from_sub_compact = false; + // The following example does not happen since + // CompactionOutput::ShouldStopBefore() always returns false for the first + // point key. But we should consider removing this dependency. Suppose for the + // first compaction output file, + // - next_table_min_key.user_key == comp_start_user_key + // - no point key is in the output file + // - there is a range tombstone @seqno to be added that covers + // comp_start_user_key + // Then meta.smallest will be set to comp_start_user_key@seqno + // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber + // which violates the assumption that meta.smallest should be <= meta.largest. size_t output_size = outputs_.size(); if (output_size == 1) { // For the first output table, include range tombstones before the min @@ -459,20 +484,34 @@ Status CompactionOutputs::AddRangeDels( } else { it->SeekToFirst(); } + Slice last_tombstone_start_user_key{}; for (; it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); if (upper_bound != nullptr) { int cmp = ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_); - if ((has_overlapping_endpoints && cmp < 0) || - (!has_overlapping_endpoints && cmp <= 0)) { - // Tombstones starting after upper_bound only need to be included in - // the next table.
If the current SST ends before upper_bound, i.e., - `has_overlapping_endpoints == false`, we can also skip over range - tombstones that start exactly at upper_bound. Such range - tombstones will be included in the next file and are not relevant - to the point keys or endpoints of the current file. + // Tombstones starting after upper_bound only need to be included in + // the next table. + // If the current SST ends before upper_bound, i.e., + // `has_overlapping_endpoints == false`, we can also skip over range + // tombstones that start exactly at upper_bound. Such range + // tombstones will be included in the next file and are not relevant + // to the point keys or endpoints of the current file. + // If the current SST ends at the same user key as upper_bound, + // i.e., `has_overlapping_endpoints == true`, AND the tombstone has + // the same start key as upper_bound, i.e., cmp == 0, then + // the tombstone is relevant only if the tombstone's sequence number + // is no larger than this file's largest key's sequence number. This + // is because the upper bound to truncate this file's range tombstone + // will be meta.largest in this case, and any tombstone that starts after + // it will not be relevant. + if (cmp < 0) { break; + } else if (cmp == 0) { + if (!has_overlapping_endpoints || + tombstone.seq_ < GetInternalKeySeqno(meta.largest.Encode())) { + break; + } } } @@ -500,30 +539,26 @@ Status CompactionOutputs::AddRangeDels( ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0); // Range tombstone is not supported by output validator yet. builder_->Add(kv.first.Encode(), kv.second); - InternalKey smallest_candidate = std::move(kv.first); + InternalKey tombstone_start = std::move(kv.first); + InternalKey smallest_candidate{tombstone_start}; if (lower_bound != nullptr && ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(), *lower_bound) <= 0) { // Pretend the smallest key has the same user key as lower_bound // (the max key in the previous table or subcompaction) in order for // files to appear key-space partitioned. - // - // When lower_bound is chosen by a subcompaction, we know that - // subcompactions over smaller keys cannot contain any keys at - // lower_bound. We also know that smaller subcompactions exist, - // because otherwise the subcompaction woud be unbounded on the left. - // As a result, we know that no other files on the output level will - // contain actual keys at lower_bound (an output file may have a - // largest key of lower_bound@kMaxSequenceNumber, but this only - // indicates a large range tombstone was truncated). Therefore, it is - // safe to use the tombstone's sequence number, to ensure that keys at - // lower_bound at lower levels are covered by truncated tombstones. - // - // If lower_bound was chosen by the smallest data key in the file, - // choose lowest seqnum so this file's smallest internal key comes - // after the previous file's largest. The fake seqnum is OK because - // the read path's file-picking code only considers user key. if (lower_bound_from_sub_compact) { + // When lower_bound is chosen by a subcompaction + // (lower_bound_from_sub_compact), we know that subcompactions over + // smaller keys cannot contain any keys at lower_bound. We also know + // that smaller subcompactions exist, because otherwise the + // subcompaction would be unbounded on the left.
As a result, we know + that no other files on the output level will contain actual keys at + lower_bound (an output file may have a largest key of + lower_bound@kMaxSequenceNumber, but this only indicates a large range + tombstone was truncated). Therefore, it is safe to use the + tombstone's sequence number, to ensure that keys at lower_bound at + lower levels are covered by truncated tombstones. if (ts_sz) { assert(tombstone.ts_.size() == ts_sz); smallest_candidate = InternalKey(*lower_bound, tombstone.seq_, @@ -533,10 +568,15 @@ Status CompactionOutputs::AddRangeDels( InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion); } } else { + // If lower_bound was chosen by the smallest data key in the file, + // choose lowest seqnum so this file's smallest internal key comes + // after the previous file's largest. The fake seqnum is OK because + // the read path's file-picking code only considers user key. smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion); } } - InternalKey largest_candidate = tombstone.SerializeEndKey(); + InternalKey tombstone_end = tombstone.SerializeEndKey(); + InternalKey largest_candidate{tombstone_end}; if (upper_bound != nullptr && ucmp->CompareWithoutTimestamp(*upper_bound, largest_candidate.user_key()) <= 0) { @@ -570,21 +610,174 @@ Status CompactionOutputs::AddRangeDels( InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); } } -#ifndef NDEBUG - SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; - if (meta.smallest.size() > 0) { - smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode()); - } -#endif meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate, tombstone.seq_, icmp); - // The smallest key in a file is used for range tombstone truncation, so - // it cannot have a seqnum of 0 (unless the smallest data key in a file - // has a seqnum of 0). Otherwise, the truncated tombstone may expose - // deleted keys at lower levels. - assert(smallest_ikey_seqnum == 0 || - ExtractInternalKeyFooter(meta.smallest.Encode()) != - PackSequenceAndType(0, kTypeRangeDeletion)); + if (!bottommost_level) { + bool start_user_key_changed = + last_tombstone_start_user_key.empty() || + ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key, + it->start_key()) < 0; + last_tombstone_start_user_key = it->start_key(); + // Range tombstones are truncated at file boundaries + if (icmp.Compare(tombstone_start, meta.smallest) < 0) { + tombstone_start = meta.smallest; + } + if (icmp.Compare(tombstone_end, meta.largest) > 0) { + tombstone_end = meta.largest; + } + // this assertion validates invariant (2) in the comment below. + assert(icmp.Compare(tombstone_start, tombstone_end) <= 0); + if (start_user_key_changed) { + // if tombstone_start >= tombstone_end, then either no key range is + // covered, or they have the same user key. If they have the same + // user key, then the internal key range should only be within this + // level, and no keys from older levels are covered.
+ if (ucmp->CompareWithoutTimestamp(tombstone_start.user_key(), + tombstone_end.user_key()) < 0) { + SizeApproximationOptions approx_opts; + approx_opts.files_size_error_margin = 0.1; + auto approximate_covered_size = + compaction_->input_version()->version_set()->ApproximateSize( + approx_opts, compaction_->input_version(), + tombstone_start.Encode(), tombstone_end.Encode(), + compaction_->output_level() + 1 /* start_level */, + -1 /* end_level */, kCompaction); + meta.compensated_range_deletion_size += approximate_covered_size; + } + } + } + // TODO: show invariants that ensure all necessary range tombstones are + // added and that file boundaries ensure no coverage is lost. + // Each range tombstone with internal key range [tombstone_start, + // tombstone_end] is being added to the current compaction output file here. + // The range tombstone is going to be truncated at range [meta.smallest, + // meta.largest] during reading/scanning. We should maintain invariants + // (1) meta.smallest <= meta.largest and, + // (2) [tombstone_start, tombstone_end] and [meta.smallest, meta.largest] + // overlap, as there is no point adding a range tombstone with a range + // outside the file's range. + // Since `tombstone_end` is always some user_key@kMaxSeqno, it is okay to + // use either open or closed range. Using closed range here to make + // reasoning easier, and it is more consistent with an ongoing work that + // tries to simplify this method. + // + // There are two cases: + // Case 1. Output file has no point key: + // First we show this case only happens when the entire compaction output + // is range tombstone only. This is true if CompactionIterator does not + // emit any point key. Suppose CompactionIterator emits some point key. + // Based on the assumption that CompactionOutputs::ShouldStopBefore() + // always returns false for the first point key, the first compaction + // output file always contains a point key. Each new compaction output + // file is created if there is a point key for which ShouldStopBefore() + // returns true, and the point key would be added to the new compaction + // output file. So each new compaction file always contains a point key. + // So Case 1 only happens when CompactionIterator does not emit any + // point key. + // + // To show (1) meta.smallest <= meta.largest: + // Since the compaction output is range tombstone only, `lower_bound` and + // `upper_bound` are either null or comp_start/end_user_key respectively. + // According to how UpdateBoundariesForRange() is implemented, it blindly + // updates meta.smallest and meta.largest to smallest_candidate and + // largest_candidate the first time it is called. Subsequently, it + // compares input parameter with meta.smallest and meta.largest and only + // updates them when input is smaller/larger. So we only need to show + // smallest_candidate <= largest_candidate the first time + // UpdateBoundariesForRange() is called. Here we show something stronger + // that smallest_candidate.user_key < largest_candidate.user_key always + // holds for Case 1. + // We assume comp_start_user_key < comp_end_user_key, if provided. We + // assume that tombstone_start < tombstone_end. This assumption is based + // on that each fragment in FragmentedTombstoneList has + // start_key < end_key (user_key) and that + // FragmentedTombstoneIterator::Tombstone() returns the pair + // (start_key@tombstone_seqno with op_type kTypeRangeDeletion, end_key).
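The truncation earlier in this loop amounts to clipping the tombstone's internal key range to the file's range, and invariant (2) above is exactly the precondition that makes the clipped range non-empty. A sketch under simplified assumptions (plain ints standing in for internal keys; the names are invented for illustration):

#include <algorithm>
#include <cassert>

// Inclusive key range; assumption: a smaller int means a smaller internal key.
struct Range {
  int start;
  int end;
};

// Clip `tombstone` to the file's [meta.smallest, meta.largest] range. The
// assert is invariant (2): the two ranges must overlap for the result to be
// a valid, non-empty range.
Range ClipToFile(Range tombstone, Range file) {
  assert(tombstone.start <= file.end && file.start <= tombstone.end);
  return {std::max(tombstone.start, file.start),
          std::min(tombstone.end, file.end)};
}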
+ // The logic in this loop sets smallest_candidate to + // max(tombstone_start.user_key, comp_start_user_key)@tombstone.seq_ with + // op_type kTypeRangeDeletion, largest_candidate to + // min(tombstone_end.user_key, comp_end_user_key)@kMaxSequenceNumber with + // op_type kTypeRangeDeletion. When a bound is null, there is no + // truncation on that end. To show that smallest_candidate.user_key < + // largest_candidate.user_key, it suffices to show + // tombstone_start.user_key < comp_end_user_key (if not null) AND + // comp_start_user_key (if not null) < tombstone_end.user_key. + // Since the file has no point key, `has_overlapping_endpoints` is false. + // In the first sanity check of this for-loop, we compare + // tombstone_start.user_key against upper_bound = comp_end_user_key, + // and only proceed if tombstone_start.user_key < comp_end_user_key. + // We assume FragmentedTombstoneIterator::Seek(k) lands + // on a tombstone with end_key > k. So the call it->Seek(*lower_bound) + // above implies comp_start_user_key < tombstone_end.user_key. + // + // To show (2) [tombstone_start, tombstone_end] and [meta.smallest, + // meta.largest] overlap (after the call to UpdateBoundariesForRange()): + // In the proof for (1) we have shown that + // smallest_candidate <= largest_candidate. Since tombstone_start <= + // smallest_candidate <= largest_candidate <= tombstone_end, for (2) to + // hold, it suffices to show that [smallest_candidate, largest_candidate] + // overlaps with [meta.smallest, meta.largest] too. + // Given meta.smallest <= meta.largest shown above, we need to show + // that it is impossible to have largest_candidate < meta.smallest or + // meta.largest < smallest_candidate. If the above + // meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate) + // updates meta.largest or meta.smallest, then the two ranges overlap. + // So we assume meta.UpdateBoundariesForRange(smallest_candidate, + // largest_candidate) did not update meta.smallest nor meta.largest, which + // means meta.smallest < smallest_candidate and largest_candidate < + // meta.largest. + // + // Case 2. Output file has >= 1 point key. This means meta.smallest and + // meta.largest are not empty when AddRangeDels() is called. + // To show (1) meta.smallest <= meta.largest: + // Assume meta.smallest <= meta.largest when AddRangeDels() is called; + // this follows from how UpdateBoundariesForRange() is implemented where it + // takes min or max to update meta.smallest or meta.largest. + // + // To show (2) [tombstone_start, tombstone_end] and [meta.smallest, + // meta.largest] overlap (after the call to UpdateBoundariesForRange()): + // When smallest_candidate <= largest_candidate, the proof in Case 1 + // applies, so we only need to show (2) holds when smallest_candidate > + // largest_candidate. When both bounds are either null or from + // subcompaction boundary, the proof in Case 1 applies, so we only need to + // show (2) holds when at least one bound is from a point key (either + // meta.smallest for lower bound or next_table_min_key for upper bound). + // + // Suppose lower bound is meta.smallest.user_key. The call + // it->Seek(*lower_bound) implies tombstone_end.user_key > + // meta.smallest.user_key. We have smallest_candidate.user_key = + // max(tombstone_start.user_key, meta.smallest.user_key). For + // smallest_candidate to be > largest_candidate, we need + // largest_candidate.user_key = upper_bound = smallest_candidate.user_key, + // where tombstone_end is truncated to largest_candidate.
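The argument above repeatedly appeals to how UpdateBoundariesForRange() behaves: the first call seeds meta.smallest and meta.largest unconditionally, and later calls only widen them, which is why showing smallest_candidate <= largest_candidate on the first call is enough for invariant (1). A minimal sketch of that behavior (ints for keys; the struct is invented for illustration):

#include <algorithm>
#include <optional>

struct FileBounds {
  std::optional<int> smallest;
  std::optional<int> largest;

  // First call seeds both bounds; subsequent calls only widen them.
  void UpdateBoundariesForRange(int lo, int hi) {
    smallest = smallest.has_value() ? std::min(*smallest, lo) : lo;
    largest = largest.has_value() ? std::max(*largest, hi) : hi;
  }
};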
+ // Subcase 1: + // Suppose largest_candidate.user_key = comp_end_user_key (there is no + // next point key). Subcompaction ensures any point key from this + // subcompaction has a user_key < comp_end_user_key, so 1) + // meta.smallest.user_key < comp_end_user_key, 2) + // `has_overlapping_endpoints` is false, and the first if condition in + // this for-loop ensures tombstone_start.user_key < comp_end_user_key. So + // smallest_candidate.user_key < largest_candidate.user_key. This case + // cannot happen when smallest_candidate > largest_candidate. + // Subcase 2: + // Suppose largest_candidate.user_key = next_table_min_key.user_key. + // The first if condition in this for-loop together with + // smallest_candidate.user_key = next_table_min_key.user_key = + // upper_bound implies `has_overlapping_endpoints` is true (so + // meta.largest.user_key = upper_bound) and + // tombstone.seq_ < meta.largest.seqno. So + // tombstone_start < meta.largest < tombstone_end. + // + // Suppose lower bound is comp_start_user_key and upper_bound is + // next_table_min_key. The call it->Seek(*lower_bound) implies we have + // tombstone_end.user_key > comp_start_user_key. So + // tombstone_end.user_key > smallest_candidate.user_key. For + // smallest_candidate to be > largest_candidate, we need + // tombstone_start.user_key = largest_candidate.user_key = upper_bound = + // next_table_min_key.user_key. This means `has_overlapping_endpoints` is + // true (so meta.largest.user_key = upper_bound) and tombstone.seq_ < + // meta.largest.seqno. So tombstone_start < meta.largest < tombstone_end. } return Status::OK(); } diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index f40aa8215be..52233917f0f 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -221,6 +221,13 @@ class CompactionOutputs { } } + // Updates states related to file cutting for TTL. + // Returns a boolean value indicating whether the current + // compaction output file should be cut before `internal_key`. + // + // @param internal_key the current key to be added to output. + bool UpdateFilesToCutForTTLStates(const Slice& internal_key); + // update tracked grandparents information like grandparent index, if it's // in the gap between 2 grandparent files, accumulated grandparent files size // etc. diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index abdecca9f21..5fe058b56d1 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -31,27 +31,15 @@ bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, - CompactionInputFiles* comp_inputs, - SequenceNumber earliest_mem_seqno) { - // Do not pick ingested file when there is at least one memtable not flushed - // which of seqno is overlap with the sst. + CompactionInputFiles* comp_inputs) { TEST_SYNC_POINT("FindIntraL0Compaction"); + size_t start = 0; - for (; start < level_files.size(); start++) { - if (level_files[start]->being_compacted) { - return false; - } - // If there is no data in memtable, the earliest sequence number would the - // largest sequence number in last memtable. - // Because all files are sorted in descending order by largest_seqno, so we - // only need to check the first one.
- if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) { - break; - } - } - if (start >= level_files.size()) { + + if (level_files.size() == 0 || level_files[start]->being_compacted) { return false; } + size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size); size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max(); // Compaction range will be [start, limit). @@ -995,6 +983,7 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( current_files[f].name + " is currently being compacted."); } + input_files->insert(TableFileNameToNumber(current_files[f].name)); } @@ -1137,7 +1126,11 @@ void CompactionPicker::RegisterCompaction(Compaction* c) { c->output_level() == 0 || !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(), c->GetPenultimateLevel())); - if (c->start_level() == 0 || + // CompactionReason::kExternalSstIngestion's start level is just a placeholder + // number without actual meaning as file ingestion technically does not have + // an input level like other compactions + if ((c->start_level() == 0 && + c->compaction_reason() != CompactionReason::kExternalSstIngestion) || ioptions_.compaction_style == kCompactionStyleUniversal) { level0_compactions_in_progress_.insert(c); } diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index 7739dd96b63..d98af851bfe 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -51,14 +51,15 @@ class CompactionPicker { virtual ~CompactionPicker(); // Pick level and inputs for a new compaction. + // // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -91,6 +92,7 @@ class CompactionPicker { // files. If it's not possible to convert an invalid input_files // into a valid one by adding more files, the function will return a // non-ok status with specific reason. +// #ifndef ROCKSDB_LITE Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files, const ColumnFamilyMetaData& cf_meta, @@ -255,12 +257,11 @@ class NullCompactionPicker : public CompactionPicker { virtual ~NullCompactionPicker() {} // Always return "nullptr" - Compaction* PickCompaction( - const std::string& /*cf_name*/, - const MutableCFOptions& /*mutable_cf_options*/, - const MutableDBOptions& /*mutable_db_options*/, - VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, - SequenceNumber /* earliest_memtable_seqno */) override { + Compaction* PickCompaction(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, + VersionStorageInfo* /*vstorage*/, + LogBuffer* /* log_buffer */) override { return nullptr; } @@ -304,11 +305,11 @@ class NullCompactionPicker : public CompactionPicker { // files. Cannot be nullptr.
// // @return true iff compaction was found. -bool FindIntraL0Compaction( - const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact, - uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, - CompactionInputFiles* comp_inputs, - SequenceNumber earliest_mem_seqno = kMaxSequenceNumber); +bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, + size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs); CompressionType GetCompressionType(const VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 1f875e3e136..362e64e16f2 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -232,7 +232,8 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( break; } } - } else { + } else if (total_size > + mutable_cf_options.compaction_options_fifo.max_table_files_size) { // If the last level is non-L0, we actually don't know which file is // logically the oldest since the file creation time only represents // when this file was compacted to this level, which is independent @@ -247,15 +248,27 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( inputs[0].files.push_back(f); char tmp_fsize[16]; AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with size %s for deletion", - cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion under total size %" PRIu64 + " vs max table files size %" PRIu64, + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize, total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + if (total_size <= mutable_cf_options.compaction_options_fifo.max_table_files_size) { break; } } + } else { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: nothing to do. 
Total size %" PRIu64 + ", max size %" PRIu64 "\n", + cf_name.c_str(), total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + return nullptr; } Compaction* c = new Compaction( @@ -389,7 +402,7 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm( Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) { + LogBuffer* log_buffer) { Compaction* c = nullptr; if (mutable_cf_options.ttl > 0) { c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index 544259f38af..1db760185de 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -19,11 +19,11 @@ class FIFOCompactionPicker : public CompactionPicker { const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer) override; virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index b689b6add3d..2162d30a30a 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -50,7 +50,6 @@ class LevelCompactionBuilder { public: LevelCompactionBuilder(const std::string& cf_name, VersionStorageInfo* vstorage, - SequenceNumber earliest_mem_seqno, CompactionPicker* compaction_picker, LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, @@ -58,7 +57,6 @@ class LevelCompactionBuilder { const MutableDBOptions& mutable_db_options) : cf_name_(cf_name), vstorage_(vstorage), - earliest_mem_seqno_(earliest_mem_seqno), compaction_picker_(compaction_picker), log_buffer_(log_buffer), mutable_cf_options_(mutable_cf_options), @@ -122,7 +120,6 @@ class LevelCompactionBuilder { const std::string& cf_name_; VersionStorageInfo* vstorage_; - SequenceNumber earliest_mem_seqno_; CompactionPicker* compaction_picker_; LogBuffer* log_buffer_; int start_level_ = -1; @@ -196,7 +193,10 @@ void LevelCompactionBuilder::SetupInitialFiles() { } output_level_ = (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact()) { + bool picked_file_to_compact = PickFileToCompact(); + TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact", + &picked_file_to_compact); + if (picked_file_to_compact) { // found the compaction! if (start_level_ == 0) { // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` @@ -447,21 +447,21 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { compaction_inputs_.push_back(output_level_inputs_); } + // In some edge cases we could pick a compaction that will be compacting + // a key range that overlaps with another running compaction, and both + // of them have the same output level. 
This could happen if + (1) we are running a non-exclusive manual compaction + (2) AddFile ingests a new file into the LSM tree + // We need to disallow this from happening. + if (compaction_picker_->FilesRangeOverlapWithCompaction( + compaction_inputs_, output_level_, + Compaction::EvaluatePenultimateLevel( + vstorage_, ioptions_, start_level_, output_level_))) { + // This compaction output could potentially conflict with the output + // of a currently running compaction, so we cannot run it. + return false; + } if (!is_l0_trivial_move_) { - // In some edge cases we could pick a compaction that will be compacting - // a key range that overlap with another running compaction, and both - // of them have the same output level. This could happen if - // (1) we are running a non-exclusive manual compaction - // (2) AddFile ingest a new file into the LSM tree - // We need to disallow this from happening. - if (compaction_picker_->FilesRangeOverlapWithCompaction( - compaction_inputs_, output_level_, - Compaction::EvaluatePenultimateLevel( - vstorage_, ioptions_, start_level_, output_level_))) { - // This compaction output could potentially conflict with the output - // of a currently running compaction, we cannot run it. - return false; - } compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_, output_level_inputs_, &grandparents_); } @@ -825,16 +825,16 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() { return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, std::numeric_limits<uint64_t>::max(), mutable_cf_options_.max_compaction_bytes, - &start_level_inputs_, earliest_mem_seqno_); + &start_level_inputs_); } } // namespace Compaction* LevelCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) { - LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, - log_buffer, mutable_cf_options, ioptions_, + LogBuffer* log_buffer) { + LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, + mutable_cf_options, ioptions_, mutable_db_options); return builder.PickCompaction(); } diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h index 42a9b60a632..6eb0f586f4d 100644 --- a/db/compaction/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -20,11 +20,11 @@ class LevelCompactionPicker : public CompactionPicker { LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; virtual bool NeedsCompaction( const VersionStorageInfo* vstorage) const override; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 2e2e566c0ad..865518cb200 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -77,8 +77,9 @@ class CompactionPickerTestBase : public 
testing::Test { void NewVersionStorage(int num_levels, CompactionStyle style) { DeleteVersionStorage(); options_.num_levels = num_levels; - vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels, - style, nullptr, false)); + vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, style, nullptr, false, + EpochNumberRequirement::kMustPresent)); vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_); } @@ -87,7 +88,7 @@ class CompactionPickerTestBase : public testing::Test { void AddVersionStorage() { temp_vstorage_.reset(new VersionStorageInfo( &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style, - vstorage_.get(), false)); + vstorage_.get(), false, EpochNumberRequirement::kMustPresent)); } void DeleteVersionStorage() { @@ -105,7 +106,8 @@ class CompactionPickerTestBase : public testing::Test { size_t compensated_file_size = 0, bool marked_for_compact = false, Temperature temperature = Temperature::kUnknown, uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime, - Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) { + Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice(), + uint64_t epoch_number = kUnknownEpochNumber) { assert(ts_of_smallest.size() == ucmp_->timestamp_size()); assert(ts_of_largest.size() == ucmp_->timestamp_size()); @@ -145,8 +147,8 @@ class CompactionPickerTestBase : public testing::Test { file_number, path_id, file_size, smallest_ikey, largest_ikey, smallest_seq, largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; f->oldest_ancester_time = oldest_ancestor_time; @@ -2871,39 +2873,6 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { ASSERT_EQ(0, compaction->output_level()); } -TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { - // Intra L0 compaction triggers only if there are at least - // level0_file_num_compaction_trigger + 2 L0 files. - mutable_cf_options_.level0_file_num_compaction_trigger = 3; - mutable_cf_options_.max_compaction_bytes = 999999u; - NewVersionStorage(6, kCompactionStyleLevel); - - // 4 out of 6 L0 files will be picked for intra L0 compaction due to - // being_compact limit. And the latest one L0 will be skipped due to earliest - // seqno. The one L1 file spans entire L0 key range and is marked as being - // compacted to avoid L0->L1 compaction. 
- Add(1, 1U, "100", "350", 200000U, 0, 110, 111); - Add(0, 2U, "301", "350", 1U, 0, 108, 109); - Add(0, 3U, "251", "300", 1U, 0, 106, 107); - Add(0, 4U, "201", "250", 1U, 0, 104, 105); - Add(0, 5U, "151", "200", 1U, 0, 102, 103); - Add(0, 6U, "100", "150", 1U, 0, 100, 101); - Add(0, 7U, "100", "100", 1U, 0, 99, 100); - vstorage_->LevelFiles(0)[5]->being_compacted = true; - vstorage_->LevelFiles(1)[0]->being_compacted = true; - UpdateVersionStorageInfo(); - - std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_, 107)); - ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1U, compaction->num_input_levels()); - ASSERT_EQ(4U, compaction->num_input_files(0)); - ASSERT_EQ(CompactionReason::kLevelL0FilesNum, - compaction->compaction_reason()); - ASSERT_EQ(0, compaction->output_level()); -} - #ifndef ROCKSDB_LITE TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { const uint64_t kFileSize = 100000; @@ -2916,9 +2885,23 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { // should fail NewVersionStorage(5, kCompactionStyleUniversal); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); - Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 3); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 2); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 1); Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251); Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); @@ -2940,7 +2923,11 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { AddVersionStorage(); // Simulate a flush and mark the file for compaction - Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true); + Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 4); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction2( @@ -2962,7 +2949,11 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { NewVersionStorage(5, kCompactionStyleUniversal); // Mark file number 4 for compaction - Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 1); Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250); Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); @@ 
-2983,8 +2974,17 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { ASSERT_EQ(1U, compaction->num_input_files(1)); AddVersionStorage(); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 3); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 2); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction2( @@ -3150,10 +3150,29 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { NewVersionStorage(1, kCompactionStyleUniversal); // Mark file number 5 for compaction - Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300); - Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); - Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); - Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 4); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 3); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 2); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 1); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction( @@ -3173,8 +3192,18 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { ASSERT_TRUE(file_map_[6].first->being_compacted); AddVersionStorage(); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 6); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 5); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction2( diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index dbdd4934b6b..3ef4e70b3a6 100644 --- 
a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -293,7 +293,7 @@ bool UniversalCompactionPicker::NeedsCompaction( Compaction* UniversalCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) { + LogBuffer* log_buffer) { UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options, vstorage, this, log_buffer); @@ -308,9 +308,10 @@ void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf, if (file->fd.GetPathId() == 0 || !print_path) { snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber()); } else { - snprintf(out_buf, out_buf_size, "file %" PRIu64 - "(path " - "%" PRIu32 ")", + snprintf(out_buf, out_buf_size, + "file %" PRIu64 + "(path " + "%" PRIu32 ")", file->fd.GetNumber(), file->fd.GetPathId()); } } else { @@ -399,6 +400,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { // Always need to do a full compaction for periodic compaction. c = PickPeriodicCompaction(); + TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c); } // Check for size amplification. @@ -407,6 +409,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { static_cast<int>( mutable_cf_options_.level0_file_num_compaction_trigger)) { if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { + TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr"); ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", cf_name_.c_str()); } else { @@ -416,6 +419,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { mutable_cf_options_.compaction_options_universal.size_ratio; if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { + TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr"); ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size ratio\n", cf_name_.c_str()); @@ -456,6 +460,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { if (c == nullptr) { if ((c = PickDeleteTriggeredCompaction()) != nullptr) { + TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr"); ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: delete triggered compaction\n", cf_name_.c_str()); diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index 5f897cc9b39..558733195d8 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -18,11 +18,11 @@ class UniversalCompactionPicker : public CompactionPicker { UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } virtual bool NeedsCompaction( diff --git 
a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index 1d2e99d9917..1f6c0b71010 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -190,6 +190,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( meta.largest.DecodeFrom(file.largest_internal_key); meta.oldest_ancester_time = file.oldest_ancester_time; meta.file_creation_time = file.file_creation_time; + meta.epoch_number = file.epoch_number; meta.marked_for_compaction = file.marked_for_compaction; meta.unique_id = file.unique_id; @@ -333,8 +334,9 @@ Status CompactionServiceCompactionJob::Run() { MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(), meta.oldest_ancester_time, - meta.file_creation_time, output_file.validator.GetHash(), - meta.marked_for_compaction, meta.unique_id); + meta.file_creation_time, meta.epoch_number, + output_file.validator.GetHash(), meta.marked_for_compaction, + meta.unique_id); } InternalStats::CompactionStatsFull compaction_stats; sub_compact->AggregateCompactionStats(compaction_stats); @@ -489,6 +491,10 @@ static std::unordered_map {offsetof(struct CompactionServiceOutputFile, file_creation_time), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"epoch_number", + {offsetof(struct CompactionServiceOutputFile, epoch_number), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"paranoid_hash", {offsetof(struct CompactionServiceOutputFile, paranoid_hash), OptionType::kUInt64T, OptionVerificationType::kNormal, diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h index 13e63120fe3..c748be31bb5 100644 --- a/db/compaction/subcompaction_state.h +++ b/db/compaction/subcompaction_state.h @@ -196,8 +196,11 @@ class SubcompactionState { const CompactionFileCloseFunc& close_file_func) { // Call FinishCompactionOutputFile() even if status is not ok: it needs to // close the output file. + // CloseOutput() may open new compaction output files. 
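The `is_current_penultimate_level_` set/reset just below exists because CloseOutput() may itself open new output files, and those files apparently need to be attributed to the right level while it runs. A minimal sketch, assuming only standard C++, of an RAII guard that would keep such a set/reset pair balanced even if an early return were later added between the two assignments; `ScopedFlag` is a hypothetical helper, not part of RocksDB:

// Hypothetical RAII guard: sets a bool flag on construction and resets it
// on every exit path (normal return, early return, or exception).
class ScopedFlag {
 public:
  explicit ScopedFlag(bool& flag) : flag_(flag) { flag_ = true; }
  ~ScopedFlag() { flag_ = false; }
  ScopedFlag(const ScopedFlag&) = delete;
  ScopedFlag& operator=(const ScopedFlag&) = delete;

 private:
  bool& flag_;
};

// Usage sketch mirroring CloseCompactionFiles():
//   ScopedFlag guard(is_current_penultimate_level_);
//   Status s = penultimate_level_outputs_.CloseOutput(...);
//   // the flag resets automatically when `guard` leaves scope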
+ is_current_penultimate_level_ = true; Status s = penultimate_level_outputs_.CloseOutput( curr_status, open_file_func, close_file_func); + is_current_penultimate_level_ = false; s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func); return s; } diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index aaebcfd94d1..f4837dcf9b3 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -663,8 +663,19 @@ TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) { cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), - 0); // tombstone has no size, even it's in hot tier + // range tombstone is not in cold tier + ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + // range tombstone is in the penultimate level + const int penultimate_level = kNumLevels - 2; + ASSERT_EQ(level_to_files[penultimate_level].size(), 1); + ASSERT_EQ(level_to_files[penultimate_level][0].num_entries, 1); + ASSERT_EQ(level_to_files[penultimate_level][0].num_deletions, 1); + ASSERT_EQ(level_to_files[penultimate_level][0].temperature, + Temperature::kUnknown); + ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); ASSERT_EQ("0,1,10", FilesPerLevel()); // one file is at the penultimate level which @@ -1240,7 +1251,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); int sst_num = 0; @@ -1248,7 +1259,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { for (; sst_num < kNumTrigger; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1302,7 +1313,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); int sst_num = 0; @@ -1310,7 +1321,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { for (; sst_num < kNumTrigger; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1344,7 +1355,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { for (int i = 0; i < kNumKeys; i++) { // the value needs to be big enough to trigger full compaction ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100))); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { 
mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1378,7 +1389,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); int sst_num = 0; @@ -1386,7 +1397,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { for (; sst_num < kNumTrigger; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1452,13 +1463,13 @@ TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) { Random rnd(301); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(rnd.Uniform(10) + 1)); }); for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(i), rnd.RandomString(100))); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(rnd.Uniform(2))); }); } @@ -1505,7 +1516,7 @@ TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); int sst_num = 0; @@ -1513,7 +1524,7 @@ TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { for (; sst_num < kNumTrigger; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1583,7 +1594,7 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); Random rnd(301); @@ -1592,7 +1603,7 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { for (; sst_num < kNumTrigger; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100))); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1685,7 +1696,7 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { for (int i = 0; i < kNumKeys; i++) { // the value needs to be big enough to trigger full compaction ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -1710,6 +1721,133 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { Close(); } +TEST_P(PrecludeLastLevelTestWithParms, 
PeriodicCompactionToPenultimateLevel) {
+  // Test the last level only periodic compaction should also be blocked by an
+  // ongoing compaction in penultimate level if tiered compaction is enabled
+  // otherwise, the periodic compaction should just run for the last level.
+  const int kNumTrigger = 4;
+  const int kNumLevels = 7;
+  const int kPenultimateLevel = kNumLevels - 2;
+  const int kKeyPerSec = 1;
+  const int kNumKeys = 100;
+
+  bool enable_preclude_last_level = GetParam();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.preserve_internal_time_seconds = 20000;
+  options.env = mock_env_.get();
+  options.level0_file_num_compaction_trigger = kNumTrigger;
+  options.num_levels = kNumLevels;
+  options.ignore_max_compaction_bytes_for_input = false;
+  options.periodic_compaction_seconds = 10000;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  for (int i = 0; i < 3 * kNumKeys; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+    dbfull()->TEST_WaitForPeriodicTaskRun(
+        [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // make sure all data is compacted to the last level
+  ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+  // enable preclude feature
+  if (enable_preclude_last_level) {
+    options.preclude_last_level_data_seconds = 20000;
+  }
+  options.max_background_jobs = 8;
+  options.last_level_temperature = Temperature::kCold;
+  Reopen(options);
+
+  std::atomic_bool is_size_ratio_compaction_running = false;
+  std::atomic_bool verified_last_level_compaction = false;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        if (compaction->output_level() == kPenultimateLevel) {
+          is_size_ratio_compaction_running = true;
+          TEST_SYNC_POINT(
+              "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+              "SizeRatioCompaction1");
+          TEST_SYNC_POINT(
+              "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+              "SizeRatioCompaction2");
+          is_size_ratio_compaction_running = false;
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+
+        if (is_size_ratio_compaction_running) {
+          if (enable_preclude_last_level) {
+            ASSERT_TRUE(compaction == nullptr);
+          } else {
+            ASSERT_TRUE(compaction != nullptr);
+            ASSERT_EQ(compaction->compaction_reason(),
+                      CompactionReason::kPeriodicCompaction);
+            ASSERT_EQ(compaction->start_level(), kNumLevels - 1);
+          }
+          verified_last_level_compaction = true;
+        }
+        TEST_SYNC_POINT(
+            "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+            "AutoCompactionPicked");
+      });
+
+  SyncPoint::GetInstance()->LoadDependency({
+      {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "SizeRatioCompaction1",
+       "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite"},
+      {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "AutoCompactionPicked",
+       "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "SizeRatioCompaction2"},
+  });
+
+  auto stop_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  for (int i = 0; i < kNumTrigger - 1; i++) {
+    for (int j = 0; j < kNumKeys; j++) {
+      ASSERT_OK(Put(Key(i * (kNumKeys - 1) + i), rnd.RandomString(10)));
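An aside on the ordering machinery used in the callbacks above: each {predecessor, successor} pair passed to LoadDependency() makes any thread reaching the successor point block until some thread has passed the predecessor. A minimal standalone sketch of the same mechanism; point names are illustrative, and the header is RocksDB's internal, test-only sync point facility:

#include "test_util/sync_point.h"

void OrderTwoThreadsSketch() {
  using ROCKSDB_NAMESPACE::SyncPoint;
  // Thread B must not pass "Sketch:ReaderStart" until some thread has
  // passed "Sketch:WriterDone".
  SyncPoint::GetInstance()->LoadDependency(
      {{"Sketch:WriterDone", "Sketch:ReaderStart"}});
  SyncPoint::GetInstance()->EnableProcessing();
  // Thread A: TEST_SYNC_POINT("Sketch:WriterDone");   // unblocks thread B
  // Thread B: TEST_SYNC_POINT("Sketch:ReaderStart");  // waits for thread A
  SyncPoint::GetInstance()->DisableProcessing();
}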
+ dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); }); + } + ASSERT_OK(Flush()); + } + + TEST_SYNC_POINT( + "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite"); + + // wait for periodic compaction time and flush to trigger the periodic + // compaction, which should be blocked by ongoing compaction in the + // penultimate level + mock_clock_->MockSleepForSeconds(10000); + for (int i = 0; i < 3 * kNumKeys; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(10))); + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); }); + } + ASSERT_OK(Flush()); + + ASSERT_OK(dbfull()->WaitForCompact(true)); + + stop_token.reset(); + + Close(); +} + INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms, PrecludeLastLevelTestWithParms, testing::Bool()); @@ -1770,14 +1908,14 @@ TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); Random rnd(301); for (int i = 0; i < 300; i++) { ASSERT_OK(Put(Key(i), rnd.RandomString(100))); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); }); } ASSERT_OK(Flush()); @@ -1858,155 +1996,145 @@ TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) { Close(); } -struct TestPropertiesCollector : public TablePropertiesCollector { - Status AddUserKey(const Slice& key, const Slice& /*value*/, - EntryType /*type*/, SequenceNumber /*seq*/, - uint64_t /*file_size*/) override { - if (cmp->Compare(key, DBTestBase::Key(100)) == 0) { - has_key_100 = true; - } - if (cmp->Compare(key, DBTestBase::Key(200)) == 0) { - has_key_200 = true; - } - - return Status::OK(); - } - - const char* Name() const override { return "TestTablePropertiesCollector"; } - - UserCollectedProperties GetReadableProperties() const override { - UserCollectedProperties ret; - return ret; - } - - Status Finish(UserCollectedProperties* /*properties*/) override { - // The LSM tree would be like: - // L5: [0,19] [20,39] [40,299] - // L6: [0, 299] - // the 3rd file @L5 has both 100 and 200, which will be marked for - // compaction - // Also avoid marking flushed SST for compaction, which won't have both 100 - // and 200 - if (has_key_100 && has_key_200) { - need_compact_ = true; - } else { - need_compact_ = false; - } - has_key_100 = false; - has_key_200 = false; - return Status::OK(); - } - - bool NeedCompact() const override { return need_compact_; } - - const Comparator* cmp = BytewiseComparator(); - - private: - bool has_key_100 = false; - bool has_key_200 = false; - - bool need_compact_ = false; -}; - -class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory { - public: - TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context /*context*/) override { - return new TestPropertiesCollector; - } - const char* Name() const override { return "TestTablePropertiesCollector"; } -}; - -TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) { - const int kNumTrigger = 4; +TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { const int kNumLevels = 7; - const int kKeyPerSec = 10; + const int kSecondsPerKey = 10; + const int kNumFiles = 
3; + const int kValueBytes = 4 << 10; + const int kFileBytes = 4 * kValueBytes; + // `kNumKeysPerFile == 5` is determined by the current file cutting heuristics + // for this choice of `kValueBytes` and `kFileBytes`. + const int kNumKeysPerFile = 5; + const int kNumKeys = kNumFiles * kNumKeysPerFile; Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.env = mock_env_.get(); - options.level0_file_num_compaction_trigger = kNumTrigger; - options.preserve_internal_time_seconds = 10000; + options.last_level_temperature = Temperature::kCold; + options.preserve_internal_time_seconds = 600; + options.preclude_last_level_data_seconds = 1; options.num_levels = kNumLevels; - // set a small max_compaction_bytes to avoid input level expansion - options.max_compaction_bytes = 30000; - options.ignore_max_compaction_bytes_for_input = false; + options.target_file_size_base = kFileBytes; DestroyAndReopen(options); // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kSecondsPerKey)); + }); + // Flush an L0 file with the following contents (new to old): + // + // Range deletions [4, 6) [7, 8) [9, 11) + // --- snap2 --- + // Key(0) .. Key(14) + // --- snap1 --- + // Key(3) .. Key(17) + const auto verify_db = [&]() { + for (int i = 0; i < kNumKeys; i++) { + std::string value; + auto s = db_->Get(ReadOptions(), Key(i), &value); + if (i == 4 || i == 5 || i == 7 || i == 9 || i == 10) { + ASSERT_TRUE(s.IsNotFound()); + } else { + ASSERT_OK(s); + } + } + }; Random rnd(301); - - for (int i = 0; i < 300; i++) { - ASSERT_OK(Put(Key(i), rnd.RandomString(100))); - dbfull()->TEST_WaitForPeridicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); }); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i + 3), rnd.RandomString(kValueBytes))); + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerKey); }); } - ASSERT_OK(Flush()); - CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - - // make sure all data is compacted to the last level - ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); - - // Create 3 L5 files - auto factory = std::make_shared(); - options.sst_partitioner_factory = factory; - - // the user defined properties_collector will mark the 3rd file for compaction - auto collector_factory = std::make_shared(); - options.table_properties_collector_factories.resize(1); - options.table_properties_collector_factories[0] = collector_factory; - // enable tiered storage feature - options.preclude_last_level_data_seconds = 10000; - options.last_level_temperature = Temperature::kCold; - Reopen(options); - - for (int i = 0; i < kNumTrigger - 2; i++) { - for (int j = 0; j < 100; j++) { - ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10))); - } - ASSERT_OK(Flush()); + auto* snap1 = db_->GetSnapshot(); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValueBytes))); + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerKey); }); } + auto* snap2 = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(kNumKeysPerFile - 1), + 
Key(kNumKeysPerFile + 1))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(kNumKeysPerFile + 2), + Key(kNumKeysPerFile + 3))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2 * kNumKeysPerFile - 1), + Key(2 * kNumKeysPerFile + 1))); + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForPeriodicTaskRun( + [&] { mock_clock_->MockSleepForSeconds(kSecondsPerKey); }); + verify_db(); - // make sure there is one and only one compaction supports per-key placement - // but has the penultimate level output disabled. + // Count compactions supporting per-key placement std::atomic_int per_key_comp_num = 0; SyncPoint::GetInstance()->SetCallBack( "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) { auto compaction = static_cast(arg); if (compaction->SupportsPerKeyPlacement()) { ASSERT_EQ(compaction->GetPenultimateOutputRangeType(), - Compaction::PenultimateOutputRangeType::kDisabled); + Compaction::PenultimateOutputRangeType::kNonLastRange); per_key_comp_num++; } }); - SyncPoint::GetInstance()->EnableProcessing(); - for (int j = 0; j < 100; j++) { - ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10))); - } - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(32), Key(40))); - ASSERT_OK(Flush()); - - // Before the per-key placement compaction, the LSM tress should be like: - // L5: [0,19] [20,40] [40,299] - // L6: [0, 299] - // The 2nd file @L5 has the largest key 40 because of range del - + // The `CompactRange()` writes the following files to L5. + // + // [key000000#16,kTypeValue, + // key000005#kMaxSequenceNumber,kTypeRangeDeletion] + // [key000005#21,kTypeValue, + // key000010#kMaxSequenceNumber,kTypeRangeDeletion] + // [key000010#26,kTypeValue, key000014#30,kTypeValue] + // + // And it writes the following files to L6. + // + // [key000003#1,kTypeValue, key000007#5,kTypeValue] + // [key000008#6,kTypeValue, key000012#10,kTypeValue] + // [key000013#11,kTypeValue, key000017#15,kTypeValue] + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel()); + verify_db(); + + // Rewrite the middle file only. File endpoints should not change. + std::string begin_key_buf = Key(kNumKeysPerFile + 1), + end_key_buf = Key(kNumKeysPerFile + 2); + Slice begin_key(begin_key_buf), end_key(end_key_buf); + ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key, + &end_key)); ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel()); + ASSERT_EQ(1, per_key_comp_num); + verify_db(); - ASSERT_EQ(per_key_comp_num, 1); - - // the compaction won't move any data to the penultimate level + // Rewrite the middle file again after releasing snap2. Still file endpoints + // should not change. + db_->ReleaseSnapshot(snap2); + ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key, + &end_key)); + ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel()); + ASSERT_EQ(2, per_key_comp_num); + verify_db(); + + // Middle file once more after releasing snap1. This time the data in the + // middle L5 file can all be compacted to the last level. 
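Why releasing the snapshots changes the outcome here: a live snapshot pins every key version that was visible when it was taken, so overwritten or range-deleted data cannot be dropped by compaction until the snapshot is released. A minimal sketch of that pinning behavior through the public API (error handling elided; illustrative only):

#include <string>
#include "rocksdb/db.h"

void SnapshotPinningSketch(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;
  const Snapshot* snap = db->GetSnapshot();
  Status s = db->Delete(WriteOptions(), "k");  // old "k" stays pinned by snap
  ReadOptions ro;
  ro.snapshot = snap;
  std::string value;
  s = db->Get(ro, "k", &value);  // still sees the pre-delete value
  db->ReleaseSnapshot(snap);     // now compaction may drop the old version
}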
+ db_->ReleaseSnapshot(snap1); + ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key, + &end_key)); + ASSERT_OK(dbfull()->WaitForCompact(true)); ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel()); + ASSERT_EQ(3, per_key_comp_num); + verify_db(); + + // Finish off the penultimate level. + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,0,3", FilesPerLevel()); + verify_db(); Close(); } diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 229ab9a5a33..e5e3493b3e6 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -17,7 +17,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" - namespace ROCKSDB_NAMESPACE { namespace { @@ -249,7 +248,7 @@ class TwoStrComparator : public Comparator { void FindShortSuccessor(std::string* /*key*/) const override {} }; -} // namespace +} // anonymous namespace class ComparatorDBTest : public testing::Test, @@ -470,7 +469,7 @@ void VerifySuccessor(const Slice& s, const Slice& t) { ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s)); } -} // namespace +} // anonymous namespace TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { { diff --git a/db/convenience.cc b/db/convenience.cc index 81389112dfb..6344d356df3 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -26,8 +26,7 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, } Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, - const RangePtr* ranges, size_t n, - bool include_end) { + const RangePtr* ranges, size_t n, bool include_end) { return (static_cast_with_check(db->GetRootDB())) ->DeleteFilesInRanges(column_family, ranges, n, include_end); } @@ -47,9 +46,8 @@ Status VerifySstFileChecksum(const Options& options, InternalKeyComparator internal_comparator(options.comparator); ImmutableOptions ioptions(options); - Status s = ioptions.fs->NewRandomAccessFile(file_path, - FileOptions(env_options), - &file, nullptr); + Status s = ioptions.fs->NewRandomAccessFile( + file_path, FileOptions(env_options), &file, nullptr); if (s.ok()) { s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); } else { diff --git a/db/corruption_test.cc b/db/corruption_test.cc index b0369297959..7544d098c44 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -65,7 +65,7 @@ class ErrorEnv : public EnvWrapper { return target()->NewWritableFile(fname, result, soptions); } }; -} // namespace +} // anonymous namespace class CorruptionTest : public testing::Test { public: std::shared_ptr env_guard_; @@ -138,9 +138,7 @@ class CorruptionTest : public testing::Test { return DB::Open(opt, dbname_, &db_); } - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); - } + void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); } void RepairDB() { delete db_; @@ -156,7 +154,7 @@ class CorruptionTest : public testing::Test { DBImpl* dbi = static_cast_with_check(db_); ASSERT_OK(dbi->TEST_FlushMemTable()); } - //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); + // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); Slice key = Key(i + start, &key_space); batch.Clear(); ASSERT_OK(batch.Put(key, Value(i + start, &value_space))); @@ -183,8 +181,7 @@ class CorruptionTest : public testing::Test { ASSERT_OK(iter->status()); uint64_t key; Slice in(iter->key()); - if (!ConsumeDecimalNumber(&in, &key) || - !in.empty() || + if (!ConsumeDecimalNumber(&in, &key) || !in.empty() || key < next_expected) { bad_keys++; continue; @@ 
-200,10 +197,11 @@ class CorruptionTest : public testing::Test { iter->status().PermitUncheckedError(); delete iter; - fprintf(stderr, - "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n", - min_expected, max_expected, correct, bad_keys, bad_values, - static_cast(missed)); + fprintf( + stderr, + "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n", + min_expected, max_expected, correct, bad_keys, bad_values, + static_cast(missed)); ASSERT_LE(min_expected, correct); ASSERT_GE(max_expected, correct); } @@ -217,8 +215,7 @@ class CorruptionTest : public testing::Test { std::string fname; int picked_number = -1; for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type == filetype && + if (ParseFileName(filenames[i], &number, &type) && type == filetype && static_cast(number) > picked_number) { // Pick latest file fname = dbname_ + "/" + filenames[i]; picked_number = static_cast(number); @@ -226,7 +223,8 @@ class CorruptionTest : public testing::Test { } ASSERT_TRUE(!fname.empty()) << filetype; - ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt, + /*verify_checksum*/ filetype == kTableFile)); } // corrupts exactly one file at level `level`. if no file found at level, @@ -244,7 +242,6 @@ class CorruptionTest : public testing::Test { FAIL() << "no file found at level"; } - int Property(const std::string& name) { std::string property; int result; @@ -522,6 +519,90 @@ TEST_F(CorruptionTest, TableFileIndexData) { ASSERT_TRUE(TryReopen().IsCorruption()); } +TEST_F(CorruptionTest, TableFileFooterMagic) { + Build(100); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + Check(100, 100); + // Corrupt the whole footer + Corrupt(kTableFile, -100, 100); + Status s = TryReopen(); + ASSERT_TRUE(s.IsCorruption()); + // Contains useful message, and magic number should be the first thing + // reported as corrupt. + ASSERT_TRUE(s.ToString().find("magic number") != std::string::npos); + // with file name + ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos); +} + +TEST_F(CorruptionTest, TableFileFooterNotMagic) { + Build(100); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + Check(100, 100); + // Corrupt footer except magic number + Corrupt(kTableFile, -100, 92); + Status s = TryReopen(); + ASSERT_TRUE(s.IsCorruption()); + // The next thing checked after magic number is format_version + ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos); + // with file name + ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos); +} + +TEST_F(CorruptionTest, TableFileWrongSize) { + Build(100); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + Check(100, 100); + + // ******************************************** + // Make the file bigger by appending to it + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + std::string filename = dbname_ + metadata[0].name; + const auto& fs = options_.env->GetFileSystem(); + { + std::unique_ptr f; + ASSERT_OK(fs->ReopenWritableFile(filename, FileOptions(), &f, nullptr)); + ASSERT_OK(f->Append("blahblah", IOOptions(), nullptr)); + ASSERT_OK(f->Close(IOOptions(), nullptr)); + } + + // DB actually accepts this without paranoid checks, relying on size + // recorded in manifest to locate the SST footer. 
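The manifest-versus-filesystem size check that paranoid mode performs in the test below can be approximated by hand through public APIs. A minimal sketch, assuming an open DB (error handling elided; illustrative only):

#include <vector>
#include "rocksdb/db.h"

void CheckLiveFileSizesSketch(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;
  std::vector<LiveFileMetaData> files;
  db->GetLiveFilesMetaData(&files);
  for (const auto& f : files) {
    uint64_t on_disk = 0;
    Status s = db->GetEnv()->GetFileSize(f.db_path + f.name, &on_disk);
    if (s.ok() && on_disk != f.size) {
      // Corresponds to the "file size mismatch" corruption reported above.
    }
  }
}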
+ options_.paranoid_checks = false; + options_.skip_checking_sst_file_sizes_on_db_open = false; + Reopen(); + Check(100, 100); + + // But reports the issue with paranoid checks + options_.paranoid_checks = true; + Status s = TryReopen(); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos); + + // ******************************************** + // Make the file smaller with truncation. + // First leaving a partial footer, and then completely removing footer. + for (size_t bytes_lost : {8, 100}) { + ASSERT_OK( + test::TruncateFile(env_, filename, metadata[0].size - bytes_lost)); + + // Reported well with paranoid checks + options_.paranoid_checks = true; + s = TryReopen(); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos); + + // Without paranoid checks, not reported until read + options_.paranoid_checks = false; + Reopen(); + Check(0, 0); // Missing data + } +} + TEST_F(CorruptionTest, MissingDescriptor) { Build(1000); RepairDB(); diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2484c402d25..868b798ea58 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -77,9 +77,7 @@ class CuckooTableDBTest : public testing::Test { return db_->Put(WriteOptions(), k, v); } - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } std::string Get(const std::string& k) { ReadOptions options; @@ -313,23 +311,21 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { // Write some keys using plain table. std::shared_ptr block_based_factory( NewBlockBasedTableFactory()); - std::shared_ptr plain_table_factory( - NewPlainTableFactory()); - std::shared_ptr cuckoo_table_factory( - NewCuckooTableFactory()); + std::shared_ptr plain_table_factory(NewPlainTableFactory()); + std::shared_ptr cuckoo_table_factory(NewCuckooTableFactory()); options.create_if_missing = false; - options.table_factory.reset(NewAdaptiveTableFactory( - plain_table_factory, block_based_factory, plain_table_factory, - cuckoo_table_factory)); + options.table_factory.reset( + NewAdaptiveTableFactory(plain_table_factory, block_based_factory, + plain_table_factory, cuckoo_table_factory)); Reopen(&options); ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key1", "v5")); ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using block based table. 
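The AdaptiveTableFactory exercised in this test writes new files with its first factory argument and, as the reopen sequence above relies on, reads existing files back with whichever registered factory matches their on-disk format. A minimal construction sketch (non-LITE builds; default factory options and error handling elided):

#include "rocksdb/options.h"
#include "rocksdb/table.h"

ROCKSDB_NAMESPACE::Options MixedFormatOptions() {
  using namespace ROCKSDB_NAMESPACE;
  std::shared_ptr<TableFactory> bbt(NewBlockBasedTableFactory());
  std::shared_ptr<TableFactory> plain(NewPlainTableFactory());
  std::shared_ptr<TableFactory> cuckoo(NewCuckooTableFactory());
  Options options;
  // Write new SSTs as block-based; still able to read plain/cuckoo files.
  options.table_factory.reset(
      NewAdaptiveTableFactory(bbt, bbt, plain, cuckoo));
  return options;
}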
- options.table_factory.reset(NewAdaptiveTableFactory( - block_based_factory, block_based_factory, plain_table_factory, - cuckoo_table_factory)); + options.table_factory.reset( + NewAdaptiveTableFactory(block_based_factory, block_based_factory, + plain_table_factory, cuckoo_table_factory)); Reopen(&options); ASSERT_OK(Put("key5", "v6")); ASSERT_OK(Put("key2", "v7")); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 3515040ddce..f180d3ff9cb 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -32,6 +32,9 @@ namespace ROCKSDB_NAMESPACE { +static bool enable_io_uring = true; +extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; } + class DBBasicTest : public DBTestBase { public: DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {} @@ -2158,11 +2161,12 @@ class DBMultiGetAsyncIOTest : public DBBasicTest, : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) { BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10)); - Options options = CurrentOptions(); - options.disable_auto_compactions = true; - options.statistics = statistics_; - options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(options); + options_ = CurrentOptions(); + options_.disable_auto_compactions = true; + options_.statistics = statistics_; + options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options_.env = Env::Default(); + Reopen(options_); int num_keys = 0; // Put all keys in the bottommost level, and overwrite some keys @@ -2227,8 +2231,26 @@ class DBMultiGetAsyncIOTest : public DBBasicTest, const std::shared_ptr& statistics() { return statistics_; } + protected: + void PrepareDBForTest() { +#ifdef ROCKSDB_IOURING_PRESENT + Reopen(options_); +#else // ROCKSDB_IOURING_PRESENT + // Warm up the block cache so we don't need to use the IO uring + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + for (iter->SeekToFirst(); iter->Valid() && iter->status().ok(); + iter->Next()) + ; + EXPECT_OK(iter->status()); + delete iter; +#endif // ROCKSDB_IOURING_PRESENT + } + + void ReopenDB() { Reopen(options_); } + private: std::shared_ptr statistics_; + Options options_; }; TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { @@ -2238,6 +2260,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { std::vector values(key_strs.size()); std::vector statuses(key_strs.size()); + PrepareDBForTest(); + ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); @@ -2256,6 +2280,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); // With async IO, lookups will happen in parallel for each key +#ifdef ROCKSDB_IOURING_PRESENT if (GetParam()) { ASSERT_EQ(multiget_io_batch_size.count, 1); ASSERT_EQ(multiget_io_batch_size.max, 3); @@ -2265,6 +2290,11 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { // L0 file ASSERT_EQ(multiget_io_batch_size.count, 3); } +#else // ROCKSDB_IOURING_PRESENT + if (GetParam()) { + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); + } +#endif // ROCKSDB_IOURING_PRESENT } TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { @@ -2282,6 +2312,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { values.resize(keys.size()); statuses.resize(keys.size()); + PrepareDBForTest(); + ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); @@ -2295,6 +2327,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { ASSERT_EQ(values[1], "val_l1_" + std::to_string(54)); ASSERT_EQ(values[2], "val_l1_" + std::to_string(102)); +#ifdef 
ROCKSDB_IOURING_PRESENT
   HistogramData multiget_io_batch_size;

   statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
@@ -2302,9 +2335,75 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
   // A batch of 3 async IOs is expected, one for each overlapping file in L1
   ASSERT_EQ(multiget_io_batch_size.count, 1);
   ASSERT_EQ(multiget_io_batch_size.max, 3);
+#endif  // ROCKSDB_IOURING_PRESENT
   ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
 }

+#ifdef ROCKSDB_IOURING_PRESENT
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) {
+  std::vector<std::string> key_strs;
+  std::vector<Slice> keys;
+  std::vector<PinnableSlice> values;
+  std::vector<Status> statuses;
+
+  key_strs.push_back(Key(33));
+  key_strs.push_back(Key(54));
+  key_strs.push_back(Key(102));
+  keys.push_back(key_strs[0]);
+  keys.push_back(key_strs[1]);
+  keys.push_back(key_strs[2]);
+  values.resize(keys.size());
+  statuses.resize(keys.size());
+
+  int count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::GetTableReader:BeforeOpenFile", [&](void* status) {
+        count++;
+        // Fail the last table reader open, which is the 6th SST file
+        // since 3 overlapping L0 files + 3 L1 files containing the keys
+        if (count == 6) {
+          Status* s = static_cast<Status*>(status);
+          *s = Status::IOError();
+        }
+      });
+  // DB open will create table readers unless we reduce the table cache
+  // capacity.
+  // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+  // is allocated with max_open_files - 10 as capacity. So override
+  // max_open_files to 11 so table cache capacity will become 1. This will
+  // prevent file open during DB open and force the file to be opened
+  // during MultiGet
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = (int*)arg;
+        *max_open_files = 11;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  PrepareDBForTest();
+
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.optimize_multiget_for_io = GetParam();
+  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+                     keys.data(), values.data(), statuses.data());
+  SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(values.size(), 3);
+  ASSERT_EQ(statuses[0], Status::OK());
+  ASSERT_EQ(statuses[1], Status::OK());
+  ASSERT_EQ(statuses[2], Status::IOError());
+
+  HistogramData multiget_io_batch_size;
+
+  statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // A batch of 3 async IOs is expected, one for each overlapping file in L1
+  ASSERT_EQ(multiget_io_batch_size.count, 1);
+  ASSERT_EQ(multiget_io_batch_size.max, 2);
+  ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+#endif  // ROCKSDB_IOURING_PRESENT
+
 TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
   std::vector<std::string> key_strs;
   std::vector<Slice> keys;
@@ -2321,6 +2420,8 @@ TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
   values.resize(keys.size());
   statuses.resize(keys.size());

+  PrepareDBForTest();
+
   ReadOptions ro;
   ro.async_io = true;
   ro.optimize_multiget_for_io = GetParam();
@@ -2334,6 +2435,7 @@ TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
   ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
   ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));

+#ifdef ROCKSDB_IOURING_PRESENT
   HistogramData multiget_io_batch_size;

   statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
@@ -2344,6 +2446,7 @@ TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
   // will lookup 2 files in parallel and issue 2 async reads
   ASSERT_EQ(multiget_io_batch_size.count, 2);
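The max_open_files override in GetFromL1Error above works because, per the comment in that test, the table cache is sized from it: with max_open_files = 11 the cache holds roughly one table reader, so files are opened lazily during MultiGet rather than at DB open, which lets the injected IOError fire. A sketch of forcing the same lazy-open behavior through public options (values are illustrative):

#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options LazyFileOpenOptionsSketch() {
  ROCKSDB_NAMESPACE::Options options;
  // Table cache capacity is derived from this, so a small value keeps
  // almost all SST files closed until first read.
  options.max_open_files = 11;
  // Avoid scanning files for stats during DB open.
  options.skip_stats_update_on_db_open = true;
  return options;
}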
ASSERT_EQ(multiget_io_batch_size.max, 2); +#endif // ROCKSDB_IOURING_PRESENT } TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) { @@ -2362,6 +2465,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) { values.resize(keys.size()); statuses.resize(keys.size()); + PrepareDBForTest(); + ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); @@ -2375,6 +2480,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) { ASSERT_EQ(values[1], "val_l2_" + std::to_string(56)); ASSERT_EQ(values[2], "val_l1_" + std::to_string(102)); +#ifdef ROCKSDB_IOURING_PRESENT HistogramData multiget_io_batch_size; statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); @@ -2384,6 +2490,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) { // Otherwise, the L2 lookup will happen after L1. ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2); ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2); +#endif // ROCKSDB_IOURING_PRESENT } TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) { @@ -2400,6 +2507,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) { values.resize(keys.size()); statuses.resize(keys.size()); + PrepareDBForTest(); + ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); @@ -2415,6 +2524,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) { ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2); } +#ifdef ROCKSDB_IOURING_PRESENT TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) { std::vector key_strs; std::vector keys; @@ -2429,6 +2539,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) { values.resize(keys.size()); statuses.resize(keys.size()); + PrepareDBForTest(); + ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); @@ -2458,6 +2570,8 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) { values.resize(keys.size()); statuses.resize(keys.size()); + PrepareDBForTest(); + ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); @@ -2472,6 +2586,45 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) { // Bloom filters in L0/L1 will avoid the coroutine calls in those levels ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); } +#endif // ROCKSDB_IOURING_PRESENT + +TEST_P(DBMultiGetAsyncIOTest, GetNoIOUring) { + std::vector key_strs; + std::vector keys; + std::vector values; + std::vector statuses; + + key_strs.push_back(Key(33)); + key_strs.push_back(Key(54)); + key_strs.push_back(Key(102)); + keys.push_back(key_strs[0]); + keys.push_back(key_strs[1]); + keys.push_back(key_strs[2]); + values.resize(keys.size()); + statuses.resize(keys.size()); + + enable_io_uring = false; + ReopenDB(); + + ReadOptions ro; + ro.async_io = true; + ro.optimize_multiget_for_io = GetParam(); + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + ASSERT_EQ(values.size(), 3); + ASSERT_EQ(statuses[0], Status::NotSupported()); + ASSERT_EQ(statuses[1], Status::NotSupported()); + ASSERT_EQ(statuses[2], Status::NotSupported()); + + HistogramData multiget_io_batch_size; + + statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); + + // A batch of 3 async IOs is expected, one for each overlapping file in L1 + ASSERT_EQ(multiget_io_batch_size.count, 1); + ASSERT_EQ(multiget_io_batch_size.max, 3); + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); +} INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, 
DBMultiGetAsyncIOTest, testing::Bool()); @@ -2805,7 +2958,7 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.block_size = 16 * 1024; ASSERT_TRUE(table_options.block_size > - BlockBasedTable::kMultiGetReadStackBufSize); + BlockBasedTable::kMultiGetReadStackBufSize); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); @@ -2914,7 +3067,7 @@ class TableFileListener : public EventListener { InstrumentedMutex mutex_; std::unordered_map> cf_to_paths_; }; -} // namespace +} // anonymous namespace TEST_F(DBBasicTest, LastSstFileNotInManifest) { // If the last sst file is not tracked in MANIFEST, @@ -3470,24 +3623,27 @@ class DBBasicTestMultiGet : public DBTestBase { const char* Name() const override { return "MyBlockCache"; } - using Cache::Insert; - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), + Status Insert(const Slice& key, Cache::ObjectPtr value, + const CacheItemHelper* helper, size_t charge, Handle** handle = nullptr, Priority priority = Priority::LOW) override { num_inserts_++; - return target_->Insert(key, value, charge, deleter, handle, priority); + return target_->Insert(key, value, helper, charge, handle, priority); } - using Cache::Lookup; - Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr) override { num_lookups_++; - Handle* handle = target_->Lookup(key, stats); + Handle* handle = + target_->Lookup(key, helper, create_context, priority, wait, stats); if (handle != nullptr) { num_found_++; } return handle; } + int num_lookups() { return num_lookups_; } int num_found() { return num_found_; } diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 38ecd7a814b..1c45a8aabfe 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -13,8 +13,8 @@ #include "cache/cache_entry_roles.h" #include "cache/cache_key.h" -#include "cache/fast_lru_cache.h" #include "cache/lru_cache.h" +#include "cache/typed_cache.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" @@ -366,9 +366,7 @@ class PersistentCacheFromCache : public PersistentCache { } std::unique_ptr copy{new char[size]}; std::copy_n(data, size, copy.get()); - Status s = cache_->Insert( - key, copy.get(), size, - GetCacheEntryDeleterForRole()); + Status s = cache_.Insert(key, copy.get(), size); if (s.ok()) { copy.release(); } @@ -377,13 +375,13 @@ class PersistentCacheFromCache : public PersistentCache { Status Lookup(const Slice& key, std::unique_ptr* data, size_t* size) override { - auto handle = cache_->Lookup(key); + auto handle = cache_.Lookup(key); if (handle) { - char* ptr = static_cast(cache_->Value(handle)); - *size = cache_->GetCharge(handle); + char* ptr = cache_.Value(handle); + *size = cache_.get()->GetCharge(handle); data->reset(new char[*size]); std::copy_n(ptr, *size, data->get()); - cache_->Release(handle); + cache_.Release(handle); return Status::OK(); } else { return Status::NotFound(); @@ -396,10 +394,10 @@ class PersistentCacheFromCache : public PersistentCache { std::string GetPrintableOptions() const override { return ""; } - uint64_t NewId() override { return cache_->NewId(); } + uint64_t NewId() override { return cache_.get()->NewId(); } private: - std::shared_ptr 
cache_; + BasicTypedSharedCacheInterface cache_; bool read_only_; }; @@ -407,14 +405,14 @@ class ReadOnlyCacheWrapper : public CacheWrapper { using CacheWrapper::CacheWrapper; using Cache::Insert; - Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/, - void (*)(const Slice& key, void* value) /*deleter*/, + Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*value*/, + const CacheItemHelper* /*helper*/, size_t /*charge*/, Handle** /*handle*/, Priority /*priority*/) override { return Status::NotSupported(); } }; -} // namespace +} // anonymous namespace TEST_F(DBBlockCacheTest, TestWithSameCompressed) { auto table_options = GetTableOptions(); @@ -828,16 +826,15 @@ class MockCache : public LRUCache { using ShardedCache::Insert; - Status Insert(const Slice& key, void* value, - const Cache::CacheItemHelper* helper_cb, size_t charge, + Status Insert(const Slice& key, Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, size_t charge, Handle** handle, Priority priority) override { - DeleterFn delete_cb = helper_cb->del_cb; if (priority == Priority::LOW) { low_pri_insert_count++; } else { high_pri_insert_count++; } - return LRUCache::Insert(key, value, charge, delete_cb, handle, priority); + return LRUCache::Insert(key, value, helper, charge, handle, priority); } }; @@ -917,7 +914,10 @@ class LookupLiarCache : public CacheWrapper { : CacheWrapper(std::move(target)) {} using Cache::Lookup; - Handle* Lookup(const Slice& key, Statistics* stats) override { + Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr, + CreateContext* create_context = nullptr, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr) override { if (nth_lookup_not_found_ == 1) { nth_lookup_not_found_ = 0; return nullptr; @@ -925,7 +925,8 @@ class LookupLiarCache : public CacheWrapper { if (nth_lookup_not_found_ > 1) { --nth_lookup_not_found_; } - return CacheWrapper::Lookup(key, stats); + return CacheWrapper::Lookup(key, helper, create_context, priority, wait, + stats); } // 1 == next lookup, 2 == after next, etc. 
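The hunks above migrate Cache::Insert()/Lookup() callers to the typed API: values travel as Cache::ObjectPtr plus a CacheItemHelper that carries the deleter and cache-entry role, replacing the old raw deleter function pointer. A minimal pass-through wrapper on the new signatures, assuming the CacheWrapper convenience base (a delegating Cache with a target_ member) used by the tests in this file; the class itself is a sketch, not RocksDB code:

#include <atomic>
#include "rocksdb/cache.h"

class CountingCache : public CacheWrapper {
 public:
  using CacheWrapper::CacheWrapper;

  Status Insert(const Slice& key, ObjectPtr value,
                const CacheItemHelper* helper, size_t charge,
                Handle** handle = nullptr,
                Priority priority = Priority::LOW) override {
    ++inserts_;  // count, then delegate unchanged
    return target_->Insert(key, value, helper, charge, handle, priority);
  }

  Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr,
                 CreateContext* create_context = nullptr,
                 Priority priority = Priority::LOW, bool wait = true,
                 Statistics* stats = nullptr) override {
    ++lookups_;  // count, then delegate unchanged
    return target_->Lookup(key, helper, create_context, priority, wait, stats);
  }

 private:
  std::atomic<int> inserts_{0};
  std::atomic<int> lookups_{0};
};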
@@ -944,10 +945,7 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) {
            capacity,
            BlockBasedTableOptions().block_size /*estimated_value_size*/,
            num_shard_bits)
-           .MakeSharedCache(),
-       NewFastLRUCache(capacity, 1 /*estimated_value_size*/, num_shard_bits,
-                       false /*strict_capacity_limit*/,
-                       kDefaultCacheMetadataChargePolicy)}) {
+           .MakeSharedCache()}) {
     if (!base_cache) {
       // Skip clock cache when not supported
       continue;
@@ -1279,12 +1277,11 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) {
 }

 static void ClearCache(Cache* cache) {
-  auto roles = CopyCacheDeleterRoleMap();
   std::deque<std::string> keys;
   Cache::ApplyToAllEntriesOptions opts;
-  auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/,
-                      Cache::DeleterFn deleter) {
-    if (roles.find(deleter) == roles.end()) {
+  auto callback = [&](const Slice& key, Cache::ObjectPtr, size_t /*charge*/,
+                      const Cache::CacheItemHelper* helper) {
+    if (helper && helper->role == CacheEntryRole::kMisc) {
       // Keep the stats collector
       return;
     }
@@ -1306,11 +1303,6 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
           capacity,
           BlockBasedTableOptions().block_size /*estimated_value_size*/)
           .MakeSharedCache()}) {
-    if (!cache) {
-      // Skip clock cache when not supported
-      continue;
-    }
-
     ++iterations_tested;

     Options options = CurrentOptions();
@@ -1459,14 +1451,13 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
     ClearCache(cache.get());
     Cache::Handle* h = nullptr;
     if (strcmp(cache->Name(), "LRUCache") == 0) {
-      ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
-                              GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
-                              &h, Cache::Priority::HIGH));
+      ASSERT_OK(cache->Insert("Fill-it-up", nullptr, &kNoopCacheItemHelper,
+                              capacity + 1, &h, Cache::Priority::HIGH));
     } else {
       // For ClockCache we use a 16-byte key.
-      ASSERT_OK(cache->Insert("Fill-it-up-xxxxx", nullptr, capacity + 1,
-                              GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
-                              &h, Cache::Priority::HIGH));
+      ASSERT_OK(cache->Insert("Fill-it-up-xxxxx", nullptr,
+                              &kNoopCacheItemHelper, capacity + 1, &h,
+                              Cache::Priority::HIGH));
     }
     ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
     expected = {};
@@ -1543,6 +1534,124 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
   }
 }

+namespace {
+
+void DummyFillCache(Cache& cache, size_t entry_size,
+                    std::vector<CacheHandleGuard<void>>& handles) {
+  // fprintf(stderr, "Entry size: %zu\n", entry_size);
+  handles.clear();
+  cache.EraseUnRefEntries();
+  void* fake_value = &cache;
+  size_t capacity = cache.GetCapacity();
+  OffsetableCacheKey ck{"abc", "abc", 42};
+  for (size_t my_usage = 0; my_usage < capacity;) {
+    size_t charge = std::min(entry_size, capacity - my_usage);
+    Cache::Handle* handle;
+    Status st = cache.Insert(ck.WithOffset(my_usage).AsSlice(), fake_value,
+                             &kNoopCacheItemHelper, charge, &handle);
+    ASSERT_OK(st);
+    handles.emplace_back(&cache, handle);
+    my_usage += charge;
+  }
+}
+
+class CountingLogger : public Logger {
+ public:
+  ~CountingLogger() override {}
+  using Logger::Logv;
+  void Logv(const InfoLogLevel log_level, const char* format,
+            va_list /*ap*/) override {
+    if (std::strstr(format, "HyperClockCache") == nullptr) {
+      // Not a match
+      return;
+    }
+    // static StderrLogger debug;
+    // debug.Logv(log_level, format, ap);
+    if (log_level == InfoLogLevel::INFO_LEVEL) {
+      ++info_count_;
+    } else if (log_level == InfoLogLevel::WARN_LEVEL) {
+      ++warn_count_;
+    } else if (log_level == InfoLogLevel::ERROR_LEVEL) {
+      ++error_count_;
+    }
+  }
+
+  std::array<int, 3> PopCounts() {
+    std::array<int, 3> rv{{info_count_, warn_count_, error_count_}};
+    info_count_ = warn_count_ = error_count_ = 0;
+    return rv;
+  }
+
+ private:
+  int info_count_{};
+  int warn_count_{};
+  int error_count_{};
+};
+
+}  // namespace
+
+TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
+  size_t capacity = 1024 * 1024;
+  size_t value_size_est = 8 * 1024;
+  HyperClockCacheOptions hcc_opts{capacity, value_size_est};
+  hcc_opts.num_shard_bits = 2;  // 4 shards
+  hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+  std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+  std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
+
+  auto table_options = GetTableOptions();
+  auto options = GetOptions(table_options);
+  table_options.block_cache = cache;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.info_log = logger;
+  // Going to sample more directly
+  options.stats_dump_period_sec = 0;
+  Reopen(options);
+
+  std::vector<CacheHandleGuard<void>> handles;
+
+  // Clear anything from DB startup
+  logger->PopCounts();
+
+  // Fill the cache at its expected entry size and check that we don't
+  // report anything relevant in the periodic stats dump
+  DummyFillCache(*cache, value_size_est, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  // Same, within reasonable bounds
+  DummyFillCache(*cache, value_size_est - value_size_est / 4, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  DummyFillCache(*cache, value_size_est + value_size_est / 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+  // Estimate too high (value size too low) eventually reports ERROR
+  DummyFillCache(*cache, value_size_est / 2, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+  DummyFillCache(*cache, value_size_est / 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 1}}));
+
+  // Estimate too low (value size too high) starts with INFO
+  // and is only WARNING in the worst case
+  DummyFillCache(*cache, value_size_est * 2, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{1, 0, 0}}));
+
+  DummyFillCache(*cache, value_size_est * 3, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+  DummyFillCache(*cache, value_size_est * 20, handles);
+  dbfull()->DumpStats();
+  EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+}
+
 #endif  // ROCKSDB_LITE

 class DBBlockCacheKeyTest
@@ -1973,7 +2082,7 @@ struct CacheKeyDecoder {
         DownwardInvolution(decoded_session_counter));
   }
 };
-}  // namespace
+}  // anonymous namespace

 TEST_F(CacheKeyTest, Encodings) {
   // This test primarily verifies this claim from cache_key.cc:
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index f2e25b918cc..0d8329496e7 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -43,7 +43,7 @@
 const std::string kStandard128Ribbon =
     test::Standard128RibbonFilterPolicy::kClassName();
 const std::string kAutoBloom = BloomFilterPolicy::kClassName();
 const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
-}  // namespace
+}  // anonymous namespace

 // DB tests related to bloom filter.
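Context for HyperClockCacheReportProblems above: HyperClockCache sizes its fixed hash table from the estimated entry charge, and the test shows it logging at INFO, WARN, or ERROR as the real average entry size drifts further from that estimate. A minimal construction sketch (capacity and estimate values are illustrative):

#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeTunedHyperClockCache() {
  using namespace ROCKSDB_NAMESPACE;
  HyperClockCacheOptions opts(/*capacity=*/64 << 20,
                              /*estimated_entry_charge=*/8 << 10);
  opts.num_shard_bits = -1;  // let RocksDB pick the shard count
  return opts.MakeSharedCache();
}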
@@ -622,7 +622,7 @@ class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy { bool skip_; }; -} // namespace +} // anonymous namespace TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { constexpr int maxKey = 10; @@ -767,10 +767,10 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, - ::testing::Values( - std::make_tuple(kAutoBloom, true, kLatestFormatVersion), - std::make_tuple(kAutoBloom, false, kLatestFormatVersion), - std::make_tuple(kAutoRibbon, false, kLatestFormatVersion))); + ::testing::Values(std::make_tuple(kAutoBloom, true, kLatestFormatVersion), + std::make_tuple(kAutoBloom, false, kLatestFormatVersion), + std::make_tuple(kAutoRibbon, false, + kLatestFormatVersion))); #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -840,7 +840,7 @@ std::vector kCompatibilityConfigs = { BlockBasedTableOptions().format_version}, {kCompatibilityRibbonPolicy, true, BlockBasedTableOptions().format_version}, }; -} // namespace +} // anonymous namespace TEST_F(DBBloomFilterTest, BloomFilterCompatibility) { Options options = CurrentOptions(); @@ -1229,7 +1229,7 @@ TEST_P(ChargeFilterConstructionTestWithParam, Basic) { * * The test is designed in a way such that the reservation for (p1 - b') * will trigger at least another dummy entry insertion - * (or equivelantly to saying, creating another peak). + * (or equivalently to saying, creating another peak). * * kStandard128Ribbon + FullFilter + * detect_filter_construct_corruption @@ -1678,7 +1678,7 @@ class TestingContextCustomFilterPolicy private: mutable std::string test_report_; }; -} // namespace +} // anonymous namespace TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { auto policy = std::make_shared(15, 8, 5); @@ -2186,16 +2186,14 @@ INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer, std::make_tuple(false, 2), std::make_tuple(false, 3), std::make_tuple(false, 4), - std::make_tuple(false, 5), - std::make_tuple(true, 2), - std::make_tuple(true, 3), - std::make_tuple(true, 4), + std::make_tuple(false, 5), std::make_tuple(true, 2), + std::make_tuple(true, 3), std::make_tuple(true, 4), std::make_tuple(true, 5))); #ifndef ROCKSDB_LITE namespace { static const std::string kPlainTable = "test_PlainTableBloom"; -} // namespace +} // anonymous namespace class BloomStatsTestWithParam : public DBBloomFilterTest, @@ -2408,7 +2406,7 @@ void PrefixScanInit(DBBloomFilterTest* dbtest) { dbtest->Flush(); } } -} // namespace +} // anonymous namespace TEST_F(DBBloomFilterTest, PrefixScan) { while (ChangeFilterOptions()) { @@ -2620,8 +2618,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; - ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) - .IsNotSupported()); + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); @@ -3219,7 +3216,7 @@ std::pair CheckedAndUseful(uint64_t checked, uint64_t useful) { return {checked, useful}; } -} // namespace +} // anonymous namespace // This uses a prefix_extractor + comparator combination that violates // one of the old obsolete, unnecessary axioms of prefix extraction: @@ -3427,7 +3424,7 @@ class NonIdempotentFixed4Transform : public SliceTransform { bool InDomain(const Slice& src) const override { return src.size() >= 5; } }; -} // namespace +} 
// anonymous namespace // This uses a prefix_extractor + comparator combination that violates // two of the old obsolete, unnecessary axioms of prefix extraction: diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 0594825d61e..41ab69b85af 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -9,8 +9,10 @@ #include +#include "compaction/compaction_picker_universal.h" #include "db/blob/blob_index.h" #include "db/db_test_util.h" +#include "db/dbformat.h" #include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" @@ -248,9 +250,8 @@ Options DeletionTriggerOptions(Options options) { return options; } -bool HaveOverlappingKeyRanges( - const Comparator* c, - const SstFileMetaData& a, const SstFileMetaData& b) { +bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a, + const SstFileMetaData& b) { if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) { if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) { // b.smallestkey <= a.smallestkey <= b.largestkey @@ -275,18 +276,15 @@ bool HaveOverlappingKeyRanges( // Identifies all files between level "min_level" and "max_level" // which has overlapping key range with "input_file_meta". void GetOverlappingFileNumbersForLevelCompaction( - const ColumnFamilyMetaData& cf_meta, - const Comparator* comparator, - int min_level, int max_level, - const SstFileMetaData* input_file_meta, + const ColumnFamilyMetaData& cf_meta, const Comparator* comparator, + int min_level, int max_level, const SstFileMetaData* input_file_meta, std::set* overlapping_file_names) { std::set overlapping_files; overlapping_files.insert(input_file_meta); for (int m = min_level; m <= max_level; ++m) { for (auto& file : cf_meta.levels[m].files) { for (auto* included_file : overlapping_files) { - if (HaveOverlappingKeyRanges( - comparator, *included_file, file)) { + if (HaveOverlappingKeyRanges(comparator, *included_file, file)) { overlapping_files.insert(&file); overlapping_file_names->insert(file.name); break; @@ -309,12 +307,9 @@ void VerifyCompactionResult( #endif } -const SstFileMetaData* PickFileRandomly( - const ColumnFamilyMetaData& cf_meta, - Random* rand, - int* level = nullptr) { - auto file_id = rand->Uniform(static_cast( - cf_meta.file_count)) + 1; +const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta, + Random* rand, int* level = nullptr) { + auto file_id = rand->Uniform(static_cast(cf_meta.file_count)) + 1; for (auto& level_meta : cf_meta.levels) { if (file_id <= level_meta.files.size()) { if (level != nullptr) { @@ -740,7 +735,6 @@ TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { } } - TEST_P(DBCompactionTestWithParam, CompactionTrigger) { const int kNumKeysPerFile = 100; @@ -883,7 +877,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer + options.write_buffer_size = 100000000; // Large write buffer options.max_subcompactions = max_subcompactions_; CreateAndReopenWithCF({"pikachu"}, options); @@ -1032,6 +1026,70 @@ TEST_F(DBCompactionTest, CompactionSstPartitioner) { ASSERT_EQ("B", Get("bbbb1")); } +TEST_F(DBCompactionTest, CompactionSstPartitionWithManualCompaction) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + + DestroyAndReopen(options); + + // create first file and flush to l0 
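+  // (The keys below are chosen so that their first five bytes, "00001" and
+  // "00002", fall on either side of a prefix boundary; the fixed-prefix
+  // partitioner installed later in this test cuts output files exactly on
+  // such prefix changes.)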
+ ASSERT_OK(Put("000015", "A")); + ASSERT_OK(Put("000025", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // create second file and flush to l0 + ASSERT_OK(Put("000015", "A2")); + ASSERT_OK(Put("000025", "B2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // CONTROL 1: compact without partitioner + CompactRangeOptions compact_options; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Check (compacted but no partitioning yet) + std::vector files; + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(1, files.size()); + + // Install partitioner + std::shared_ptr factory( + NewSstPartitionerFixedPrefixFactory(5)); + options.sst_partitioner_factory = factory; + Reopen(options); + + // CONTROL 2: request compaction on range with no partition boundary and no + // overlap with actual entries + Slice from("000017"); + Slice to("000019"); + ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to)); + + // Check (no partitioning yet) + files.clear(); + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(1, files.size()); + ASSERT_EQ("A2", Get("000015")); + ASSERT_EQ("B2", Get("000025")); + + // TEST: request compaction overlapping with partition boundary but no + // actual entries + // NOTE: `to` is INCLUSIVE + from = Slice("000019"); + to = Slice("000020"); + ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to)); + + // Check (must be partitioned) + files.clear(); + dbfull()->GetLiveFilesMetaData(&files); + ASSERT_EQ(2, files.size()); + ASSERT_EQ("A2", Get("000015")); + ASSERT_EQ("B2", Get("000025")); +} + TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleLevel; @@ -1069,7 +1127,7 @@ TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { compact_opt.compression = kNoCompression; compact_opt.output_file_size_limit = 4096; const size_t key_len = - static_cast(compact_opt.output_file_size_limit) / 5; + static_cast(compact_opt.output_file_size_limit) / 5; DestroyAndReopen(options); @@ -1247,14 +1305,8 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { DestroyAndReopen(options); // non overlapping ranges std::vector> ranges = { - {100, 199}, - {300, 399}, - {0, 99}, - {200, 299}, - {600, 699}, - {400, 499}, - {500, 550}, - {551, 599}, + {100, 199}, {300, 399}, {0, 99}, {200, 299}, + {600, 699}, {400, 499}, {500, 550}, {551, 599}, }; int32_t value_size = 10 * 1024; // 10 KB @@ -1297,14 +1349,15 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { DestroyAndReopen(options); // Same ranges as above but overlapping ranges = { - {100, 199}, - {300, 399}, - {0, 99}, - {200, 299}, - {600, 699}, - {400, 499}, - {500, 560}, // this range overlap with the next one - {551, 599}, + {100, 199}, + {300, 399}, + {0, 99}, + {200, 299}, + {600, 699}, + {400, 499}, + {500, 560}, // this range overlap with the next + // one + {551, 599}, }; for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { @@ -1907,7 +1960,7 @@ TEST_F(DBCompactionTest, DeleteFilesInRanges) { ASSERT_EQ("0,0,10", FilesPerLevel(0)); // file [0 => 100), [200 => 300), ... 
[800, 900) - for (auto i = 0; i < 10; i+=2) { + for (auto i = 0; i < 10; i += 2) { for (auto j = 0; j < 100; j++) { auto k = i * 100 + j; ASSERT_OK(Put(Key(k), values[k])); @@ -2349,14 +2402,14 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) { cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024); cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024); option_vector.emplace_back(DBOptions(options), cf_opt1); - CreateColumnFamilies({"one"},option_vector[1]); + CreateColumnFamilies({"one"}, option_vector[1]); // Configure CF2 specific paths. cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024); option_vector.emplace_back(DBOptions(options), cf_opt2); - CreateColumnFamilies({"two"},option_vector[2]); + CreateColumnFamilies({"two"}, option_vector[2]); ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); @@ -2729,7 +2782,6 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) { } } - TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); @@ -2866,14 +2918,13 @@ TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); compaction_input_file_names.push_back(file_meta->name); GetOverlappingFileNumbersForLevelCompaction( - cf_meta, options.comparator, level, output_level, - file_meta, &overlapping_file_names); + cf_meta, options.comparator, level, output_level, file_meta, + &overlapping_file_names); } - ASSERT_OK(dbfull()->CompactFiles( - CompactionOptions(), handles_[1], - compaction_input_file_names, - output_level)); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1], + compaction_input_file_names, + output_level)); // Make sure all overlapping files do not exist after compaction dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); @@ -2896,8 +2947,7 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { options.write_buffer_size = kKeysPerBuffer * kKvSize; options.max_write_buffer_number = 2; options.target_file_size_base = - options.write_buffer_size * - (options.max_write_buffer_number - 1); + options.write_buffer_size * (options.max_write_buffer_number - 1); options.level0_file_num_compaction_trigger = kNumL1Files; options.max_bytes_for_level_base = options.level0_file_num_compaction_trigger * @@ -2917,10 +2967,9 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { DestroyAndReopen(options); - const int kNumInsertedKeys = - options.level0_file_num_compaction_trigger * - (options.max_write_buffer_number - 1) * - kKeysPerBuffer; + const int kNumInsertedKeys = options.level0_file_num_compaction_trigger * + (options.max_write_buffer_number - 1) * + kKeysPerBuffer; Random rnd(301); std::vector keys; @@ -3536,6 +3585,59 @@ TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); } +TEST_F(DBCompactionTest, CancelCompactionWaitingOnConflict) { + // This test verifies cancellation of a compaction waiting to be scheduled due + // to conflict with a running compaction. + // + // A `CompactRange()` in universal compacts all files, waiting for files to + // become available if they are locked for another compaction. 
This test
+  // triggers an automatic compaction that blocks a `CompactRange()`, and
+  // verifies that `DisableManualCompaction()` can successfully cancel the
+  // `CompactRange()` without waiting for the automatic compaction to finish.
+  const int kNumSortedRuns = 4;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kNumSortedRuns;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+  Reopen(options);
+
+  test::SleepingBackgroundTask auto_compaction_sleeping_task;
+  // Block automatic compaction when it runs in the callback
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start",
+      [&](void* /*arg*/) { auto_compaction_sleeping_task.DoSleep(); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Fill overlapping files in L0 to trigger an automatic compaction
+  Random rnd(301);
+  for (int i = 0; i < kNumSortedRuns; ++i) {
+    int key_idx = 0;
+    GenerateNewFile(&rnd, &key_idx, true /* nowait */);
+  }
+  auto_compaction_sleeping_task.WaitUntilSleeping();
+
+  // Make sure the manual compaction has seen the conflict before being
+  // canceled
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"ColumnFamilyData::CompactRange:Return",
+        "DBCompactionTest::CancelCompactionWaitingOnConflict:"
+        "PreDisableManualCompaction"}});
+  auto manual_compaction_thread = port::Thread([this]() {
+    ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+                    .IsIncomplete());
+  });
+
+  // Cancel it. The thread should be joinable, i.e., the manual compaction was
+  // unblocked despite finding a conflict with an automatic compaction that is
+  // still running
+  TEST_SYNC_POINT(
+      "DBCompactionTest::CancelCompactionWaitingOnConflict:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+  manual_compaction_thread.join();
+}
+
 TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
   // Deletions can be dropped when compacted to non-last level if they fall
   // outside the lower-level files' key-ranges.
@@ -3618,9 +3720,8 @@ TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
   ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
   std::vector<std::string> input_filenames;
   input_filenames.push_back(cf_meta.levels[0].files.front().name);
-  ASSERT_OK(dbfull()
-                ->CompactFiles(CompactionOptions(), input_filenames,
-                               0 /* output_level */));
+  ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+                                   0 /* output_level */));
   TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
@@ -4171,6 +4272,78 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
+TEST_F(DBCompactionTest, LevelTtlCompactionOutputCuttingInteractingWithOther) {
+  // This test is for a bug fix in CompactionOutputs::ShouldStopBefore() where
+  // TTL states were not being updated for keys that ShouldStopBefore() would
+  // return true for reasons other than TTL.
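+  //
+  // In other words, when a non-TTL condition (such as output file size) cuts
+  // a file first, the TTL bookkeeping must still advance past that key, or
+  // the very next key gets cut again spuriously; the FilesPerLevel()
+  // expectations at the end of this test pin down exactly that behavior.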
+ Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + options.max_open_files = -1; + options.compaction_pri = kMinOverlappingRatio; + env_->SetMockSleep(); + options.env = env_; + options.target_file_size_base = 4 << 10; + options.disable_auto_compactions = true; + options.level_compaction_dynamic_file_size = false; + + DestroyAndReopen(options); + Random rnd(301); + + // This makes sure the manual compaction below + // is not a bottommost compaction as TTL is only + // for non-bottommost compactions. + ASSERT_OK(Put(Key(3), rnd.RandomString(1 << 10))); + ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + // L2: + ASSERT_OK(Put(Key(2), rnd.RandomString(4 << 10))); + ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + // L1, overlaps in range with the file in L2 so + // that they compact together. + ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10))); + ASSERT_OK(Put(Key(1), rnd.RandomString(4 << 10))); + ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + ASSERT_EQ("0,1,1,0,0,0,1", FilesPerLevel()); + // 36 hours so that the file in L2 is eligible for TTL + env_->MockSleepForSeconds(36 * 60 * 60); + + CompactRangeOptions compact_range_opts; + + ASSERT_OK(dbfull()->RunManualCompaction( + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(), + 1 /* input_level */, 2 /* output_level */, compact_range_opts, + nullptr /* begin */, nullptr /* end */, true /* exclusive */, + true /* disallow_trivial_move */, + std::numeric_limits::max() /*max_file_num_to_ignore*/, + "" /*trim_ts*/)); + + // L2 should have 2 files: + // file 1: Key(0), Key(1) + // ShouldStopBefore(Key(2)) return true due to TTL or output file size + // file 2: Key(2), Key(3) + // + // Before the fix in this PR, L2 would have 3 files: + // file 1: Key(0), Key(1) + // CompactionOutputs::ShouldStopBefore(Key(2)) returns true due to output file + // size. + // file 2: Key(2) + // CompactionOutput::ShouldStopBefore(Key(3)) returns true + // due to TTL cutting and that TTL states were not updated + // for Key(2). 
+ // file 3: Key(3) + ASSERT_EQ("0,0,2,0,0,0,1", FilesPerLevel()); +} + TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { env_->SetMockSleep(); const int kValueSize = 100; @@ -4503,9 +4676,9 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { const int kValueSize = 100; Options options = CurrentOptions(); - options.ttl = 10 * 60 * 60; // 10 hours + options.ttl = 10 * 60 * 60; // 10 hours options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days - options.max_open_files = -1; // needed for both periodic and ttl compactions + options.max_open_files = -1; // needed for both periodic and ttl compactions env_->SetMockSleep(); options.env = env_; @@ -4927,7 +5100,7 @@ TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - //used for the delayable flushes + // used for the delayable flushes FlushOptions flush_opts; flush_opts.allow_write_stall = true; for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { @@ -4946,7 +5119,8 @@ TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024))); ASSERT_OK(dbfull()->Flush(flush_opts)); ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024))); - TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); // If CompactRange's flush was skipped, the final Put above will still be @@ -5239,10 +5413,10 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } std::shared_ptr unique_limiter( - NewConcurrentTaskLimiter("unique_limiter", -1)); + NewConcurrentTaskLimiter("unique_limiter", -1)); - const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", - "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" }; + const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", "a", "b", "c", "d", "e", "f"}; const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0]; std::unordered_map cf_to_limiter; @@ -5254,10 +5428,10 @@ TEST_F(DBCompactionTest, CompactionLimiter) { options.level0_file_num_compaction_trigger = 4; options.level0_slowdown_writes_trigger = 64; options.level0_stop_writes_trigger = 64; - options.max_background_jobs = kMaxBackgroundThreads; // Enough threads + options.max_background_jobs = kMaxBackgroundThreads; // Enough threads options.memtable_factory.reset( test::NewSpecialSkipListFactory(kNumKeysPerFile)); - options.max_write_buffer_number = 10; // Enough memtables + options.max_write_buffer_number = 10; // Enough memtables DestroyAndReopen(options); std::vector option_vector; @@ -5285,9 +5459,8 @@ TEST_F(DBCompactionTest, CompactionLimiter) { CreateColumnFamilies({cf_names[cf]}, option_vector[cf]); } - ReopenWithColumnFamilies(std::vector(cf_names, - cf_names + cf_count), - option_vector); + ReopenWithColumnFamilies( + std::vector(cf_names, cf_names + cf_count), option_vector); port::Mutex mutex; @@ -5349,7 +5522,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { // Enough L0 files to trigger compaction for (unsigned int cf = 0; cf < cf_count; cf++) { ASSERT_EQ(NumTableFilesAtLevel(0, cf), - options.level0_file_num_compaction_trigger); + options.level0_file_num_compaction_trigger); } // Create more files for one column family, which triggers speed up @@ -5392,7 +5565,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { // flush one more file to cf 1 for (int i = 0; i < 
kNumKeysPerFile; i++) { - ASSERT_OK(Put(cf_test, Key(keyIndex++), "")); + ASSERT_OK(Put(cf_test, Key(keyIndex++), "")); } // put extra key to trigger flush ASSERT_OK(Put(cf_test, "", "")); @@ -5427,9 +5600,7 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) { }); if (options.use_direct_io_for_flush_and_compaction) { SyncPoint::GetInstance()->SetCallBack( - "SanitizeOptions:direct_io", [&](void* /*arg*/) { - readahead = true; - }); + "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; }); } SyncPoint::GetInstance()->EnableProcessing(); CreateAndReopenWithCF({"pikachu"}, options); @@ -6199,6 +6370,231 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { Close(); } +class DBCompactionTestWithOngoingFileIngestionParam + : public DBCompactionTest, + public testing::WithParamInterface { + public: + DBCompactionTestWithOngoingFileIngestionParam() : DBCompactionTest() { + compaction_path_to_test_ = GetParam(); + } + void SetupOptions() { + options_ = CurrentOptions(); + options_.create_if_missing = true; + + if (compaction_path_to_test_ == "RefitLevelCompactRange") { + options_.num_levels = 7; + } else { + options_.num_levels = 3; + } + options_.compaction_style = CompactionStyle::kCompactionStyleLevel; + if (compaction_path_to_test_ == "AutoCompaction") { + options_.disable_auto_compactions = false; + options_.level0_file_num_compaction_trigger = 1; + } else { + options_.disable_auto_compactions = true; + } + } + + void PauseCompactionThread() { + sleeping_task_.reset(new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::LOW); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task_.get(), Env::Priority::LOW); + sleeping_task_->WaitUntilSleeping(); + } + + void ResumeCompactionThread() { + if (sleeping_task_) { + sleeping_task_->WakeUp(); + sleeping_task_->WaitUntilDone(); + } + } + + void SetupFilesToForceFutureFilesIngestedToCertainLevel() { + SstFileWriter sst_file_writer(EnvOptions(), options_); + std::string dummy = dbname_ + "/dummy.sst"; + ASSERT_OK(sst_file_writer.Open(dummy)); + ASSERT_OK(sst_file_writer.Put("k2", "dummy")); + ASSERT_OK(sst_file_writer.Finish()); + ASSERT_OK(db_->IngestExternalFile({dummy}, IngestExternalFileOptions())); + // L2 is made to contain a file overlapped with files to be ingested in + // later steps on key "k2". This will force future files ingested to L1 or + // above. 
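+    // (Ingestion places a file in the lowest level where it does not overlap
+    // any existing file, so an L2 file holding "k2" pins later "k2"/"k3"
+    // ingestions to L1 or L0, where they can conflict with compactions.)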
+ ASSERT_EQ("0,0,1", FilesPerLevel(0)); + } + + void SetupSyncPoints() { + if (compaction_path_to_test_ == "AutoCompaction") { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Run", [&](void*) { + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction():AfterPickCompaction", + "VersionSet::LogAndApply:WriteManifest"}}); + }); + } else if (compaction_path_to_test_ == "NonRefitLevelCompactRange") { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Run", [&](void*) { + SyncPoint::GetInstance()->LoadDependency( + {{"ColumnFamilyData::CompactRange:Return", + "VersionSet::LogAndApply:WriteManifest"}}); + }); + } else if (compaction_path_to_test_ == "RefitLevelCompactRange") { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Run", [&](void*) { + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CompactRange:PostRefitLevel", + "VersionSet::LogAndApply:WriteManifest"}}); + }); + } else if (compaction_path_to_test_ == "CompactFiles") { + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Run", [&](void*) { + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles", + "VersionSet::LogAndApply:WriteManifest"}}); + }); + } else { + assert(false); + } + SyncPoint::GetInstance()->LoadDependency( + {{"ExternalSstFileIngestionJob::Run", "PreCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + } + + void RunCompactionOverlappedWithFileIngestion() { + if (compaction_path_to_test_ == "AutoCompaction") { + TEST_SYNC_POINT("PreCompaction"); + ResumeCompactionThread(); + // Without proper range conflict check, + // this would have been `Status::Corruption` about overlapping ranges + Status s = dbfull()->TEST_WaitForCompact(); + EXPECT_OK(s); + } else if (compaction_path_to_test_ == "NonRefitLevelCompactRange") { + CompactRangeOptions cro; + cro.change_level = false; + std::string start_key = "k1"; + Slice start(start_key); + std::string end_key = "k4"; + Slice end(end_key); + TEST_SYNC_POINT("PreCompaction"); + // Without proper range conflict check, + // this would have been `Status::Corruption` about overlapping ranges + Status s = dbfull()->CompactRange(cro, &start, &end); + EXPECT_OK(s); + } else if (compaction_path_to_test_ == "RefitLevelCompactRange") { + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 5; + std::string start_key = "k1"; + Slice start(start_key); + std::string end_key = "k4"; + Slice end(end_key); + TEST_SYNC_POINT("PreCompaction"); + Status s = dbfull()->CompactRange(cro, &start, &end); + // Without proper range conflict check, + // this would have been `Status::Corruption` about overlapping ranges + // To see this, remove the fix AND replace + // `DBImpl::CompactRange:PostRefitLevel` in sync point dependency with + // `DBImpl::ReFitLevel:PostRegisterCompaction` + EXPECT_TRUE(s.IsNotSupported()); + EXPECT_TRUE(s.ToString().find("some ongoing compaction's output") != + std::string::npos); + } else if (compaction_path_to_test_ == "CompactFiles") { + ColumnFamilyMetaData cf_meta_data; + db_->GetColumnFamilyMetaData(&cf_meta_data); + ASSERT_EQ(cf_meta_data.levels[0].files.size(), 1); + std::vector input_files; + for (const auto& file : cf_meta_data.levels[0].files) { + input_files.push_back(file.name); + } + TEST_SYNC_POINT("PreCompaction"); + Status s = db_->CompactFiles(CompactionOptions(), input_files, 1); + // Without proper range conflict check, + // this would have been 
`Status::Corruption` about overlapping ranges + EXPECT_TRUE(s.IsAborted()); + EXPECT_TRUE( + s.ToString().find( + "A running compaction is writing to the same output level") != + std::string::npos); + } else { + assert(false); + } + } + + void DisableSyncPoints() { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + + protected: + std::string compaction_path_to_test_; + Options options_; + std::shared_ptr sleeping_task_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestWithOngoingFileIngestionParam, + DBCompactionTestWithOngoingFileIngestionParam, + ::testing::Values("AutoCompaction", + "NonRefitLevelCompactRange", + "RefitLevelCompactRange", + "CompactFiles")); + +TEST_P(DBCompactionTestWithOngoingFileIngestionParam, RangeConflictCheck) { + SetupOptions(); + DestroyAndReopen(options_); + + if (compaction_path_to_test_ == "AutoCompaction") { + PauseCompactionThread(); + } + + if (compaction_path_to_test_ != "RefitLevelCompactRange") { + SetupFilesToForceFutureFilesIngestedToCertainLevel(); + } + + // Create s1 + ASSERT_OK(Put("k1", "v")); + ASSERT_OK(Put("k4", "v")); + ASSERT_OK(Flush()); + if (compaction_path_to_test_ == "RefitLevelCompactRange") { + MoveFilesToLevel(6 /* level */); + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel(0)); + } else { + ASSERT_EQ("1,0,1", FilesPerLevel(0)); + } + + // To coerce following sequence of events + // Timeline Thread 1 (Ingest s2) Thread 2 (Compact s1) + // t0 | Decide to output to Lk + // t1 | Release lock in LogAndApply() + // t2 | Acquire lock + // t3 | Decides to compact to Lk + // | Expected to fail due to range + // | conflict check with file + // | ingestion + // t4 | Release lock in LogAndApply() + // t5 | Acquire lock again and finish + // t6 | Acquire lock again and finish + SetupSyncPoints(); + + // Ingest s2 + port::Thread thread1([&] { + SstFileWriter sst_file_writer(EnvOptions(), options_); + std::string s2 = dbname_ + "/ingested_s2.sst"; + ASSERT_OK(sst_file_writer.Open(s2)); + ASSERT_OK(sst_file_writer.Put("k2", "v2")); + ASSERT_OK(sst_file_writer.Put("k3", "v2")); + ASSERT_OK(sst_file_writer.Finish()); + ASSERT_OK(db_->IngestExternalFile({s2}, IngestExternalFileOptions())); + }); + + // Compact s1. Without proper range conflict check, + // this will encounter overlapping file corruption. 
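+  // (With the conflict check in place, each path either succeeds once the
+  // conflict clears or fails cleanly with the NotSupported/Aborted statuses
+  // asserted in RunCompactionOverlappedWithFileIngestion() above.)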
+  port::Thread thread2([&] { RunCompactionOverlappedWithFileIngestion(); });
+
+  thread1.join();
+  thread2.join();
+  DisableSyncPoints();
+}
+
 TEST_F(DBCompactionTest, ConsistencyFailTest) {
   Options options = CurrentOptions();
   options.force_consistency_checks = true;
@@ -6291,172 +6687,655 @@ void IngestOneKeyValue(DBImpl* db, const std::string& key,
   ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
 }
 
-TEST_P(DBCompactionTestWithParam,
-       FlushAfterIntraL0CompactionCheckConsistencyFail) {
-  Options options = CurrentOptions();
-  options.force_consistency_checks = true;
-  options.compression = kNoCompression;
-  options.level0_file_num_compaction_trigger = 5;
-  options.max_background_compactions = 2;
-  options.max_subcompactions = max_subcompactions_;
-  DestroyAndReopen(options);
+class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
+ public:
+  DBCompactionTestL0FilesMisorderCorruption() : DBCompactionTest() {}
+  void SetupOptions(const CompactionStyle compaction_style,
+                    const std::string& compaction_path_to_test = "") {
+    options_ = CurrentOptions();
+    options_.create_if_missing = true;
+    options_.compression = kNoCompression;
+
+    options_.force_consistency_checks = true;
+    options_.compaction_style = compaction_style;
+
+    if (compaction_style == CompactionStyle::kCompactionStyleLevel) {
+      options_.num_levels = 7;
+      // Level compaction's PickIntraL0Compaction() impl detail requires
+      // `options.level0_file_num_compaction_trigger` to be
+      // at least 2 files less than the actual number of level 0 files
+      // (i.e., 7 by design in this test)
+      options_.level0_file_num_compaction_trigger = 5;
+      options_.max_background_compactions = 2;
+      options_.write_buffer_size = 2 << 20;
+      options_.max_write_buffer_number = 6;
+    } else if (compaction_style ==
+               CompactionStyle::kCompactionStyleUniversal) {
+      // TODO: expand test coverage to num_levels > 1 for universal compaction,
+      // which requires careful unit test design to compact to level 0 despite
+      // num_levels > 1
+      options_.num_levels = 1;
+      options_.level0_file_num_compaction_trigger = 5;
+
+      CompactionOptionsUniversal universal_options;
+      if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") {
+        universal_options.max_size_amplification_percent = 50;
+      } else if (compaction_path_to_test ==
+                 "PickCompactionToReduceSortedRuns") {
+        universal_options.max_size_amplification_percent = 400;
+      } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") {
+        universal_options.max_size_amplification_percent = 400;
+        universal_options.min_merge_width = 6;
+      }
+      options_.compaction_options_universal = universal_options;
+    } else if (compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+      options_.max_open_files = -1;
+      options_.num_levels = 1;
+      options_.level0_file_num_compaction_trigger = 3;
+
+      CompactionOptionsFIFO fifo_options;
+      if (compaction_path_to_test == "FindIntraL0Compaction" ||
+          compaction_path_to_test == "CompactRange") {
+        fifo_options.allow_compaction = true;
+        fifo_options.age_for_warm = 0;
+      } else if (compaction_path_to_test == "CompactFile") {
+        fifo_options.allow_compaction = false;
+        fifo_options.age_for_warm = 0;
+      }
+      options_.compaction_options_fifo = fifo_options;
+    }
+
+    if (compaction_path_to_test == "CompactFile" ||
+        compaction_path_to_test == "CompactRange") {
+      options_.disable_auto_compactions = true;
+    } else {
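+      // The automatic compaction paths need background compactions enabled
+      // so that reaching level0_file_num_compaction_trigger actually
+      // schedules the compaction under test.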
options_.disable_auto_compactions = false; + } + } - // The L0->L1 must be picked before we begin ingesting files to trigger - // intra-L0 compaction, and must not finish until after an intra-L0 - // compaction has been picked. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompaction:Return", - "DBCompactionTestWithParam::" - "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"}, - {"LevelCompactionPicker::PickCompactionBySize:0", - "CompactionJob::Run():Start"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FindIntraL0Compaction", - [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + void Destroy(const Options& options) { + if (snapshot_) { + assert(db_); + db_->ReleaseSnapshot(snapshot_); + snapshot_ = nullptr; + } + DBTestBase::Destroy(options); + } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + void Reopen(const Options& options) { + DBTestBase::Reopen(options); + if (options.compaction_style != CompactionStyle::kCompactionStyleLevel) { + // To force assigning the global seqno to ingested file + // for our test purpose. + assert(snapshot_ == nullptr); + snapshot_ = db_->GetSnapshot(); + } + } - // prevents trivial move - for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(i), "")); // prevents trivial move + void DestroyAndReopen(Options& options) { + Destroy(options); + Reopen(options); } - ASSERT_OK(Flush()); - Compact("", Key(99)); - ASSERT_EQ(0, NumTableFilesAtLevel(0)); - // Flush 5 L0 sst. - for (int i = 0; i < 5; ++i) { - ASSERT_OK(Put(Key(i + 1), value)); - ASSERT_OK(Flush()); + void PauseCompactionThread() { + sleeping_task_.reset(new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::LOW); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task_.get(), Env::Priority::LOW); + sleeping_task_->WaitUntilSleeping(); + } + + void ResumeCompactionThread() { + if (sleeping_task_) { + sleeping_task_->WakeUp(); + sleeping_task_->WaitUntilDone(); + } } - ASSERT_EQ(5, NumTableFilesAtLevel(0)); - // Put one key, to make smallest log sequence number in this memtable is less - // than sst which would be ingested in next step. - ASSERT_OK(Put(Key(0), "a")); + void AddFilesMarkedForPeriodicCompaction(const size_t num_files) { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + Version* const current = cfd->current(); + assert(current); - ASSERT_EQ(5, NumTableFilesAtLevel(0)); - TEST_SYNC_POINT( - "DBCompactionTestWithParam::" - "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); - // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. - for (int i = 5; i < 10; i++) { - ASSERT_EQ(i, NumTableFilesAtLevel(0)); - IngestOneKeyValue(dbfull(), Key(i), value, options); + const std::vector level0_files = storage_info->LevelFiles(0); + assert(level0_files.size() == num_files); + + for (FileMetaData* f : level0_files) { + storage_info->TEST_AddFileMarkedForPeriodicCompaction(0, f); + } } - // Put one key, to make biggest log sequence number in this memtable is bigger - // than sst which would be ingested in next step. 
- ASSERT_OK(Put(Key(2), "b")); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - std::vector> level_to_files; - dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), - &level_to_files); - ASSERT_GT(level_to_files[0].size(), 0); - ASSERT_GT(pick_intra_l0_count.load(), 0); + void AddFilesMarkedForCompaction(const size_t num_files) { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + Version* const current = cfd->current(); + assert(current); - ASSERT_OK(Flush()); -} + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); -TEST_P(DBCompactionTestWithParam, - IntraL0CompactionAfterFlushCheckConsistencyFail) { - Options options = CurrentOptions(); - options.force_consistency_checks = true; - options.compression = kNoCompression; - options.level0_file_num_compaction_trigger = 5; - options.max_background_compactions = 2; - options.max_subcompactions = max_subcompactions_; - options.write_buffer_size = 2 << 20; - options.max_write_buffer_number = 6; - DestroyAndReopen(options); + const std::vector level0_files = storage_info->LevelFiles(0); + assert(level0_files.size() == num_files); - const size_t kValueSize = 1 << 20; - Random rnd(301); - std::string value(rnd.RandomString(kValueSize)); - std::string value2(rnd.RandomString(kValueSize)); - std::string bigvalue = value + value; + for (FileMetaData* f : level0_files) { + storage_info->TEST_AddFileMarkedForCompaction(0, f); + } + } + + void SetupSyncPoints(const std::string& compaction_path_to_test) { + compaction_path_sync_point_called_.store(false); + if (compaction_path_to_test == "FindIntraL0Compaction" && + options_.compaction_style == CompactionStyle::kCompactionStyleLevel) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PostPickFileToCompact", [&](void* arg) { + bool* picked_file_to_compact = (bool*)arg; + // To trigger intra-L0 compaction specifically, + // we mock PickFileToCompact()'s result to be false + *picked_file_to_compact = false; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + + } else if (compaction_path_to_test == "PickPeriodicCompaction") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PostPickPeriodicCompaction", [&](void* compaction_arg) { + Compaction* compaction = (Compaction*)compaction_arg; + if (compaction != nullptr) { + compaction_path_sync_point_called_.store(true); + } + }); + } else if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PickCompactionToReduceSizeAmpReturnNonnullptr", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } else if (compaction_path_to_test == "PickCompactionToReduceSortedRuns") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PickCompactionToReduceSortedRunsReturnNonnullptr", + [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); 
+ } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PickDeleteTriggeredCompactionReturnNonnullptr", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } else if ((compaction_path_to_test == "FindIntraL0Compaction" || + compaction_path_to_test == "CompactRange") && + options_.compaction_style == + CompactionStyle::kCompactionStyleFIFO) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + bool SyncPointsCalled() { return compaction_path_sync_point_called_.load(); } + + void DisableSyncPoints() { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } - // prevents trivial move + // Return the largest seqno of the latest L0 file based on file number + SequenceNumber GetLatestL0FileLargestSeqnoHelper() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + Version* const current = cfd->current(); + assert(current); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + const std::vector level0_files = storage_info->LevelFiles(0); + assert(level0_files.size() >= 1); + + uint64_t latest_file_num = 0; + uint64_t latest_file_largest_seqno = 0; + for (FileMetaData* f : level0_files) { + if (f->fd.GetNumber() > latest_file_num) { + latest_file_num = f->fd.GetNumber(); + latest_file_largest_seqno = f->fd.largest_seqno; + } + } + + return latest_file_largest_seqno; + } + + protected: + Options options_; + + private: + const Snapshot* snapshot_ = nullptr; + std::atomic compaction_path_sync_point_called_; + std::shared_ptr sleeping_task_; +}; + +TEST_F(DBCompactionTestL0FilesMisorderCorruption, + FlushAfterIntraL0LevelCompactionWithIngestedFile) { + SetupOptions(CompactionStyle::kCompactionStyleLevel, ""); + DestroyAndReopen(options_); + // Prevents trivial move for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(i), "")); // prevents trivial move + ASSERT_OK(Put(Key(i), "")); // Prevents trivial move } ASSERT_OK(Flush()); Compact("", Key(99)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - std::atomic pick_intra_l0_count(0); - // The L0->L1 must be picked before we begin ingesting files to trigger - // intra-L0 compaction, and must not finish until after an intra-L0 - // compaction has been picked. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompaction:Return", - "DBCompactionTestWithParam::" - "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"}, - {"LevelCompactionPicker::PickCompactionBySize:0", - "CompactionJob::Run():Start"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FindIntraL0Compaction", - [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // Make 6 L0 sst. 
+ // To get accurate NumTableFilesAtLevel(0) when the number reaches + // options_.level0_file_num_compaction_trigger + PauseCompactionThread(); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7] + // L0: s6[6:new@13], s5[5:old@6] ... s1[1:old@2],s0[0:old@1] + // + // (1) Make 6 L0 sst (i.e, s0 - s5) for (int i = 0; i < 6; ++i) { if (i % 2 == 0) { - IngestOneKeyValue(dbfull(), Key(i), value, options); + IngestOneKeyValue(dbfull(), Key(i), "old", options_); } else { - ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Put(Key(i), "old")); ASSERT_OK(Flush()); } } - ASSERT_EQ(6, NumTableFilesAtLevel(0)); - // Stop run flush job - env_->SetBackgroundThreads(1, Env::HIGH); - test::SleepingBackgroundTask sleeping_tasks; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks, - Env::Priority::HIGH); - sleeping_tasks.WaitUntilSleeping(); - - // Put many keys to make memtable request to flush + // (2) Create m1 for (int i = 0; i < 6; ++i) { - ASSERT_OK(Put(Key(i), bigvalue)); + ASSERT_OK(Put(Key(i), "new")); } - ASSERT_EQ(6, NumTableFilesAtLevel(0)); - TEST_SYNC_POINT( - "DBCompactionTestWithParam::" - "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"); - // ingest file to trigger IntraL0Compaction - for (int i = 6; i < 10; ++i) { + + // (3) Ingest file (i.e, s6) to trigger IntraL0Compaction() + for (int i = 6; i < 7; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0)); - IngestOneKeyValue(dbfull(), Key(i), value2, options); + IngestOneKeyValue(dbfull(), Key(i), "new", options_); } - // Wake up flush job - sleeping_tasks.WakeUp(); - sleeping_tasks.WaitUntilDone(); + SetupSyncPoints("FindIntraL0Compaction"); + ResumeCompactionThread(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - uint64_t error_count = 0; - db_->GetIntProperty("rocksdb.background-errors", &error_count); - ASSERT_EQ(error_count, 0); - ASSERT_GT(pick_intra_l0_count.load(), 0); + ASSERT_TRUE(SyncPointsCalled()); + DisableSyncPoints(); + + // After compaction, we have LSM tree: + // + // memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7] + // L0: s7[6:new@13, 5:old@6 .. 0:old@1] + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()); + // After flush, we have LSM tree: + // + // L0: s8[5:new@12 .. 0:new@7],s7[6:new@13, 5:old@5 .. 0:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old value + // of Key(0) - Key(5) , which is caused by flushed table s8 has a + // smaller largest seqno than the compaction output file s7's largest seqno + // while the flushed table has the newer version of the values than the + // compaction output file's. 
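+  //
+  // A minimal sketch of the misorder, assuming the seqnos drawn above:
+  //   s7 = compact(s0..s6)  -> largest seqno 13 (from ingested 6:new@13)
+  //   s8 = flush(m1)        -> largest seqno 12, yet strictly newer data
+  // Any reader ordering L0 files by largest seqno would then prefer s7's
+  // stale values over s8's.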
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno); for (int i = 0; i < 6; ++i) { - ASSERT_EQ(bigvalue, Get(Key(i))); + ASSERT_EQ("new", Get(Key(i))); } - for (int i = 6; i < 10; ++i) { - ASSERT_EQ(value2, Get(Key(i))); + for (int i = 6; i < 7; ++i) { + ASSERT_EQ("new", Get(Key(i))); } } +TEST_F(DBCompactionTestL0FilesMisorderCorruption, + FlushAfterIntraL0UniversalCompactionWithIngestedFile) { + for (const std::string compaction_path_to_test : + {"PickPeriodicCompaction", "PickCompactionToReduceSizeAmp", + "PickCompactionToReduceSortedRuns", "PickDeleteTriggeredCompaction"}) { + SetupOptions(CompactionStyle::kCompactionStyleUniversal, + compaction_path_to_test); + DestroyAndReopen(options_); + + // To get accurate NumTableFilesAtLevel(0) when the number reaches + // options_.level0_file_num_compaction_trigger + PauseCompactionThread(); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@8, k1:new@7] + // L0: s4[k9:dummy@10], s3[k8:dummy@9], + // s2[k7:old@6, k6:old@5].. s0[k3:old@2, k1:old@1] + // + // (1) Create 3 existing SST file (i.e, s0 - s2) + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_OK(Put("k4", "old")); + ASSERT_OK(Put("k5", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_OK(Put("k6", "old")); + ASSERT_OK(Put("k7", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(3, NumTableFilesAtLevel(0)); + + // (2) Create m1. Noted that it contains a overlaped key with s0 + ASSERT_OK(Put("k1", "new")); // overlapped key + ASSERT_OK(Put("k2", "new")); + + // (3) Ingest two SST files s3, s4 + IngestOneKeyValue(dbfull(), "k8", "dummy", options_); + IngestOneKeyValue(dbfull(), "k9", "dummy", options_); + // Up to now, L0 contains s0 - s4 + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + if (compaction_path_to_test == "PickPeriodicCompaction") { + AddFilesMarkedForPeriodicCompaction(5); + } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") { + AddFilesMarkedForCompaction(5); + } + + SetupSyncPoints(compaction_path_to_test); + ResumeCompactionThread(); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(SyncPointsCalled()) + << "failed for compaction path to test: " << compaction_path_to_test; + DisableSyncPoints(); + + // After compaction, we have LSM tree: + // + // memtable: m1[ k2:new@8, k1:new@7] + // L0: s5[k9:dummy@10, k8@dummy@9, k7:old@6 .. k3:old@2, k1:old@1] + ASSERT_EQ(1, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()) << "failed for compaction path to test: " + << compaction_path_to_test; + // After flush, we have LSM tree: + // + // L0: s6[k2:new@8, k1:new@7], + // s5[k9:dummy@10, k8@dummy@9, k7:old@6 .. k3:old@2, k1:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old + // value of "k1" , which is caused by flushed table s6 has a + // smaller largest seqno than the compaction output file s5's largest seqno + // while the flushed table has the newer version of the value + // than the compaction output file's. 
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno) + << "failed for compaction path to test: " << compaction_path_to_test; + EXPECT_EQ(Get("k1"), "new") + << "failed for compaction path to test: " << compaction_path_to_test; + } + + Destroy(options_); +} + +TEST_F(DBCompactionTestL0FilesMisorderCorruption, + FlushAfterIntraL0FIFOCompactionWithIngestedFile) { + for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) { + SetupOptions(CompactionStyle::kCompactionStyleFIFO, + compaction_path_to_test); + DestroyAndReopen(options_); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1] + // + // (1) Create an existing SST file s0 + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // (2) Create memtable m1. Noted that it contains a overlaped key with s0 + ASSERT_OK(Put("k1", "new")); // overlapped key + ASSERT_OK(Put("k2", "new")); + + // To get accurate NumTableFilesAtLevel(0) when the number reaches + // options_.level0_file_num_compaction_trigger + PauseCompactionThread(); + + // (3) Ingest two SST files s1, s2 + IngestOneKeyValue(dbfull(), "k4", "dummy", options_); + IngestOneKeyValue(dbfull(), "k5", "dummy", options_); + // Up to now, L0 contains s0, s1, s2 + ASSERT_EQ(3, NumTableFilesAtLevel(0)); + + SetupSyncPoints(compaction_path_to_test); + ResumeCompactionThread(); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(SyncPointsCalled()) + << "failed for compaction path to test: " << compaction_path_to_test; + DisableSyncPoints(); + // After compaction, we have LSM tree: + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1] + ASSERT_EQ(1, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()) << "failed for compaction path to test: " + << compaction_path_to_test; + // After flush, we have LSM tree: + // + // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2, + // k1:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old + // value of "k1" , which is caused by flushed table s4 has a + // smaller largest seqno than the compaction output file s3's largest seqno + // while the flushed table has the newer version of the value + // than the compaction output file's. 
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno) + << "failed for compaction path to test: " << compaction_path_to_test; + EXPECT_EQ(Get("k1"), "new") + << "failed for compaction path to test: " << compaction_path_to_test; + } + + Destroy(options_); +} + +class DBCompactionTestL0FilesMisorderCorruptionWithParam + : public DBCompactionTestL0FilesMisorderCorruption, + public testing::WithParamInterface { + public: + DBCompactionTestL0FilesMisorderCorruptionWithParam() + : DBCompactionTestL0FilesMisorderCorruption() {} +}; + +// TODO: add `CompactionStyle::kCompactionStyleLevel` to testing parameter, +// which requires careful unit test +// design for ingesting file to L0 and CompactRange()/CompactFile() to L0 +INSTANTIATE_TEST_CASE_P( + DBCompactionTestL0FilesMisorderCorruptionWithParam, + DBCompactionTestL0FilesMisorderCorruptionWithParam, + ::testing::Values(CompactionStyle::kCompactionStyleUniversal, + CompactionStyle::kCompactionStyleFIFO)); + +TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam, + FlushAfterIntraL0CompactFileWithIngestedFile) { + SetupOptions(GetParam(), "CompactFile"); + DestroyAndReopen(options_); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1] + // + // (1) Create an existing SST file s0 + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // (2) Create memtable m1. Noted that it contains a overlaped key with s0 + ASSERT_OK(Put("k1", "new")); // overlapped key + ASSERT_OK(Put("k2", "new")); + + // (3) Ingest two SST files s1, s2 + IngestOneKeyValue(dbfull(), "k4", "dummy", options_); + IngestOneKeyValue(dbfull(), "k5", "dummy", options_); + // Up to now, L0 contains s0, s1, s2 + ASSERT_EQ(3, NumTableFilesAtLevel(0)); + + ColumnFamilyMetaData cf_meta_data; + db_->GetColumnFamilyMetaData(&cf_meta_data); + ASSERT_EQ(cf_meta_data.levels[0].files.size(), 3); + std::vector input_files; + for (const auto& file : cf_meta_data.levels[0].files) { + input_files.push_back(file.name); + } + ASSERT_EQ(input_files.size(), 3); + + Status s = db_->CompactFiles(CompactionOptions(), input_files, 0); + // After compaction, we have LSM tree: + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1] + ASSERT_OK(s); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()); + // After flush, we have LSM tree: + // + // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2, + // k1:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old value + // of "1" , which is caused by flushed table s4 has a smaller + // largest seqno than the compaction output file s3's largest seqno while the + // flushed table has the newer version of the value than the + // compaction output file's. 
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno); + EXPECT_EQ(Get("k1"), "new"); + + Destroy(options_); +} + +TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam, + FlushAfterIntraL0CompactRangeWithIngestedFile) { + SetupOptions(GetParam(), "CompactRange"); + DestroyAndReopen(options_); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1] + // + // (1) Create an existing SST file s0 + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // (2) Create memtable m1. Noted that it contains a overlaped key with s0 + ASSERT_OK(Put("k1", "new")); // overlapped key + ASSERT_OK(Put("k2", "new")); + + // (3) Ingest two SST files s1, s2 + IngestOneKeyValue(dbfull(), "k4", "dummy", options_); + IngestOneKeyValue(dbfull(), "k5", "dummy", options_); + // Up to now, L0 contains s0, s1, s2 + ASSERT_EQ(3, NumTableFilesAtLevel(0)); + + if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + SetupSyncPoints("CompactRange"); + } + // `start` and `end` is carefully chosen so that compact range: + // (1) doesn't overlap with memtable therefore the memtable won't be flushed + // (2) should target at compacting s0 with s1 and s2 + Slice start("k3"), end("k5"); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + // After compaction, we have LSM tree: + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1] + if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) { + ASSERT_TRUE(SyncPointsCalled()); + DisableSyncPoints(); + } + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()); + // After flush, we have LSM tree: + // + // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2, + // k1:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old value + // of "k1" , which is caused by flushed table s4 has a smaller + // largest seqno than the compaction output file s3's largest seqno while the + // flushed table has the newer version of the value than the + // compaction output file's. 
+ ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno); + EXPECT_EQ(Get("k1"), "new"); + + Destroy(options_); +} + TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { constexpr int kSstNum = 10; Options options = CurrentOptions(); @@ -8240,8 +9119,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else - (void) argc; - (void) argv; + (void)argc; + (void)argv; return 0; #endif } diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index 13a160958ea..17fa67cb200 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -500,8 +500,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else - (void) argc; - (void) argv; + (void)argc; + (void)argv; return 0; #endif } diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index b7000dd7a2a..73e89d158bd 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -43,7 +43,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { Env* target = GetTargetEnv(); int hits = 0; - for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) { + for (auto it = fileNames.begin(); it != fileNames.end(); ++it) { if (*it == "LOCK") { continue; } @@ -64,24 +64,24 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { ASSERT_OK(status); if (data.ToString().find("foo567") != std::string::npos) { - hits++; - //std::cout << "Hit in " << filePath << "\n"; + hits++; + // std::cout << "Hit in " << filePath << "\n"; } if (data.ToString().find("v1.fetdq") != std::string::npos) { - hits++; - //std::cout << "Hit in " << filePath << "\n"; + hits++; + // std::cout << "Hit in " << filePath << "\n"; } if (data.ToString().find("bar123") != std::string::npos) { - hits++; - //std::cout << "Hit in " << filePath << "\n"; + hits++; + // std::cout << "Hit in " << filePath << "\n"; } if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) { - hits++; - //std::cout << "Hit in " << filePath << "\n"; + hits++; + // std::cout << "Hit in " << filePath << "\n"; } if (data.ToString().find("dfgk") != std::string::npos) { - hits++; - //std::cout << "Hit in " << filePath << "\n"; + hits++; + // std::cout << "Hit in " << filePath << "\n"; } } if (encrypted_env_) { @@ -119,7 +119,7 @@ TEST_F(DBEncryptionTest, ReadEmptyFile) { ASSERT_TRUE(data.empty()); } -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 515abb72890..aa9bd738a51 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -65,8 +65,7 @@ Status DBImpl::FlushForGetLiveFiles() { } Status DBImpl::GetLiveFiles(std::vector& ret, - uint64_t* manifest_file_size, - bool flush_memtable) { + uint64_t* manifest_file_size, bool flush_memtable) { *manifest_file_size = 0; mutex_.Lock(); diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 96d5f5e8c98..5804cd3de18 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -58,7 +58,7 @@ TEST_F(DBFlushTest, FlushWhileWritingManifest) { Reopen(options); FlushOptions no_wait; no_wait.wait = false; - no_wait.allow_write_stall=true; + no_wait.allow_write_stall = true; SyncPoint::GetInstance()->LoadDependency( {{"VersionSet::LogAndApply:WriteManifest", @@ -747,6 +747,64 @@ class TestFlushListener : public EventListener { }; #endif // !ROCKSDB_LITE +// RocksDB lite does not support GetLiveFiles() +#ifndef ROCKSDB_LITE +TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) { + Options 
options = CurrentOptions();
+  options.atomic_flush = true;
+  options.disable_auto_compactions = true;
+  CreateAndReopenWithCF({"cf1"}, options);
+
+  for (int idx = 0; idx < 1; ++idx) {
+    ASSERT_OK(Put(0, Key(idx), std::string(1, 'v')));
+    ASSERT_OK(Put(1, Key(idx), std::string(1, 'v')));
+  }
+
+  // To coerce a manual flush happening in the middle of GetLiveFiles's flush,
+  // we need to pause the background flush thread and resume it later.
+  std::shared_ptr<test::SleepingBackgroundTask> sleeping_task =
+      std::make_shared<test::SleepingBackgroundTask>();
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 sleeping_task.get(), Env::Priority::HIGH);
+  sleeping_task->WaitUntilSleeping();
+
+  // Coerce a manual flush happening in the middle of GetLiveFiles's flush
+  bool get_live_files_paused_at_sync_point = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::AtomicFlushMemTables:AfterScheduleFlush", [&](void* /* arg */) {
+        if (get_live_files_paused_at_sync_point) {
+          // To prevent non-GetLiveFiles() flush from pausing at this sync point
+          return;
+        }
+        get_live_files_paused_at_sync_point = true;
+
+        FlushOptions fo;
+        fo.wait = false;
+        fo.allow_write_stall = true;
+        ASSERT_OK(dbfull()->Flush(fo));
+
+        // Resume background flush thread so GetLiveFiles() can finish
+        sleeping_task->WakeUp();
+        sleeping_task->WaitUntilDone();
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> files;
+  uint64_t manifest_file_size;
+  // Before the fix, a race on the default CF's flush reason, caused by
+  // GetLiveFiles's flush running concurrently with the manual flush, would
+  // fail an internal assertion. After the fix there is no such race and no
+  // assertion failure.
+  ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+  ASSERT_TRUE(get_live_files_paused_at_sync_point);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // !ROCKSDB_LITE
+
 TEST_F(DBFlushTest, MemPurgeBasic) {
   Options options = CurrentOptions();
@@ -1823,8 +1881,8 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
   ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
 #ifndef ROCKSDB_LITE
   uint64_t num_bg_errors;
-  ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors,
-                                  &num_bg_errors));
+  ASSERT_TRUE(
+      db_->GetIntProperty(DB::Properties::kBackgroundErrors, &num_bg_errors));
   ASSERT_GT(num_bg_errors, 0);
 #endif  // ROCKSDB_LITE
@@ -2441,7 +2499,9 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
   options.atomic_flush = GetParam();
   // 64MB so that memtable flush won't be triggered by the small writes.
   options.write_buffer_size = (static_cast<uint64_t>(64) << 20);
-
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
+  options.listeners.push_back(flush_listener);
   // Destroy the DB to recreate as a TransactionDB.
   Close();
   Destroy(options, true);
@@ -2508,7 +2568,6 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
     auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
     ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
     ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
-    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
   }
   // The recovered min log number with prepared data should be non-zero.
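The assertions on cfd->GetFlushReason() deleted above are replaced by listener-based checks: each test installs a flush_listener whose expected_flush_reason is validated on every flush. A minimal sketch of what such a listener presumably looks like; the class name matches the call sites, but members beyond expected_flush_reason are assumed here, not shown in this diff:

#include <atomic>
#include <cassert>

#include "rocksdb/listener.h"

namespace ROCKSDB_NAMESPACE {

class FlushCounterListener : public EventListener {
 public:
  std::atomic<int> count{0};
  std::atomic<FlushReason> expected_flush_reason{FlushReason::kOthers};

  void OnFlushBegin(DB* /*db*/, const FlushJobInfo& info) override {
    count++;
    // FlushJobInfo::flush_reason is now populated from the FlushReason
    // carried by the flush request itself, so concurrent flushes with
    // different reasons no longer race on a shared per-column-family field.
    assert(info.flush_reason == expected_flush_reason.load());
  }
};

}  // namespace ROCKSDB_NAMESPACE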
@@ -2521,13 +2580,15 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
   ASSERT_TRUE(db_impl->allow_2pc());
   ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
 }
-#endif  // ROCKSDB_LITE

 TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   options.atomic_flush = GetParam();
   options.write_buffer_size = (static_cast<uint64_t>(64) << 20);
+  auto flush_listener = std::make_shared<FlushCounterListener>();
+  flush_listener->expected_flush_reason = FlushReason::kManualFlush;
+  options.listeners.push_back(flush_listener);
   CreateAndReopenWithCF({"pikachu", "eevee"}, options);

   size_t num_cfs = handles_.size();
@@ -2552,11 +2613,11 @@

   for (size_t i = 0; i != num_cfs; ++i) {
     auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
-    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
     ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
     ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
   }
 }
+#endif  // ROCKSDB_LITE

 TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) {
   Options options = CurrentOptions();
diff --git a/db/db_impl/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc
index c5947b06b9b..f18ee0d7239 100644
--- a/db/db_impl/compacted_db_impl.cc
+++ b/db/db_impl/compacted_db_impl.cc
@@ -26,16 +26,16 @@ CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
       version_(nullptr),
       user_comparator_(nullptr) {}

-CompactedDBImpl::~CompactedDBImpl() {
-}
+CompactedDBImpl::~CompactedDBImpl() {}

 size_t CompactedDBImpl::FindFile(const Slice& key) {
   size_t right = files_.num_files - 1;
   auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
     return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
   };
-  return static_cast<size_t>(std::lower_bound(files_.files,
-      files_.files + right, key, cmp) - files_.files);
+  return static_cast<size_t>(
+      std::lower_bound(files_.files, files_.files + right, key, cmp) -
+      files_.files);
 }

 Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
@@ -228,8 +228,8 @@ Status CompactedDBImpl::Init(const Options& options) {
   return Status::NotSupported("no file exists");
 }

-Status CompactedDBImpl::Open(const Options& options,
-                             const std::string& dbname, DB** dbptr) {
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+                             DB** dbptr) {
   *dbptr = nullptr;

   if (options.max_open_files != -1) {
diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h
index 9495c447c7c..eb458b85d4e 100644
--- a/db/db_impl/compacted_db_impl.h
+++ b/db/db_impl/compacted_db_impl.h
@@ -7,6 +7,7 @@
 #ifndef ROCKSDB_LITE
 #include <string>
 #include <vector>
+
 #include "db/db_impl/db_impl.h"

 namespace ROCKSDB_NAMESPACE {
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 3b453145a79..7e97bdb6801 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -606,7 +606,7 @@ Status DBImpl::CloseHelper() {
     while (!flush_queue_.empty()) {
       const FlushRequest& flush_req = PopFirstFromFlushQueue();
-      for (const auto& iter : flush_req) {
+      for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) {
         iter.first->UnrefAndTryDelete();
       }
     }
@@ -1061,16 +1061,31 @@ void DBImpl::DumpStats() {
     return;
   }

+  // Also probe block cache(s) for problems, dump to info log
+  UnorderedSet<Cache*> probed_caches;
   TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
   {
     InstrumentedMutexLock l(&mutex_);
     for (auto cfd : versions_->GetRefedColumnFamilySet()) {
-      if (cfd->initialized()) {
-        // Release DB mutex for gathering cache entry stats.
Pass over all
-        // column families for this first so that other stats are dumped
-        // near-atomically.
-        InstrumentedMutexUnlock u(&mutex_);
-        cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+      if (!cfd->initialized()) {
+        continue;
+      }
+
+      // Release DB mutex for gathering cache entry stats. Pass over all
+      // column families for this first so that other stats are dumped
+      // near-atomically.
+      InstrumentedMutexUnlock u(&mutex_);
+      cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+      // Probe block cache for problems (if not already via another CF)
+      if (immutable_db_options_.info_log) {
+        auto* table_factory = cfd->ioptions()->table_factory.get();
+        assert(table_factory != nullptr);
+        Cache* cache =
+            table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+        if (cache && probed_caches.insert(cache).second) {
+          cache->ReportProblems(immutable_db_options_.info_log);
+        }
       }
     }
@@ -1081,18 +1096,7 @@
   default_cf_internal_stats_->GetStringProperty(*property_info, *property,
                                                 &stats);

-  property = &DB::Properties::kCFStatsNoFileHistogram;
-  property_info = GetPropertyInfo(*property);
-  assert(property_info != nullptr);
-  assert(!property_info->need_out_of_mutex);
-  for (auto cfd : *versions_->GetColumnFamilySet()) {
-    if (cfd->initialized()) {
-      cfd->internal_stats()->GetStringProperty(*property_info, *property,
-                                               &stats);
-    }
-  }
-
-  property = &DB::Properties::kCFFileHistogram;
+  property = &InternalStats::kPeriodicCFStats;
   property_info = GetPropertyInfo(*property);
   assert(property_info != nullptr);
   assert(!property_info->need_out_of_mutex);
@@ -1758,7 +1762,7 @@ Status DBImpl::SetDBOptions(
       file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
           file_options_for_compaction_, immutable_db_options_);
       versions_->ChangeFileOptions(mutable_db_options_);
-      //TODO(xiez): clarify why apply optimize for read to write options
+      // TODO(xiez): clarify why apply optimize for read to write options
       file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
           file_options_for_compaction_, immutable_db_options_);
       file_options_for_compaction_.compaction_readahead_size =
@@ -1967,21 +1971,31 @@ Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
 }

 Status DBImpl::LockWAL() {
-  log_write_mutex_.Lock();
-  auto cur_log_writer = logs_.back().writer;
-  IOStatus status = cur_log_writer->WriteBuffer();
-  if (!status.ok()) {
-    ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
-                    status.ToString().c_str());
-    // In case there is a fs error we should set it globally to prevent the
-    // future writes
-    WriteStatusCheck(status);
+  {
+    InstrumentedMutexLock lock(&mutex_);
+    WriteThread::Writer w;
+    write_thread_.EnterUnbatched(&w, &mutex_);
+    WriteThread::Writer nonmem_w;
+    if (two_write_queues_) {
+      nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+    }
+
+    lock_wal_write_token_ = write_controller_.GetStopToken();
+
+    if (two_write_queues_) {
+      nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+    }
+    write_thread_.ExitUnbatched(&w);
   }
-  return static_cast<Status>(status);
+  return FlushWAL(/*sync=*/false);
 }

 Status DBImpl::UnlockWAL() {
-  log_write_mutex_.Unlock();
+  {
+    InstrumentedMutexLock lock(&mutex_);
+    lock_wal_write_token_.reset();
+  }
+  bg_cv_.SignalAll();
   return Status::OK();
 }
@@ -2759,8 +2773,8 @@ std::vector<Status> DBImpl::MultiGet(
     std::string* timestamp = timestamps ?
&(*timestamps)[keys_read] : nullptr; LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp); - auto cfh = - static_cast_with_check(column_family[keys_read]); + auto cfh = static_cast_with_check( + column_family[keys_read]); SequenceNumber max_covering_tombstone_seq = 0; auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); assert(mgd_iter != multiget_cf_data.end()); @@ -4385,8 +4399,7 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { void DBImpl::CleanupSuperVersion(SuperVersion* sv) { // Release SuperVersion if (sv->Unref()) { - bool defer_purge = - immutable_db_options().avoid_unnecessary_blocking_io; + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; { InstrumentedMutexLock l(&mutex_); sv->Cleanup(); @@ -5591,8 +5604,9 @@ Status DBImpl::IngestExternalFiles( for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_, - file_options_, &snapshots_, arg.options, - &directories_, &event_logger_, io_tracer_); + mutable_db_options_, file_options_, &snapshots_, + arg.options, &directories_, &event_logger_, + io_tracer_); } // TODO(yanqin) maybe make jobs run in parallel @@ -5720,10 +5734,12 @@ Status DBImpl::IngestExternalFiles( // Run ingestion jobs. if (status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { + mutex_.AssertHeld(); status = ingestion_jobs[i].Run(); if (!status.ok()) { break; } + ingestion_jobs[i].RegisterRange(); } } if (status.ok()) { @@ -5779,6 +5795,10 @@ Status DBImpl::IngestExternalFiles( } } + for (auto& job : ingestion_jobs) { + job.UnregisterRange(); + } + if (status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { auto* cfd = @@ -5924,6 +5944,7 @@ Status DBImpl::CreateColumnFamilyWithImport( num_running_ingest_file_++; assert(!cfd->IsDropped()); + mutex_.AssertHeld(); status = import_job.Run(); // Install job edit [Mutex will be unlocked here] @@ -6074,8 +6095,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, } } - bool defer_purge = - immutable_db_options().avoid_unnecessary_blocking_io; + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; { InstrumentedMutexLock l(&mutex_); for (auto sv : sv_list) { @@ -6150,13 +6170,6 @@ void DBImpl::NotifyOnExternalFileIngested( } } -void DBImpl::WaitForIngestFile() { - mutex_.AssertHeld(); - while (num_running_ingest_file_ > 0) { - bg_cv_.Wait(); - } -} - Status DBImpl::StartTrace(const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { InstrumentedMutexLock lock(&trace_mutex_); diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index eadb446f483..576e1048888 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -206,6 +207,9 @@ class DBImpl : public DB { using DB::Merge; Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) override; + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override; + using DB::Delete; Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key) override; @@ -478,12 +482,6 @@ class DBImpl : public DB { uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; - // If immutable_db_options_.best_efforts_recovery is true, and - // RocksDbFileChecksumsVerificationEnabledOnRecovery is defined 
and returns - // true, and immutable_db_options_.file_checksum_gen_factory is not nullptr, - // then call VerifyFileChecksums(). - Status MaybeVerifyFileChecksums(); - #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -1209,7 +1207,7 @@ class DBImpl : public DB { int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; - void TEST_WaitForPeridicTaskRun(std::function callback) const; + void TEST_WaitForPeriodicTaskRun(std::function callback) const; SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const; size_t TEST_EstimateInMemoryStatsHistorySize() const; @@ -1431,7 +1429,7 @@ class DBImpl : public DB { void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id); + int job_id, FlushReason flush_reason); void NotifyOnFlushCompleted( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, @@ -1723,12 +1721,17 @@ class DBImpl : public DB { // Argument required by background flush thread. struct BGFlushArg { BGFlushArg() - : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + : cfd_(nullptr), + max_memtable_id_(0), + superversion_context_(nullptr), + flush_reason_(FlushReason::kOthers) {} BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, - SuperVersionContext* superversion_context) + SuperVersionContext* superversion_context, + FlushReason flush_reason) : cfd_(cfd), max_memtable_id_(max_memtable_id), - superversion_context_(superversion_context) {} + superversion_context_(superversion_context), + flush_reason_(flush_reason) {} // Column family to flush. ColumnFamilyData* cfd_; @@ -1739,6 +1742,7 @@ class DBImpl : public DB { // installs a new superversion for the column family. This operation // requires a SuperVersionContext object (currently embedded in JobContext). SuperVersionContext* superversion_context_; + FlushReason flush_reason_; }; // Argument passed to flush thread. @@ -1867,7 +1871,7 @@ class DBImpl : public DB { // installs a new super version for the column family. Status FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* madeProgress, JobContext* job_context, + bool* madeProgress, JobContext* job_context, FlushReason flush_reason, SuperVersionContext* superversion_context, std::vector& snapshot_seqs, SequenceNumber earliest_write_conflict_snapshot, @@ -1913,7 +1917,8 @@ class DBImpl : public DB { // num_bytes: for slowdown case, delay time is calculated based on // `num_bytes` going through. - Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); + Status DelayWrite(uint64_t num_bytes, WriteThread& write_thread, + const WriteOptions& write_options); // Begin stalling of writes when memory usage increases beyond a certain // threshold. @@ -2087,32 +2092,28 @@ class DBImpl : public DB { const int output_level, int output_path_id, JobContext* job_context, LogBuffer* log_buffer, CompactionJobInfo* compaction_job_info); - - // Wait for current IngestExternalFile() calls to finish. 
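// (The FlushRequest struct introduced just below replaces the old
// vector-of-pairs alias, and the flush reason now travels with the request
// instead of living on ColumnFamilyData. A simplified, illustrative flow --
// not code from this diff:
//
//   DBImpl::FlushRequest req;
//   req.flush_reason = FlushReason::kManualFlush;
//   req.cfd_to_max_mem_id_to_persist.emplace(
//       cfd, cfd->imm()->GetLatestMemTableID());
//   SchedulePendingFlush(req);  // queued with its reason attached
//
// BackgroundFlush() later copies flush_reason into each BGFlushArg, which is
// why BGFlushArg above gained a flush_reason_ member.)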
-  // REQUIRES: mutex_ held
-  void WaitForIngestFile();
-#else
-  // IngestExternalFile is not supported in ROCKSDB_LITE so this function
-  // will be no-op
-  void WaitForIngestFile() {}
 #endif  // ROCKSDB_LITE

   ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);

   void MaybeScheduleFlushOrCompaction();

-  // A flush request specifies the column families to flush as well as the
-  // largest memtable id to persist for each column family. Once all the
-  // memtables whose IDs are smaller than or equal to this per-column-family
-  // specified value, this flush request is considered to have completed its
-  // work of flushing this column family. After completing the work for all
-  // column families in this request, this flush is considered complete.
-  using FlushRequest = std::vector<std::pair<ColumnFamilyData*, uint64_t>>;
+  struct FlushRequest {
+    FlushReason flush_reason;
+    // A map from each column family to flush to the largest memtable id to
+    // persist for it. Once all the memtables whose IDs are smaller than or
+    // equal to this per-column-family value have been flushed, this flush
+    // request is considered to have completed its work for that column
+    // family. After completing the work for all column families in this
+    // request, the flush is considered complete.
+    std::unordered_map<ColumnFamilyData*, uint64_t>
+        cfd_to_max_mem_id_to_persist;
+  };

   void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
-                            FlushRequest* req);
+                            FlushReason flush_reason, FlushRequest* req);

-  void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+  void SchedulePendingFlush(const FlushRequest& req);

   void SchedulePendingCompaction(ColumnFamilyData* cfd);
   void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
@@ -2761,6 +2762,10 @@ class DBImpl : public DB {
   // seqno_time_mapping_ stores the sequence number to time mapping; it's not
   // thread safe, so both reads and writes need the db mutex held.
   SeqnoToTimeMapping seqno_time_mapping_;
+
+  // Stop-write token that is acquired when LockWAL() is called. Destructed
+  // when UnlockWAL() is called.
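  // (LockWAL() now installs this token after entering both write queues
  // unbatched, then flushes the WAL via FlushWAL(false); UnlockWAL() resets
  // the token and signals bg_cv_ -- see db_impl.cc above. A hedged usage
  // sketch of the public API pair, from a caller's point of view:
  //
  //   Status CopyWalsWhileQuiescent(DB* db) {
  //     Status s = db->LockWAL();  // flush WAL buffer, stall writers
  //     if (!s.ok()) return s;
  //     // ... read or copy WAL files while writers are held ...
  //     return db->UnlockWAL();   // release token, wake stalled writers
  //   }
  // )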
+  std::unique_ptr<WriteControllerToken> lock_wal_write_token_;
 };

 class GetWithTimestampReadCallback : public ReadCallback {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index e05512d5131..5da5a65822e 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -156,7 +156,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
 Status DBImpl::FlushMemTableToOutputFile(
     ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-    bool* made_progress, JobContext* job_context,
+    bool* made_progress, JobContext* job_context, FlushReason flush_reason,
     SuperVersionContext* superversion_context,
     std::vector<SequenceNumber>& snapshot_seqs,
     SequenceNumber earliest_write_conflict_snapshot,
@@ -216,7 +216,8 @@ Status DBImpl::FlushMemTableToOutputFile(
       dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
       file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
       snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
-      job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U),
+      job_context, flush_reason, log_buffer, directories_.GetDbDir(),
+      GetDataDir(cfd, 0U),
       GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
       &event_logger_, mutable_cf_options.report_bg_io_stats,
       true /* sync_output_directory */, true /* write_manifest */, thread_pri,
@@ -261,7 +262,8 @@
 #ifndef ROCKSDB_LITE
   // may temporarily unlock and lock the mutex.
-  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+  NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id,
+                     flush_reason);
 #endif  // ROCKSDB_LITE

   bool switched_to_mempurge = false;
@@ -391,8 +393,9 @@ Status DBImpl::FlushMemTablesToOutputFiles(
   MutableCFOptions mutable_cf_options_copy = *cfd->GetLatestMutableCFOptions();
   SuperVersionContext* superversion_context =
       bg_flush_arg.superversion_context_;
+  FlushReason flush_reason = bg_flush_arg.flush_reason_;
   Status s = FlushMemTableToOutputFile(
-      cfd, mutable_cf_options_copy, made_progress, job_context,
+      cfd, mutable_cf_options_copy, made_progress, job_context, flush_reason,
       superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
       snapshot_checker, log_buffer, thread_pri);
   return s;
@@ -421,7 +424,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
   for (const auto cfd : cfds) {
     assert(cfd->imm()->NumNotFlushed() != 0);
     assert(cfd->imm()->IsFlushPending());
-    assert(cfd->GetFlushReason() == cfds[0]->GetFlushReason());
+  }
+  for (const auto bg_flush_arg : bg_flush_args) {
+    assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_);
   }
 #endif /* !NDEBUG */
@@ -460,13 +465,15 @@
     all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
     const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
     uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_;
+    FlushReason flush_reason = bg_flush_args[i].flush_reason_;
     jobs.emplace_back(new FlushJob(
         dbname_, cfd, immutable_db_options_, mutable_cf_options,
         max_memtable_id, file_options_for_compaction_, versions_.get(),
         &mutex_, &shutting_down_, snapshot_seqs,
         earliest_write_conflict_snapshot,
-        snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
-        data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
-        stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+        snapshot_checker, job_context, flush_reason, log_buffer,
+
directories_.GetDbDir(), data_dir, + GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, + &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), &blob_callback_)); @@ -484,8 +491,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( for (int i = 0; i != num_cfs; ++i) { const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); // may temporarily unlock and lock the mutex. + FlushReason flush_reason = bg_flush_args[i].flush_reason_; NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, - job_context->job_id); + job_context->job_id, flush_reason); } #endif /* !ROCKSDB_LITE */ @@ -643,8 +651,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( bool resuming_from_bg_err = error_handler_.IsDBStopped() || - (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery || - cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush); + (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery || + bg_flush_args[0].flush_reason_ == + FlushReason::kErrorRecoveryRetryFlush); while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) { std::pair res = wait_to_install_func(); @@ -661,8 +670,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( resuming_from_bg_err = error_handler_.IsDBStopped() || - (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery || - cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush); + (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery || + bg_flush_args[0].flush_reason_ == + FlushReason::kErrorRecoveryRetryFlush); } if (!resuming_from_bg_err) { @@ -817,7 +827,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, - int job_id) { + int job_id, FlushReason flush_reason) { #ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.size() == 0U) { return; @@ -850,7 +860,7 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, info.triggered_writes_stop = triggered_writes_stop; info.smallest_seqno = file_meta->fd.smallest_seqno; info.largest_seqno = file_meta->fd.largest_seqno; - info.flush_reason = cfd->GetFlushReason(); + info.flush_reason = flush_reason; for (auto listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); } @@ -863,6 +873,7 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, (void)file_meta; (void)mutable_cf_options; (void)job_id; + (void)flush_reason; #endif // ROCKSDB_LITE } @@ -1088,6 +1099,22 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, { SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); Version* current_version = super_version->current; + + // Might need to query the partitioner + SstPartitionerFactory* partitioner_factory = + current_version->cfd()->ioptions()->sst_partitioner_factory.get(); + std::unique_ptr partitioner; + if (partitioner_factory && begin != nullptr && end != nullptr) { + SstPartitioner::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = true; + context.output_level = /*unknown*/ -1; + // Small lies about compaction range + context.smallest_user_key = *begin; + context.largest_user_key = *end; + partitioner = partitioner_factory->CreatePartitioner(context); + } + ReadOptions ro; 
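  // (Context for the overlap checks below: SstPartitioner::CanDoTrivialMove(
  // a, b) returns true iff the partitioner would place no cut strictly inside
  // [a, b]. A toy partitioner with a single fixed boundary key illustrates
  // the contract -- illustrative code, not from this diff:
  //
  //   bool CanDoTrivialMove(const Slice& smallest_user_key,
  //                         const Slice& largest_user_key) override {
  //     // No boundary inside [smallest, largest]: a file spanning the range
  //     // can be moved whole without violating the partitioning.
  //     return largest_user_key.compare(boundary_) < 0 ||
  //            smallest_user_key.compare(boundary_) >= 0;
  //   }
  // )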
ro.total_order_seek = true; bool overlap; @@ -1095,14 +1122,50 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, level < current_version->storage_info()->num_non_empty_levels(); level++) { overlap = true; + + // Whether to look at specific keys within files for overlap with + // compaction range, other than largest and smallest keys of the file + // known in Version metadata. + bool check_overlap_within_file = false; if (begin != nullptr && end != nullptr) { + // Typically checking overlap within files in this case + check_overlap_within_file = true; + // WART: Not known why we don't check within file in one-sided bound + // cases + if (partitioner) { + // Especially if the partitioner is new, the manual compaction + // might be used to enforce the partitioning. Checking overlap + // within files might miss cases where compaction is needed to + // partition the files, as in this example: + // * File has two keys "001" and "111" + // * Compaction range is ["011", "101") + // * Partition boundary at "100" + // In cases like this, file-level overlap with the compaction + // range is sufficient to force any partitioning that is needed + // within the compaction range. + // + // But if there's no partitioning boundary within the compaction + // range, we can be sure there's no need to fix partitioning + // within that range, thus safe to check overlap within file. + // + // Use a hypothetical trivial move query to check for partition + // boundary in range. (NOTE: in defiance of all conventions, + // `begin` and `end` here are both INCLUSIVE bounds, which makes + // this analogy to CanDoTrivialMove() accurate even when `end` is + // the first key in a partition.) + if (!partitioner->CanDoTrivialMove(*begin, *end)) { + check_overlap_within_file = false; + } + } + } + if (check_overlap_within_file) { Status status = current_version->OverlapWithLevelIterator( ro, file_options_, *begin, *end, level, &overlap); if (!status.ok()) { - overlap = current_version->storage_info()->OverlapInLevel( - level, begin, end); + check_overlap_within_file = false; } - } else { + } + if (!check_overlap_within_file) { overlap = current_version->storage_info()->OverlapInLevel(level, begin, end); } @@ -1198,6 +1261,12 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, ROCKS_LOG_INFO(immutable_db_options_.info_log, "[RefitLevel] waiting for background threads to stop"); + // TODO(hx235): remove `Enable/DisableManualCompaction` and + // `Continue/PauseBackgroundWork` once we ensure registering RefitLevel()'s + // range is sufficient (if not, what else is needed) for avoiding range + // conflicts with other activities (e.g, compaction, flush) that are + // currently avoided by `Enable/DisableManualCompaction` and + // `Continue/PauseBackgroundWork`. DisableManualCompaction(); s = PauseBackgroundWork(); if (s.ok()) { @@ -1262,13 +1331,6 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, const_cast*>(&manual_compaction_paused_))); { InstrumentedMutexLock l(&mutex_); - - // This call will unlock/lock the mutex to wait for current running - // IngestExternalFile() calls to finish. 
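// (The WaitForIngestFile() barrier removed here -- and again in
// BackgroundCallCompaction() and ShouldntRunManualCompaction() below -- is
// replaced by per-range conflict tracking: IngestExternalFiles() now brackets
// each job with RegisterRange()/UnregisterRange(), so compactions only need
// to avoid key ranges with an in-flight ingestion rather than waiting for
// every ingestion to drain.)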
-    WaitForIngestFile();
-
-    // We need to get current after `WaitForIngestFile`, because
-    // `IngestExternalFile` may add files that overlap with `input_file_names`

     auto* current = cfd->current();
     current->Ref();
@@ -1347,6 +1409,7 @@ Status DBImpl::CompactFilesImpl(
   Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
       &input_set, cf_meta, output_level);
+  TEST_SYNC_POINT("DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles");
   if (!s.ok()) {
     return s;
   }
@@ -1640,6 +1703,10 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {

   InstrumentedMutexLock guard_lock(&mutex_);

+  auto* vstorage = cfd->current()->storage_info();
+  if (vstorage->LevelFiles(level).empty()) {
+    return Status::OK();
+  }
   // only allow one thread refitting
   if (refitting_level_) {
     ROCKS_LOG_INFO(immutable_db_options_.info_log,
@@ -1655,8 +1722,16 @@
     to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
   }

-  auto* vstorage = cfd->current()->storage_info();
   if (to_level != level) {
+    std::vector<CompactionInputFiles> input(1);
+    input[0].level = level;
+    for (auto& f : vstorage->LevelFiles(level)) {
+      input[0].files.push_back(f);
+    }
+    InternalKey refit_level_smallest;
+    InternalKey refit_level_largest;
+    cfd->compaction_picker()->GetRange(input[0], &refit_level_smallest,
+                                       &refit_level_largest);
     if (to_level > level) {
       if (level == 0) {
         refitting_level_ = false;
@@ -1670,6 +1745,14 @@
         return Status::NotSupported(
             "Levels between source and target are not empty for a move.");
       }
+      if (cfd->RangeOverlapWithCompaction(refit_level_smallest.user_key(),
+                                          refit_level_largest.user_key(),
+                                          l)) {
+        refitting_level_ = false;
+        return Status::NotSupported(
+            "Levels between source and target "
+            "will have some ongoing compaction's output.");
+      }
     }
   } else {
     // to_level < level
@@ -1680,22 +1763,51 @@
         return Status::NotSupported(
             "Levels between source and target are not empty for a move.");
       }
+      if (cfd->RangeOverlapWithCompaction(refit_level_smallest.user_key(),
+                                          refit_level_largest.user_key(),
+                                          l)) {
+        refitting_level_ = false;
+        return Status::NotSupported(
+            "Levels between source and target "
+            "will have some ongoing compaction's output.");
+      }
     }
   }

   ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Before refitting:\n%s",
                   cfd->GetName().c_str(), cfd->current()->DebugString().data());
+  std::unique_ptr<Compaction> c(new Compaction(
+      vstorage, *cfd->ioptions(), mutable_cf_options, mutable_db_options_,
+      {input}, to_level,
+      MaxFileSizeForLevel(
+          mutable_cf_options, to_level,
+          cfd->ioptions()
+              ->compaction_style) /* output file size limit, not applicable */,
+      LLONG_MAX /* max compaction bytes, not applicable */,
+      0 /* output path ID, not applicable */, mutable_cf_options.compression,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
+      0 /* max_subcompactions, not applicable */,
+      {} /* grandparents, not applicable */, false /* is manual */,
+      "" /* trim_ts */, -1 /* score, not applicable */,
+      false /* is deletion compaction, not applicable */,
+      false /* l0_files_might_overlap, not applicable */,
+      CompactionReason::kRefitLevel));
+  cfd->compaction_picker()->RegisterCompaction(c.get());
+  TEST_SYNC_POINT("DBImpl::ReFitLevel:PostRegisterCompaction");
   VersionEdit edit;
   edit.SetColumnFamily(cfd->GetID());
+  for (const auto& f :
vstorage->LevelFiles(level)) { edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile( to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, f->file_checksum, - f->file_checksum_func_name, f->unique_id); + f->oldest_ancester_time, f->file_creation_time, f->epoch_number, + f->file_checksum, f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -1704,6 +1816,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, directories_.GetDbDir()); + cfd->compaction_picker()->UnregisterCompaction(c.get()); + c.reset(); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", @@ -1920,11 +2035,19 @@ Status DBImpl::RunManualCompaction( manual.begin, manual.end, &manual.manual_end, &manual_conflict, max_file_num_to_ignore, trim_ts)) == nullptr && manual_conflict))) { - // exclusive manual compactions should not see a conflict during - // CompactRange - assert(!exclusive || !manual_conflict); - // Running either this or some other manual compaction - bg_cv_.Wait(); + if (!scheduled) { + // There is a conflicting compaction + if (manual_compaction_paused_ > 0 || manual.canceled == true) { + // Stop waiting since it was canceled. Pretend the error came from + // compaction so the below cleanup/error handling code can process it. + manual.done = true; + manual.status = + Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + } + if (!manual.done) { + bg_cv_.Wait(); + } if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) { assert(thread_pool_priority != Env::Priority::TOTAL); // unschedule all manual compactions @@ -2002,16 +2125,17 @@ Status DBImpl::RunManualCompaction( } void DBImpl::GenerateFlushRequest(const autovector& cfds, - FlushRequest* req) { + FlushReason flush_reason, FlushRequest* req) { assert(req != nullptr); - req->reserve(cfds.size()); + req->flush_reason = flush_reason; + req->cfd_to_max_mem_id_to_persist.reserve(cfds.size()); for (const auto cfd : cfds) { if (nullptr == cfd) { // cfd may be null, see DBImpl::ScheduleFlushes continue; } uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID(); - req->emplace_back(cfd, max_memtable_id); + req->cfd_to_max_mem_id_to_persist.emplace(cfd, max_memtable_id); } } @@ -2073,7 +2197,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - FlushRequest req{{cfd, flush_memtable_id}}; + FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}}; flush_reqs.emplace_back(std::move(req)); memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); } @@ -2101,10 +2225,10 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, "to avoid holding old logs", cfd->GetName().c_str()); s = SwitchMemtable(cfd_stats, &context); - FlushRequest req{{cfd_stats, flush_memtable_id}}; + FlushRequest req{flush_reason, {{cfd_stats, flush_memtable_id}}}; flush_reqs.emplace_back(std::move(req)); memtable_ids_to_wait.emplace_back( - cfd->imm()->GetLatestMemTableID()); + 
cfd_stats->imm()->GetLatestMemTableID()); } } } @@ -2112,8 +2236,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, if (s.ok() && !flush_reqs.empty()) { for (const auto& req : flush_reqs) { - assert(req.size() == 1); - ColumnFamilyData* loop_cfd = req[0].first; + assert(req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* loop_cfd = + req.cfd_to_max_mem_id_to_persist.begin()->first; loop_cfd->imm()->FlushRequested(); } // If the caller wants to wait for this flush to complete, it indicates @@ -2122,13 +2247,14 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, // Therefore, we increase the cfd's ref count. if (flush_options.wait) { for (const auto& req : flush_reqs) { - assert(req.size() == 1); - ColumnFamilyData* loop_cfd = req[0].first; + assert(req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* loop_cfd = + req.cfd_to_max_mem_id_to_persist.begin()->first; loop_cfd->Ref(); } } for (const auto& req : flush_reqs) { - SchedulePendingFlush(req, flush_reason); + SchedulePendingFlush(req); } MaybeScheduleFlushOrCompaction(); } @@ -2147,8 +2273,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, autovector flush_memtable_ids; assert(flush_reqs.size() == memtable_ids_to_wait.size()); for (size_t i = 0; i < flush_reqs.size(); ++i) { - assert(flush_reqs[i].size() == 1); - cfds.push_back(flush_reqs[i][0].first); + assert(flush_reqs[i].cfd_to_max_mem_id_to_persist.size() == 1); + cfds.push_back(flush_reqs[i].cfd_to_max_mem_id_to_persist.begin()->first); flush_memtable_ids.push_back(&(memtable_ids_to_wait[i])); } s = WaitForFlushMemTables( @@ -2270,8 +2396,8 @@ Status DBImpl::AtomicFlushMemTables( cfd->Ref(); } } - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, flush_reason); + GenerateFlushRequest(cfds, flush_reason, &flush_req); + SchedulePendingFlush(flush_req); MaybeScheduleFlushOrCompaction(); } @@ -2286,7 +2412,7 @@ Status DBImpl::AtomicFlushMemTables( TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"); if (s.ok() && flush_options.wait) { autovector flush_memtable_ids; - for (auto& iter : flush_req) { + for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { flush_memtable_ids.push_back(&(iter.second)); } s = WaitForFlushMemTables( @@ -2633,9 +2759,9 @@ DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { FlushRequest flush_req = flush_queue_.front(); flush_queue_.pop_front(); if (!immutable_db_options_.atomic_flush) { - assert(flush_req.size() == 1); + assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); } - for (const auto& elem : flush_req) { + for (const auto& elem : flush_req.cfd_to_max_mem_id_to_persist) { if (!immutable_db_options_.atomic_flush) { ColumnFamilyData* cfd = elem.first; assert(cfd); @@ -2643,7 +2769,6 @@ DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { cfd->set_queued_for_flush(false); } } - // TODO: need to unset flush reason? return flush_req; } @@ -2673,31 +2798,29 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue( return cfd; } -void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, - FlushReason flush_reason) { +void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) { mutex_.AssertHeld(); - if (flush_req.empty()) { + if (flush_req.cfd_to_max_mem_id_to_persist.empty()) { return; } if (!immutable_db_options_.atomic_flush) { // For the non-atomic flush case, we never schedule multiple column // families in the same flush request. 
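      // (Equivalently: in the FlushRequest struct the
      // cfd_to_max_mem_id_to_persist map holds exactly one entry in the
      // non-atomic case, which is what the rewritten asserts below check.)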
- assert(flush_req.size() == 1); - ColumnFamilyData* cfd = flush_req[0].first; + assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* cfd = + flush_req.cfd_to_max_mem_id_to_persist.begin()->first; assert(cfd); if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { cfd->Ref(); cfd->set_queued_for_flush(true); - cfd->SetFlushReason(flush_reason); ++unscheduled_flushes_; flush_queue_.push_back(flush_req); } } else { - for (auto& iter : flush_req) { + for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { ColumnFamilyData* cfd = iter.first; cfd->Ref(); - cfd->SetFlushReason(flush_reason); } ++unscheduled_flushes_; flush_queue_.push_back(flush_req); @@ -2829,10 +2952,12 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, while (!flush_queue_.empty()) { // This cfd is already referenced const FlushRequest& flush_req = PopFirstFromFlushQueue(); + FlushReason flush_reason = flush_req.flush_reason; superversion_contexts.clear(); - superversion_contexts.reserve(flush_req.size()); + superversion_contexts.reserve( + flush_req.cfd_to_max_mem_id_to_persist.size()); - for (const auto& iter : flush_req) { + for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { ColumnFamilyData* cfd = iter.first; if (cfd->GetMempurgeUsed()) { // If imm() contains silent memtables (e.g.: because @@ -2848,7 +2973,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, } superversion_contexts.emplace_back(SuperVersionContext(true)); bg_flush_args.emplace_back(cfd, iter.second, - &(superversion_contexts.back())); + &(superversion_contexts.back()), flush_reason); } if (!bg_flush_args.empty()) { break; @@ -2872,9 +2997,14 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, job_context, log_buffer, thread_pri); TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush"); - // All the CFDs in the FlushReq must have the same flush reason, so just - // grab the first one - *reason = bg_flush_args[0].cfd_->GetFlushReason(); +// All the CFD/bg_flush_arg in the FlushReq must have the same flush reason, so +// just grab the first one +#ifndef NDEBUG + for (const auto bg_flush_arg : bg_flush_args) { + assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_); + } +#endif /* !NDEBUG */ + *reason = bg_flush_args[0].flush_reason_; for (auto& arg : bg_flush_args) { ColumnFamilyData* cfd = arg.cfd_; if (cfd->UnrefAndTryDelete()) { @@ -2980,10 +3110,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, { InstrumentedMutexLock l(&mutex_); - // This call will unlock/lock the mutex to wait for current running - // IngestExternalFile() calls to finish. 
- WaitForIngestFile(); - num_running_compactions_++; std::unique_ptr::iterator> @@ -3364,8 +3490,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, f->file_checksum_func_name, - f->unique_id); + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size); ROCKS_LOG_BUFFER( log_buffer, @@ -3634,11 +3761,6 @@ void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) { } bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) { - if (num_running_ingest_file_ > 0) { - // We need to wait for other IngestExternalFile() calls to finish - // before running a manual compaction. - return true; - } if (m->exclusive) { return (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0); diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 449b979cde7..c971156b9a3 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -298,7 +298,7 @@ size_t DBImpl::TEST_GetWalPreallocateBlockSize( } #ifndef ROCKSDB_LITE -void DBImpl::TEST_WaitForPeridicTaskRun(std::function callback) const { +void DBImpl::TEST_WaitForPeriodicTaskRun(std::function callback) const { periodic_task_scheduler_.TEST_WaitForRun(callback); } diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index c1b1e4137da..2f732c1e47d 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -136,8 +136,9 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, - f->file_checksum_func_name, f->unique_id); + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 058df4da7e7..9834ff4b27f 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -315,6 +315,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } log_write_mutex_.Unlock(); mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr); log_write_mutex_.Lock(); while (!logs_.empty() && logs_.front().number < min_log_number) { auto& log = logs_.front(); @@ -360,6 +361,8 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, } TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion", &file_deletion_status); + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion2", + const_cast(&fname)); if (file_deletion_status.ok()) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id, diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index ca40dd86bf0..958652c9fc4 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -25,15 +25,6 @@ #include "test_util/sync_point.h" #include "util/rate_limiter.h" -#if !defined(ROCKSDB_LITE) && defined(OS_LINUX) -// VerifyFileChecksums is a weak symbol. 
-// If it is defined and returns true, and options.best_efforts_recovery = true, -// and file checksum is enabled, then the checksums of table files will be -// computed and verified with MANIFEST. -extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery() - __attribute__((__weak__)); -#endif // !ROCKSDB_LITE && OS_LINUX - namespace ROCKSDB_NAMESPACE { Options SanitizeOptions(const std::string& dbname, const Options& src, bool read_only, Status* logger_creation_s) { @@ -279,7 +270,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { if (db_options.unordered_write && !db_options.allow_concurrent_memtable_write) { return Status::InvalidArgument( - "unordered_write is incompatible with !allow_concurrent_memtable_write"); + "unordered_write is incompatible with " + "!allow_concurrent_memtable_write"); } if (db_options.unordered_write && db_options.enable_pipelined_write) { @@ -1085,9 +1077,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, std::unique_ptr file_reader; { std::unique_ptr file; - status = fs_->NewSequentialFile(fname, - fs_->OptimizeForLogRead(file_options_), - &file, nullptr); + status = fs_->NewSequentialFile( + fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr); if (!status.ok()) { MaybeIgnoreError(&status); if (!status.ok()) { @@ -1544,7 +1535,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); meta.oldest_ancester_time = current_time; - + meta.epoch_number = cfd->NewEpochNumber(); { auto write_hint = cfd->CalculateSSTWriteHint(0); mutex_.Unlock(); @@ -1579,6 +1570,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, 0 /* file_creation_time */, db_id_, db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); SeqnoToTimeMapping empty_seqno_time_mapping; + Version* version = cfd->current(); + version->Ref(); s = BuildTable( dbname_, versions_.get(), immutable_db_options_, tboptions, file_options_for_compaction_, cfd->table_cache(), iter.get(), @@ -1588,7 +1581,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, io_tracer_, BlobFileCreationReason::kRecovery, empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, write_hint, - nullptr /*full_history_ts_low*/, &blob_callback_); + nullptr /*full_history_ts_low*/, &blob_callback_, version); + version->Unref(); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -1617,8 +1611,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, meta.oldest_blob_file_number, meta.oldest_ancester_time, - meta.file_creation_time, meta.file_checksum, - meta.file_checksum_func_name, meta.unique_id); + meta.file_creation_time, meta.epoch_number, + meta.file_checksum, meta.file_checksum_func_name, + meta.unique_id, meta.compensated_range_deletion_size); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); @@ -1648,22 +1643,6 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, return s; } -Status DBImpl::MaybeVerifyFileChecksums() { - Status s; -#if !defined(ROCKSDB_LITE) && defined(OS_LINUX) - // TODO: remove the VerifyFileChecksums() call because it's very expensive. 
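// (For context on the mechanism being deleted: a weak C symbol lets a binary
// opt in at link time, and an undefined weak symbol resolves to null. A
// generic sketch of the pattern -- illustrative, not RocksDB API:
//
//   extern "C" bool SomeOptInHook() __attribute__((__weak__));
//
//   bool HookEnabled() {
//     // Test the symbol's address before calling it -- the same
//     // "Hook && Hook()" shape as in the removed code below.
//     return SomeOptInHook && SomeOptInHook();
//   }
// )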
- if (immutable_db_options_.best_efforts_recovery && - RocksDbFileChecksumsVerificationEnabledOnRecovery && - RocksDbFileChecksumsVerificationEnabledOnRecovery() && - immutable_db_options_.file_checksum_gen_factory) { - s = VerifyFileChecksums(ReadOptions()); - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Verified file checksums: %s\n", s.ToString().c_str()); - } -#endif // !ROCKSDB_LITE && OS_LINUX - return s; -} - Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { DBOptions db_options(options); ColumnFamilyOptions cf_options(options); diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 1ae5350a2a0..0f10baf2497 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -305,9 +305,6 @@ Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( } impl->mutex_.Unlock(); sv_context.Clean(); - if (s.ok()) { - s = impl->MaybeVerifyFileChecksums(); - } if (s.ok()) { *dbptr = impl; for (auto* h : *handles) { diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 344fa472021..b876a0fdaf0 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -9,6 +9,7 @@ #include #include + #include "db/db_impl/db_impl.h" namespace ROCKSDB_NAMESPACE { diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index f009352c20f..5189d17d984 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -157,8 +157,7 @@ Status DBImplSecondary::MaybeInitLogReader( { std::unique_ptr file; Status status = fs_->NewSequentialFile( - fname, fs_->OptimizeForLogRead(file_options_), &file, - nullptr); + fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr); if (!status.ok()) { *log_reader = nullptr; return status; @@ -200,7 +199,7 @@ Status DBImplSecondary::RecoverLogFiles( assert(reader != nullptr); } for (auto log_number : log_numbers) { - auto it = log_readers_.find(log_number); + auto it = log_readers_.find(log_number); assert(it != log_readers_.end()); log::FragmentBufferedReader* reader = it->second->reader_; Status* wal_read_status = it->second->status_; diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index a58ea4449a0..eb93618752c 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -47,6 +47,7 @@ class LogReaderContainer { delete reporter_; delete status_; } + private: struct LogReporter : public log::Reader::Reporter { Env* env; @@ -247,7 +248,6 @@ class DBImplSecondary : public DBImpl { // method can take long time due to all the I/O and CPU costs. 
Status TryCatchUpWithPrimary() override; - // Try to find log reader using log_number from log_readers_ map, initialize // if it doesn't exist Status MaybeInitLogReader(uint64_t log_number, diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 04210e836cc..3b9bd7b80eb 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -63,6 +63,15 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, } } +Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& val) { + const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + if (!s.ok()) { + return s; + } + return DB::Merge(o, column_family, key, ts, val); +} + Status DBImpl::Delete(const WriteOptions& write_options, ColumnFamilyHandle* column_family, const Slice& key) { const Status s = FailIfCfHasTs(column_family); @@ -961,6 +970,16 @@ Status DBImpl::WriteImplWALOnly( write_thread->ExitAsBatchGroupLeader(write_group, status); return status; } + } else { + InstrumentedMutexLock lock(&mutex_); + Status status = + DelayWrite(/*num_bytes=*/0ull, *write_thread, write_options); + if (!status.ok()) { + WriteThread::WriteGroup write_group; + write_thread->EnterAsBatchGroupLeader(&w, &write_group); + write_thread->ExitAsBatchGroupLeader(write_group, status); + return status; + } } WriteThread::WriteGroup write_group; @@ -1229,7 +1248,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // might happen for smaller writes but larger writes can go through. // Can optimize it if it is an issue. InstrumentedMutexLock l(&mutex_); - status = DelayWrite(last_batch_group_size_, write_options); + status = DelayWrite(last_batch_group_size_, write_thread_, write_options); PERF_TIMER_START(write_pre_and_post_process_time); } @@ -1689,14 +1708,14 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { cfd->imm()->FlushRequested(); if (!immutable_db_options_.atomic_flush) { FlushRequest flush_req; - GenerateFlushRequest({cfd}, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWalFull); + GenerateFlushRequest({cfd}, FlushReason::kWalFull, &flush_req); + SchedulePendingFlush(flush_req); } } if (immutable_db_options_.atomic_flush) { FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWalFull); + GenerateFlushRequest(cfds, FlushReason::kWalFull, &flush_req); + SchedulePendingFlush(flush_req); } MaybeScheduleFlushOrCompaction(); } @@ -1794,14 +1813,15 @@ Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { cfd->imm()->FlushRequested(); if (!immutable_db_options_.atomic_flush) { FlushRequest flush_req; - GenerateFlushRequest({cfd}, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + GenerateFlushRequest({cfd}, FlushReason::kWriteBufferManager, + &flush_req); + SchedulePendingFlush(flush_req); } } if (immutable_db_options_.atomic_flush) { FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + GenerateFlushRequest(cfds, FlushReason::kWriteBufferManager, &flush_req); + SchedulePendingFlush(flush_req); } MaybeScheduleFlushOrCompaction(); } @@ -1818,16 +1838,25 @@ uint64_t DBImpl::GetMaxTotalWalSize() const { } // REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::DelayWrite(uint64_t num_bytes, +// REQUIRES: this thread is 
currently at the leader for write_thread +Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, const WriteOptions& write_options) { + mutex_.AssertHeld(); uint64_t time_delayed = 0; bool delayed = false; { StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, &time_delayed); - uint64_t delay = - write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); + // To avoid parallel timed delays (bad throttling), only support them + // on the primary write queue. + uint64_t delay; + if (&write_thread == &write_thread_) { + delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); + } else { + assert(num_bytes == 0); + delay = 0; + } TEST_SYNC_POINT("DBImpl::DelayWrite:Start"); if (delay > 0) { if (write_options.no_slowdown) { @@ -1835,9 +1864,9 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, } TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); - // Notify write_thread_ about the stall so it can setup a barrier and + // Notify write_thread about the stall so it can setup a barrier and // fail any pending writers with no_slowdown - write_thread_.BeginWriteStall(); + write_thread.BeginWriteStall(); mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); // We will delay the write until we have slept for `delay` microseconds @@ -1857,7 +1886,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); - write_thread_.EndWriteStall(); + write_thread.EndWriteStall(); } // Don't wait if there's a background error, even if its a soft error. We @@ -1871,12 +1900,12 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, } delayed = true; - // Notify write_thread_ about the stall so it can setup a barrier and + // Notify write_thread about the stall so it can setup a barrier and // fail any pending writers with no_slowdown - write_thread_.BeginWriteStall(); + write_thread.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); bg_cv_.Wait(); - write_thread_.EndWriteStall(); + write_thread.EndWriteStall(); } } assert(!delayed || !write_options.no_slowdown); @@ -2080,13 +2109,13 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + GenerateFlushRequest(cfds, FlushReason::kWriteBufferFull, &flush_req); + SchedulePendingFlush(flush_req); } else { for (auto* cfd : cfds) { FlushRequest flush_req; - GenerateFlushRequest({cfd}, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + GenerateFlushRequest({cfd}, FlushReason::kWriteBufferFull, &flush_req); + SchedulePendingFlush(flush_req); } } MaybeScheduleFlushOrCompaction(); @@ -2568,4 +2597,21 @@ Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, } return Write(opt, &batch); } + +Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + opt.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + Status s = batch.Merge(column_family, key, ts, value); + if (!s.ok()) { + return s; + } + return Write(opt, &batch); +} + } // namespace 
ROCKSDB_NAMESPACE diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index df17a5c96ff..be8d5bee1cd 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -6,6 +6,7 @@ #include "db/db_info_dumper.h" #include + #include #include #include diff --git a/db/db_iter.cc b/db/db_iter.cc index 5bc9bb8d009..1e4a735dca4 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -158,6 +158,8 @@ void DBIter::Next() { local_stats_.next_count_++; if (ok && iter_.Valid()) { + ClearSavedValue(); + if (prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); const Slice prefix = prefix_.GetUserKey(); @@ -523,11 +525,13 @@ bool DBIter::MergeValuesNewToOld() { return false; } - if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { // hit the next user key, stop right here break; } - if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type) { + if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || + kTypeDeletionWithTimestamp == ikey.type) { // hit a delete with the same user key, stop right here // iter_ is positioned after delete iter_.Next(); @@ -542,8 +546,7 @@ bool DBIter::MergeValuesNewToOld() { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! const Slice val = iter_.value(); - Status s = Merge(&val, ikey.user_key); - if (!s.ok()) { + if (!Merge(&val, ikey.user_key)) { return false; } // iter_ is positioned after put @@ -572,8 +575,7 @@ bool DBIter::MergeValuesNewToOld() { return false; } valid_ = true; - Status s = Merge(&blob_value_, ikey.user_key); - if (!s.ok()) { + if (!Merge(&blob_value_, ikey.user_key)) { return false; } @@ -587,11 +589,18 @@ bool DBIter::MergeValuesNewToOld() { } return true; } else if (kTypeWideColumnEntity == ikey.type) { - // TODO: support wide-column entities - status_ = Status::NotSupported( - "Merge currently not supported for wide-column entities"); - valid_ = false; - return false; + if (!MergeEntity(iter_.value(), ikey.user_key)) { + return false; + } + + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + + return true; } else { valid_ = false; status_ = Status::Corruption( @@ -610,8 +619,7 @@ bool DBIter::MergeValuesNewToOld() { // a deletion marker. // feed null as the existing value to the merge operator, such that // client can differentiate this scenario and do things accordingly. 
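For reference, the null-base contract described in that comment is what a user-defined merge operator sees as existing_value == nullptr in FullMergeV2. A minimal sketch of an operator honoring it; the class name and comma-append behavior are illustrative assumptions, not part of this patch:

#include <string>

#include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h"

class CommaAppendOperator : public ROCKSDB_NAMESPACE::MergeOperator {
 public:
  bool FullMergeV2(const MergeOperationInput& in,
                   MergeOperationOutput* out) const override {
    out->new_value.clear();
    // in.existing_value == nullptr signals "no base value", e.g. the
    // operands sit on top of a deletion (or on nothing at all).
    if (in.existing_value != nullptr) {
      out->new_value.assign(in.existing_value->data(),
                            in.existing_value->size());
    }
    for (const ROCKSDB_NAMESPACE::Slice& operand : in.operand_list) {
      if (!out->new_value.empty()) {
        out->new_value.push_back(',');
      }
      out->new_value.append(operand.data(), operand.size());
    }
    // Returning false would make the merge fail; DBIter then reports an
    // error status instead of a value.
    return true;
  }

  const char* Name() const override { return "CommaAppendOperator"; }
};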
- Status s = Merge(nullptr, saved_key_.GetUserKey()); - if (!s.ok()) { + if (!Merge(nullptr, saved_key_.GetUserKey())) { return false; } assert(status_.ok()); @@ -634,6 +642,8 @@ void DBIter::Prev() { } } if (ok) { + ClearSavedValue(); + Slice prefix; if (prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); @@ -956,9 +966,9 @@ bool DBIter::FindValueForCurrentKey() { case kTypeMerge: current_entry_is_merged_ = true; if (last_not_merge_type == kTypeDeletion || - last_not_merge_type == kTypeSingleDeletion) { - s = Merge(nullptr, saved_key_.GetUserKey()); - if (!s.ok()) { + last_not_merge_type == kTypeSingleDeletion || + last_not_merge_type == kTypeDeletionWithTimestamp) { + if (!Merge(nullptr, saved_key_.GetUserKey())) { return false; } return true; @@ -973,8 +983,7 @@ bool DBIter::FindValueForCurrentKey() { return false; } valid_ = true; - s = Merge(&blob_value_, saved_key_.GetUserKey()); - if (!s.ok()) { + if (!Merge(&blob_value_, saved_key_.GetUserKey())) { return false; } @@ -982,15 +991,14 @@ bool DBIter::FindValueForCurrentKey() { return true; } else if (last_not_merge_type == kTypeWideColumnEntity) { - // TODO: support wide-column entities - status_ = Status::NotSupported( - "Merge currently not supported for wide-column entities"); - valid_ = false; - return false; + if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) { + return false; + } + + return true; } else { assert(last_not_merge_type == kTypeValue); - s = Merge(&pinned_value_, saved_key_.GetUserKey()); - if (!s.ok()) { + if (!Merge(&pinned_value_, saved_key_.GetUserKey())) { return false; } return true; @@ -1159,10 +1167,12 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (!ParseKey(&ikey)) { return false; } - if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { break; } - if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) { + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || + ikey.type == kTypeDeletionWithTimestamp) { break; } if (!iter_.PrepareValue()) { @@ -1172,8 +1182,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeValue) { const Slice val = iter_.value(); - Status s = Merge(&val, saved_key_.GetUserKey()); - if (!s.ok()) { + if (!Merge(&val, saved_key_.GetUserKey())) { return false; } return true; @@ -1192,8 +1201,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return false; } valid_ = true; - Status s = Merge(&blob_value_, saved_key_.GetUserKey()); - if (!s.ok()) { + if (!Merge(&blob_value_, saved_key_.GetUserKey())) { return false; } @@ -1201,11 +1209,11 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } else if (ikey.type == kTypeWideColumnEntity) { - // TODO: support wide-column entities - status_ = Status::NotSupported( - "Merge currently not supported for wide-column entities"); - valid_ = false; - return false; + if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) { + return false; + } + + return true; } else { valid_ = false; status_ = Status::Corruption( @@ -1215,8 +1223,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } } - Status s = Merge(nullptr, saved_key_.GetUserKey()); - if (!s.ok()) { + if (!Merge(nullptr, saved_key_.GetUserKey())) { return false; } @@ -1239,21 +1246,47 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } -Status DBIter::Merge(const Slice* val, const Slice& user_key) { +bool DBIter::Merge(const Slice* val, const Slice& user_key) { + // 
`op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. Status s = MergeHelper::TimedFullMerge( merge_operator_, user_key, val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, clock_, &pinned_value_, true); + &saved_value_, logger_, statistics_, clock_, &pinned_value_, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); if (!s.ok()) { valid_ = false; status_ = s; - return s; + return false; } SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_ : saved_value_); valid_ = true; - return s; + return true; +} + +bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) { + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. + Status s = MergeHelper::TimedFullMergeWithEntity( + merge_operator_, user_key, entity, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, clock_, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + + if (!SetValueAndColumnsFromEntity(saved_value_)) { + return false; + } + + valid_ = true; + return true; } // Move backwards until the key smaller than saved_key_. diff --git a/db/db_iter.h b/db/db_iter.h index d81d1f0ef43..e87c2b4c90e 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -317,7 +317,9 @@ class DBIter final : public Iterator { wide_columns_.clear(); } - Status Merge(const Slice* val, const Slice& user_key); + // If user-defined timestamp is enabled, `user_key` includes timestamp. + bool Merge(const Slice* val, const Slice& user_key); + bool MergeEntity(const Slice& entity, const Slice& user_key); const SliceTransform* prefix_extractor_; Env* const env_; diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index ca1a1fd95f5..872f7e6bd93 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -392,7 +392,7 @@ struct ReferenceIterator { } }; -} // namespace +} // anonymous namespace // Use an internal iterator that sometimes returns errors and sometimes // adds/removes entries on the fly. Do random operations on a DBIter and @@ -482,12 +482,11 @@ TEST_F(DBIteratorStressTest, StressTest) { std::cout << "entries:"; for (size_t i = 0; i < data.entries.size(); ++i) { Entry& e = data.entries[i]; - std::cout - << "\n idx " << i << ": \"" << e.key << "\": \"" - << e.value << "\" seq: " << e.sequence << " type: " - << (e.type == kTypeValue - ? "val" - : e.type == kTypeDeletion ? "del" : "merge"); + std::cout << "\n idx " << i << ": \"" << e.key << "\": \"" + << e.value << "\" seq: " << e.sequence << " type: " + << (e.type == kTypeValue ? "val" + : e.type == kTypeDeletion ? "del" + : "merge"); } std::cout << std::endl; } diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 545d48a1f5f..65290bfad38 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -3,12 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
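With MergeEntity in place above, positioning an iterator on a key whose newest base is a wide-column entity no longer fails. A hedged usage sketch (assumes a DB opened with a merge operator installed; the key, column names, and operand are made up):

#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

void MergeOnWideColumnEntity(ROCKSDB_NAMESPACE::DB* db) {
  using ROCKSDB_NAMESPACE::Iterator;
  using ROCKSDB_NAMESPACE::ReadOptions;
  using ROCKSDB_NAMESPACE::WideColumns;
  using ROCKSDB_NAMESPACE::WriteOptions;

  WideColumns columns{{"attr1", "one"}, {"attr2", "two"}};
  assert(db->PutEntity(WriteOptions(), db->DefaultColumnFamily(), "k", columns)
             .ok());
  assert(db->Merge(WriteOptions(), db->DefaultColumnFamily(), "k", "operand")
             .ok());

  // Before this change, landing on "k" here surfaced Status::NotSupported;
  // now the merge is applied and the iterator is valid.
  std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
  it->Seek("k");
  assert(it->Valid());
}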
-#include -#include +#include "db/db_iter.h" + #include +#include #include +#include -#include "db/db_iter.h" #include "db/dbformat.h" #include "rocksdb/comparator.h" #include "rocksdb/options.h" @@ -82,8 +83,8 @@ class TestIterator : public InternalIterator { std::sort(data_.begin(), data_.end(), [this](std::pair a, std::pair b) { - return (cmp.Compare(a.first, b.first) < 0); - }); + return (cmp.Compare(a.first, b.first) < 0); + }); } // Removes the key from the set of keys over which this iterator iterates. @@ -429,7 +430,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(static_cast(get_perf_context()->internal_key_skipped_count), 1); + ASSERT_EQ(static_cast(get_perf_context()->internal_key_skipped_count), + 1); ASSERT_EQ(db_iter->key().ToString(), "b"); SetPerfLevel(kDisable); @@ -557,7 +559,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(static_cast(get_perf_context()->internal_delete_skipped_count), 0); + ASSERT_EQ( + static_cast(get_perf_context()->internal_delete_skipped_count), 0); ASSERT_EQ(db_iter->key().ToString(), "b"); SetPerfLevel(kDisable); @@ -3013,7 +3016,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } - TEST_F(DBIteratorTest, SeekPrefixTombstones) { ReadOptions ro; Options options; diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 51c81585cb3..aaf1408b48a 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -236,7 +236,7 @@ namespace { std::string MakeLongKey(size_t length, char c) { return std::string(length, c); } -} // namespace +} // anonymous namespace TEST_P(DBIteratorTest, IterLongKeys) { ASSERT_OK(Put(MakeLongKey(20, 0), "0")); @@ -1037,7 +1037,8 @@ TEST_P(DBIteratorTest, DBIteratorBoundTest) { iter->Next(); ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(static_cast(get_perf_context()->internal_delete_skipped_count), 2); + ASSERT_EQ( + static_cast(get_perf_context()->internal_delete_skipped_count), 2); // now testing with iterate_bound Slice prefix("c"); @@ -1060,7 +1061,8 @@ TEST_P(DBIteratorTest, DBIteratorBoundTest) { // even though the key is deleted // hence internal_delete_skipped_count should be 0 ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(static_cast(get_perf_context()->internal_delete_skipped_count), 0); + ASSERT_EQ( + static_cast(get_perf_context()->internal_delete_skipped_count), 0); } } @@ -1536,7 +1538,7 @@ class DBIteratorTestForPinnedData : public DBIteratorTest { } delete iter; -} + } }; #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) @@ -2180,8 +2182,8 @@ TEST_P(DBIteratorTest, IteratorWithLocalStatistics) { ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev); ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND), (uint64_t)total_prev_found); - ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), (uint64_t)total_bytes); - + ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), + (uint64_t)total_bytes); } TEST_P(DBIteratorTest, ReadAhead) { @@ -2310,8 +2312,8 @@ TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) { EXPECT_EQ(get_perf_context()->internal_merge_count, 0); EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2); EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2); - EXPECT_EQ(1, options.statistics->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION)); + EXPECT_EQ(1, + 
options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); } TEST_P(DBIteratorTest, Refresh) { @@ -2592,7 +2594,7 @@ TEST_P(DBIteratorTest, SkipStatistics) { } ASSERT_EQ(count, 3); delete iter; - skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence + skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); iter = NewIterator(ReadOptions()); @@ -2603,7 +2605,7 @@ TEST_P(DBIteratorTest, SkipStatistics) { } ASSERT_EQ(count, 3); delete iter; - skip_count += 8; // Same as above, but in reverse order + skip_count += 8; // Same as above, but in reverse order ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); ASSERT_OK(Put("aa", "1")); @@ -2621,18 +2623,18 @@ TEST_P(DBIteratorTest, SkipStatistics) { iter = NewIterator(ro); count = 0; - for(iter->Seek("aa"); iter->Valid(); iter->Next()) { + for (iter->Seek("aa"); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); count++; } ASSERT_EQ(count, 1); delete iter; - skip_count += 6; // 3 deletes + 3 original keys + skip_count += 6; // 3 deletes + 3 original keys ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); iter = NewIterator(ro); count = 0; - for(iter->SeekToLast(); iter->Valid(); iter->Prev()) { + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { ASSERT_OK(iter->status()); count++; } diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index f0cf215e119..4e982858c4c 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -55,14 +55,13 @@ SequenceNumber ReadRecords(std::unique_ptr& iter, return res.sequence; } -void ExpectRecords( - const int expected_no_records, - std::unique_ptr& iter) { +void ExpectRecords(const int expected_no_records, + std::unique_ptr& iter) { int num_records; ReadRecords(iter, num_records); ASSERT_EQ(num_records, expected_no_records); } -} // namespace +} // anonymous namespace TEST_F(DBTestXactLogIterator, TransactionLogIterator) { do { @@ -95,10 +94,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIterator) { TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { static const int LOG_ITERATOR_RACE_TEST_COUNT = 2; static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = { - {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1", + {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1", "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"}, - {"WalManager::GetSortedWalsOfType:1", - "WalManager::PurgeObsoleteFiles:1", + {"WalManager::GetSortedWalsOfType:1", "WalManager::PurgeObsoleteFiles:1", "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalsOfType:2"}}; for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) { @@ -300,8 +298,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else - (void) argc; - (void) argv; + (void)argc; + (void)argv; return 0; #endif } diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index ae5cded0e05..13c16618e6b 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -224,8 +224,8 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) { // Now cf_path_0_ in cache_ has been properly decreased and cf_path_0_'s entry // is dropped from cache ASSERT_EQ(0, cache_->Size()); - ASSERT_OK(DestroyDB(dbname_, options, - {{"cf1", cf_options}, {"cf2", cf_options}})); + ASSERT_OK( + DestroyDB(dbname_, options, 
{{"cf1", cf_options}, {"cf2", cf_options}})); } TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { @@ -313,8 +313,8 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { delete db; ASSERT_EQ(0, cache_->Size()); } - ASSERT_OK(DestroyDB(dbname_, options, - {{"cf1", cf_options}, {"cf2", cf_options}})); + ASSERT_OK( + DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}})); } TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index 1ae5f32873f..629d3923f63 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -39,7 +39,7 @@ class LimitedStringAppendMergeOp : public StringAppendTESTOperator { private: size_t limit_ = 0; }; -} // namespace +} // anonymous namespace class DBMergeOperandTest : public DBTestBase { public: @@ -439,6 +439,48 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) { } } +TEST_F(DBMergeOperandTest, GetMergeOperandsBaseDeletionInImmMem) { + // In this test, "k1" has a MERGE in a mutable memtable on top of a base + // DELETE in an immutable memtable. + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 10; + opts.min_write_buffer_number_to_merge = 10; + opts.merge_operator = MergeOperators::CreateDeprecatedPutOperator(); + Reopen(opts); + + ASSERT_OK(Put("k1", "val")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("k0", "val")); + ASSERT_OK(Delete("k1")); + ASSERT_OK(Put("k2", "val")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(Merge("k1", "val")); + + { + std::vector values(2); + + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = + static_cast(values.size()); + + std::string key = "k1", from_db; + int number_of_operands = 0; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + key, values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(1, number_of_operands); + from_db = values[0].ToString(); + ASSERT_EQ("val", from_db); + } + + { + std::string val; + ASSERT_OK(db_->Get(ReadOptions(), "k1", &val)); + ASSERT_EQ("val", val); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 8e551d08dcf..f8c90c15871 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -84,8 +84,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { Options options; options.create_if_missing = true; // Use only the latest two merge operands. - options.merge_operator = - std::make_shared(2, ','); + options.merge_operator = std::make_shared(2, ','); options.env = env_; Reopen(options); // All K1 values are in memtable. @@ -203,6 +202,160 @@ TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) { VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}}); } +#ifndef ROCKSDB_LITE + +TEST_F(DBMergeOperatorTest, MergeOperatorFailsWithMustMerge) { + // This is like a mini-stress test dedicated to `OpFailureScope::kMustMerge`. + // Some or most of it might be deleted upon adding that option to the actual + // stress test. + // + // "k0" and "k2" are stable (uncorrupted) keys before and after a corrupted + // key ("k1"). The outer loop (`i`) varies which write (`j`) to "k1" triggers + // the corruption. 
Inside that loop there are three cases: + // + // - Case 1: pure `Merge()`s + // - Case 2: `Merge()`s on top of a `Put()` + // - Case 3: `Merge()`s on top of a `Delete()` + // + // For each case we test query results before flush, after flush, and after + // compaction, as well as cleanup after deletion+compaction. The queries + // expect "k0" and "k2" to always be readable. "k1" is expected to be readable + // only by APIs that do not require merging, such as `GetMergeOperands()`. + const int kNumOperands = 3; + Options options; + options.merge_operator.reset(new TestPutOperator()); + options.env = env_; + Reopen(options); + + for (int i = 0; i < kNumOperands; ++i) { + auto check_query = [&]() { + { + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "k0", &value)); + ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption()); + ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); + } + + { + std::unique_ptr iter; + iter.reset(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("k0", iter->key()); + iter->Next(); + ASSERT_TRUE(iter->status().IsCorruption()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("k2", iter->key()); + iter->Prev(); + ASSERT_TRUE(iter->status().IsCorruption()); + + iter->Seek("k2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("k2", iter->key()); + } + + std::vector values(kNumOperands); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = kNumOperands; + int num_operands_found = 0; + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &num_operands_found)); + ASSERT_EQ(kNumOperands, num_operands_found); + for (int j = 0; j < num_operands_found; ++j) { + if (i == j) { + ASSERT_EQ(values[j], "corrupted_must_merge"); + } else { + ASSERT_EQ(values[j], "ok"); + } + } + }; + + ASSERT_OK(Put("k0", "val")); + ASSERT_OK(Put("k2", "val")); + + // Case 1 + for (int j = 0; j < kNumOperands; ++j) { + if (j == i) { + ASSERT_OK(Merge("k1", "corrupted_must_merge")); + } else { + ASSERT_OK(Merge("k1", "ok")); + } + } + check_query(); + ASSERT_OK(Flush()); + check_query(); + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + check_query(); + + // Case 2 + for (int j = 0; j < kNumOperands; ++j) { + Slice val; + if (j == i) { + val = "corrupted_must_merge"; + } else { + val = "ok"; + } + if (j == 0) { + ASSERT_OK(Put("k1", val)); + } else { + ASSERT_OK(Merge("k1", val)); + } + } + check_query(); + ASSERT_OK(Flush()); + check_query(); + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + check_query(); + + // Case 3 + ASSERT_OK(Delete("k1")); + for (int j = 0; j < kNumOperands; ++j) { + if (i == j) { + ASSERT_OK(Merge("k1", "corrupted_must_merge")); + } else { + ASSERT_OK(Merge("k1", "ok")); + } + } + check_query(); + ASSERT_OK(Flush()); + check_query(); + { + CompactRangeOptions cro; + cro.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + check_query(); + + // Verify obsolete data removal still happens + ASSERT_OK(Delete("k0")); + ASSERT_OK(Delete("k1")); + ASSERT_OK(Delete("k2")); + ASSERT_EQ("NOT_FOUND", Get("k0")); + ASSERT_EQ("NOT_FOUND", Get("k1")); + 
ASSERT_EQ("NOT_FOUND", Get("k2")); + CompactRangeOptions cro; + cro.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("", FilesPerLevel()); + } +} + +#endif // ROCKSDB_LITE class MergeOperatorPinningTest : public DBMergeOperatorTest, public testing::WithParamInterface { @@ -471,7 +624,7 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { EXPECT_FALSE(stepped_to_next_operand); stepped_to_next_operand = true; - someone_else.reset(); // Unpin SuperVersion A + someone_else.reset(); // Unpin SuperVersion A }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 8ec184757eb..691081db9db 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -402,7 +402,7 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) { // Do not flush. If we flush here, SwitchWAL will reuse old WAL file since its // empty and will not get the new wal_bytes_per_sync value. low_bytes_per_sync = counter; - //5242880 = 1024 * 1024 * 5 + // 5242880 = 1024 * 1024 * 5 ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}})); ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync); counter = 0; @@ -604,7 +604,7 @@ TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) { TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) { Options options; options.create_if_missing = true; - options.max_background_compactions = 1; // default value + options.max_background_compactions = 1; // default value options.env = env_; Reopen(options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); @@ -627,7 +627,6 @@ TEST_F(DBOptionsTest, SetBackgroundFlushThreads) { ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed()); } - TEST_F(DBOptionsTest, SetBackgroundJobs) { Options options; options.create_if_missing = true; @@ -691,7 +690,8 @@ TEST_F(DBOptionsTest, SetDelayedWriteRateOption) { options.delayed_write_rate = 2 * 1024U * 1024U; options.env = env_; Reopen(options); - ASSERT_EQ(2 * 1024U * 1024U, dbfull()->TEST_write_controler().max_delayed_write_rate()); + ASSERT_EQ(2 * 1024U * 1024U, + dbfull()->TEST_write_controler().max_delayed_write_rate()); ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}})); ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate()); diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 5d209593cd1..cbc55c97201 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -270,7 +270,8 @@ void GetExpectedTableProperties( const int kDeletionCount = kTableCount * kDeletionsPerTable; const int kMergeCount = kTableCount * kMergeOperandsPerTable; const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; - const int kKeyCount = kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; + const int kKeyCount = + kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; const int kAvgSuccessorSize = kKeySize / 5; const int kEncodingSavePerKey = kKeySize / 4; expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); @@ -281,7 +282,8 @@ void GetExpectedTableProperties( expected_tp->num_merge_operands = kMergeCount; expected_tp->num_range_deletions = kRangeDeletionCount; expected_tp->num_data_blocks = - kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kTableCount * + (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / kBlockSize; 
expected_tp->data_size = kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); @@ -1120,7 +1122,8 @@ class CountingUserTblPropCollector : public TablePropertiesCollector { std::string encoded; PutVarint32(&encoded, count_); *properties = UserCollectedProperties{ - {"CountingUserTblPropCollector", message_}, {"Count", encoded}, + {"CountingUserTblPropCollector", message_}, + {"Count", encoded}, }; return Status::OK(); } @@ -1845,8 +1848,8 @@ TEST_F(DBPropertiesTest, BlobCacheProperties) { // Insert unpinned blob to the cache and check size. constexpr size_t kSize1 = 70; - ASSERT_OK(blob_cache->Insert("blob1", nullptr /*value*/, kSize1, - nullptr /*deleter*/)); + ASSERT_OK(blob_cache->Insert("blob1", nullptr /*value*/, + &kNoopCacheItemHelper, kSize1)); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value)); @@ -1858,8 +1861,8 @@ TEST_F(DBPropertiesTest, BlobCacheProperties) { // Insert pinned blob to the cache and check size. constexpr size_t kSize2 = 60; Cache::Handle* blob2 = nullptr; - ASSERT_OK(blob_cache->Insert("blob2", nullptr /*value*/, kSize2, - nullptr /*deleter*/, &blob2)); + ASSERT_OK(blob_cache->Insert("blob2", nullptr /*value*/, + &kNoopCacheItemHelper, kSize2, &blob2)); ASSERT_NE(nullptr, blob2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1873,8 +1876,8 @@ TEST_F(DBPropertiesTest, BlobCacheProperties) { // Insert another pinned blob to make the cache over-sized. constexpr size_t kSize3 = 80; Cache::Handle* blob3 = nullptr; - ASSERT_OK(blob_cache->Insert("blob3", nullptr /*value*/, kSize3, - nullptr /*deleter*/, &blob3)); + ASSERT_OK(blob_cache->Insert("blob3", nullptr /*value*/, + &kNoopCacheItemHelper, kSize3, &blob3)); ASSERT_NE(nullptr, blob3); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1953,8 +1956,8 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { // Insert unpinned item to the cache and check size. constexpr size_t kSize1 = 50; - ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1, - nullptr /*deleter*/)); + ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, + &kNoopCacheItemHelper, kSize1)); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value)); @@ -1966,8 +1969,8 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { // Insert pinned item to the cache and check size. constexpr size_t kSize2 = 30; Cache::Handle* item2 = nullptr; - ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2, - nullptr /*deleter*/, &item2)); + ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, + &kNoopCacheItemHelper, kSize2, &item2)); ASSERT_NE(nullptr, item2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -1980,8 +1983,8 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { // Insert another pinned item to make the cache over-sized. 
constexpr size_t kSize3 = 80; Cache::Handle* item3 = nullptr; - ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3, - nullptr /*deleter*/, &item3)); + ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, + &kNoopCacheItemHelper, kSize3, &item3)); ASSERT_NE(nullptr, item2); ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); ASSERT_EQ(kCapacity, value); @@ -2122,7 +2125,7 @@ std::string PopMetaIndexKey(InternalIterator* meta_iter) { } } -} // namespace +} // anonymous namespace TEST_F(DBPropertiesTest, TableMetaIndexKeys) { // This is to detect unexpected churn in metaindex block keys. This is more diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 7ba1280da3a..bfabc42fb67 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -238,7 +238,8 @@ TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) { const Snapshot* snapshot = db_->GetSnapshot(); // gaps between ranges creates sentinels in our internal representation - std::vector> range_dels = {{"a", "b"}, {"c", "d"}, {"e", "f"}}; + std::vector> range_dels = { + {"a", "b"}, {"c", "d"}, {"e", "f"}}; for (const auto& range_del : range_dels) { ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), range_del.first, range_del.second)); @@ -478,7 +479,10 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { std::vector values; // Write 100KB (100 values, each 1K) for (int k = 0; k < kNumPerFile; k++) { - values.push_back(rnd.RandomString(990)); + // For the highest level, use smaller value size such that it does not + // prematurely cause auto compaction due to range tombstone adding + // additional compensated file size + values.push_back(rnd.RandomString((i == kNumLevels - 2) ? 600 : 990)); ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k])); } // put extra key to trigger flush @@ -491,7 +495,13 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1); + if (i == kNumLevels - 2) { + // For the highest level, value size is smaller (see Put() above), + // so output file number is smaller. 
+ ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 2); + } else { + ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1); + } } // Now L1-L3 are full, when we compact L1->L2 we should see (1) subcompactions // happen since input level > 0; (2) range deletions are not dropped since @@ -567,8 +577,8 @@ TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { std::string val; PutFixed64(&val, 1); ASSERT_OK(db_->Put(WriteOptions(), "key", val)); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - "key", "key_")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", + "key_")); ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); ASSERT_OK(db_->Flush(FlushOptions())); @@ -1332,7 +1342,7 @@ TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) { const int kFileBytes = 1 << 20; const int kValueBytes = 1 << 10; const int kNumFiles = 4; - const int kMaxKey = kNumFiles* kFileBytes / kValueBytes; + const int kMaxKey = kNumFiles * kFileBytes / kValueBytes; const int kKeysOverwritten = 10; Options options = CurrentOptions(); @@ -1649,7 +1659,8 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { const auto& table_props = name_and_table_props.second; // The range tombstone should only be output to the second L1 SST. if (name.size() >= l1_metadata[1].name.size() && - name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) { + name.substr(name.size() - l1_metadata[1].name.size()) + .compare(l1_metadata[1].name) == 0) { ASSERT_EQ(1, table_props->num_range_deletions); ++num_range_deletions; } else { @@ -2794,6 +2805,225 @@ TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) { ASSERT_OK(iter->status()); } +TEST_F(DBRangeDelTest, RangetombesoneCompensateFilesize) { + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + + std::vector values; + Random rnd(301); + // file in L2 + values.push_back(rnd.RandomString(1 << 10)); + ASSERT_OK(Put("a", values.back())); + values.push_back(rnd.RandomString(1 << 10)); + ASSERT_OK(Put("b", values.back())); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + uint64_t l2_size = 0; + ASSERT_OK(Size("a", "c", 0 /* cf */, &l2_size)); + ASSERT_GT(l2_size, 0); + // file in L1 + values.push_back(rnd.RandomString(1 << 10)); + ASSERT_OK(Put("d", values.back())); + values.push_back(rnd.RandomString(1 << 10)); + ASSERT_OK(Put("e", values.back())); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + uint64_t l1_size = 0; + ASSERT_OK(Size("d", "f", 0 /* cf */, &l1_size)); + ASSERT_GT(l1_size, 0); + + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f")); + ASSERT_OK(Flush()); + // Range deletion compensated size computed during flush time + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_EQ(level_to_files[0].size(), 1); + ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size, + l1_size + l2_size); + ASSERT_EQ(level_to_files[1].size(), 1); + ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, 0); + ASSERT_EQ(level_to_files[2].size(), 1); + ASSERT_EQ(level_to_files[2][0].compensated_range_deletion_size, 0); + + // Range deletion compensated size computed during compaction time + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + 
ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+  ASSERT_EQ(level_to_files[2].size(), 1);
+  ASSERT_EQ(level_to_files[2][0].compensated_range_deletion_size, 0);
+}
+
+TEST_F(DBRangeDelTest, RangetombesoneCompensateFilesizePersistDuringReopen) {
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  DestroyAndReopen(opts);
+
+  std::vector<std::string> values;
+  Random rnd(301);
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("a", values.back()));
+  values.push_back(rnd.RandomString(1 << 10));
+  ASSERT_OK(Put("b", values.back()));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::vector<FileMetaData>> level_to_files;
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[0].size(), 1);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[2].size(), 1);
+  uint64_t l2_size = level_to_files[2][0].fd.GetFileSize();
+  uint64_t l1_size = level_to_files[1][0].fd.GetFileSize();
+  ASSERT_GT(l2_size, 0);
+  ASSERT_GT(l1_size, 0);
+  ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size,
+            l1_size + l2_size);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+
+  Reopen(opts);
+  dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+                                  &level_to_files);
+  ASSERT_EQ(level_to_files[0].size(), 1);
+  ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size,
+            l1_size + l2_size);
+  ASSERT_EQ(level_to_files[1].size(), 1);
+  ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size);
+}
+
+TEST_F(DBRangeDelTest, SingleKeyFile) {
+  // Test for a bug fix where a range tombstone could be added
+  // to an SST file even though it is not within the file's key range.
+  // Create 3 files in L0 and then L1 where all keys have the same user key
+  // `Key(2)`. The middle file will contain Key(2)@6 and Key(2)@5. Before fix,
+  // the range tombstone [Key(2), Key(5))@2 would be added to this file during
+  // compaction, but it is not in this file's key range.
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.target_file_size_base = 1 << 10;
+  opts.level_compaction_dynamic_file_size = false;
+  DestroyAndReopen(opts);
+
+  // prevent range tombstone drop
+  std::vector<const Snapshot*> snapshots;
+  snapshots.push_back(db_->GetSnapshot());
+
+  // Write a key to the bottommost file so the compactions below
+  // are not bottommost compactions and will calculate
+  // compensated range tombstone size. Before the bug fix, an assert would fail
+  // during this process.
+ Random rnd(301); + ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2), + Key(5))); + snapshots.push_back(db_->GetSnapshot()); + std::vector values; + + values.push_back(rnd.RandomString(8 << 10)); + ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10))); + snapshots.push_back(db_->GetSnapshot()); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10))); + snapshots.push_back(db_->GetSnapshot()); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(2), rnd.RandomString(8 << 10))); + snapshots.push_back(db_->GetSnapshot()); + ASSERT_OK(Flush()); + + ASSERT_EQ(NumTableFilesAtLevel(0), 3); + CompactRangeOptions co; + co.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(dbfull()->RunManualCompaction( + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(), + 0, 1, co, nullptr, nullptr, true, true, + std::numeric_limits::max() /*max_file_num_to_ignore*/, + "" /*trim_ts*/)); + + for (const auto s : snapshots) { + db_->ReleaseSnapshot(s); + } +} + +TEST_F(DBRangeDelTest, DoubleCountRangeTombstoneCompensatedSize) { + // Test for a bug fix if a file has multiple range tombstones + // with same start and end key but with different sequence numbers, + // we should only calculate compensated range tombstone size + // for one of them. + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + DestroyAndReopen(opts); + + std::vector values; + Random rnd(301); + // file in L2 + ASSERT_OK(Put(Key(1), rnd.RandomString(1 << 10))); + ASSERT_OK(Put(Key(2), rnd.RandomString(1 << 10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + uint64_t l2_size = 0; + ASSERT_OK(Size(Key(1), Key(3), 0 /* cf */, &l2_size)); + ASSERT_GT(l2_size, 0); + + // file in L1 + ASSERT_OK(Put(Key(3), rnd.RandomString(1 << 10))); + ASSERT_OK(Put(Key(4), rnd.RandomString(1 << 10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + uint64_t l1_size = 0; + ASSERT_OK(Size(Key(3), Key(5), 0 /* cf */, &l1_size)); + ASSERT_GT(l1_size, 0); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1), + Key(5))); + // so that the range tombstone above is not dropped + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1), + Key(5))); + ASSERT_OK(Flush()); + // Range deletion compensated size computed during flush time + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_EQ(level_to_files[0].size(), 1); + // instead of 2 * (l1_size + l2_size) + ASSERT_EQ(level_to_files[0][0].compensated_range_deletion_size, + l1_size + l2_size); + + // Range deletion compensated size computed during compaction time + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_EQ(level_to_files[1].size(), 1); + ASSERT_EQ(level_to_files[1][0].compensated_range_deletion_size, l2_size); + db_->ReleaseSnapshot(snapshot); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 930ff468b25..20d7534e057 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -499,7 +499,7 @@ class TraceFileEnv : public EnvWrapper { private: std::atomic files_closed_{0}; }; -} // namespace +} // anonymous namespace 
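The compensated_range_deletion_size these tests assert on is internal FileMetaData state, reached through the test-only TEST_GetFilesMetaData hook. A minimal sketch of watching the related per-level file layout through the public metadata API instead (the function name here is an invented example):

#include <cinttypes>
#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

// Assumes `db` points at an open DB; prints file count and byte size per
// level. The compensated counter itself is not exposed by this API.
void DumpLevelShape(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
  db->GetColumnFamilyMetaData(&meta);
  for (const auto& level : meta.levels) {
    std::printf("L%d: %zu files, %" PRIu64 " bytes\n", level.level,
                level.files.size(), level.size);
  }
}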
TEST_F(DBSecondaryTest, SecondaryCloseFiles) { Options options; diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index 91ae972cb38..4d46553611c 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -70,9 +70,9 @@ TEST_F(DBStatisticsTest, CompressionStatsTest) { options.compression = kNoCompression; DestroyAndReopen(options); uint64_t currentCompressions = - options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); + options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); uint64_t currentDecompressions = - options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED); + options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED); // Check that compressions do not occur when turned off for (int i = 0; i < kNumKeysWritten; ++i) { @@ -80,14 +80,16 @@ TEST_F(DBStatisticsTest, CompressionStatsTest) { ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); } ASSERT_OK(Flush()); - ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) - - currentCompressions, 0); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) - + currentCompressions, + 0); for (int i = 0; i < kNumKeysWritten; ++i) { auto r = Get(Key(i)); } - ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) - - currentDecompressions, 0); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) - + currentDecompressions, + 0); } TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) { diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index eb8c9a6030d..981a514ad8f 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -52,7 +52,7 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { VerifySstUniqueIds(props); } -} // namespace +} // anonymous namespace class DBTablePropertiesTest : public DBTestBase, public testing::WithParamInterface { @@ -240,7 +240,6 @@ TablePropertiesCollection DBTablePropertiesTest::TestGetPropertiesOfTablesInRange( std::vector ranges, std::size_t* num_properties, std::size_t* num_files) { - // Since we deref zero element in the vector it can not be empty // otherwise we pass an address to some random memory EXPECT_GT(ranges.size(), 0U); @@ -469,12 +468,12 @@ INSTANTIATE_TEST_CASE_P( class DeletionTriggeredCompactionTestListener : public EventListener { public: - void OnCompactionBegin(DB* , const CompactionJobInfo& ci) override { + void OnCompactionBegin(DB*, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.compaction_reason, CompactionReason::kFilesMarkedForCompaction); } - void OnCompactionCompleted(DB* , const CompactionJobInfo& ci) override { + void OnCompactionCompleted(DB*, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.compaction_reason, CompactionReason::kFilesMarkedForCompaction); } @@ -485,13 +484,13 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { int kWindowSize = 100; int kNumDelsTrigger = 90; std::shared_ptr compact_on_del = - NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger); Options opts = CurrentOptions(); opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); opts.table_properties_collector_factories.emplace_back(compact_on_del); - if(GetParam() == "kCompactionStyleUniversal") { + if (GetParam() == "kCompactionStyleUniversal") { opts.compaction_style = kCompactionStyleUniversal; } Reopen(opts); @@ -502,8 +501,8 @@ TEST_P(DBTablePropertiesTest, 
DeletionTriggeredCompactionMarking) { ASSERT_OK(Flush()); MoveFilesToLevel(1); - DeletionTriggeredCompactionTestListener *listener = - new DeletionTriggeredCompactionTestListener(); + DeletionTriggeredCompactionTestListener* listener = + new DeletionTriggeredCompactionTestListener(); opts.listeners.emplace_back(listener); Reopen(opts); @@ -524,10 +523,10 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { // effect kWindowSize = 50; kNumDelsTrigger = 40; - static_cast - (compact_on_del.get())->SetWindowSize(kWindowSize); - static_cast - (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger); + static_cast(compact_on_del.get()) + ->SetWindowSize(kWindowSize); + static_cast(compact_on_del.get()) + ->SetDeletionTrigger(kNumDelsTrigger); for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { @@ -543,10 +542,10 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { // Change the window size to disable delete triggered compaction kWindowSize = 0; - static_cast - (compact_on_del.get())->SetWindowSize(kWindowSize); - static_cast - (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger); + static_cast(compact_on_del.get()) + ->SetWindowSize(kWindowSize); + static_cast(compact_on_del.get()) + ->SetDeletionTrigger(kNumDelsTrigger); for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { @@ -611,13 +610,9 @@ TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { } } -INSTANTIATE_TEST_CASE_P( - DBTablePropertiesTest, - DBTablePropertiesTest, - ::testing::Values( - "kCompactionStyleLevel", - "kCompactionStyleUniversal" - )); +INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest, + ::testing::Values("kCompactionStyleLevel", + "kCompactionStyleUniversal")); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index 16aeee9ebe4..af3194ac4b0 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -399,7 +399,7 @@ TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) { // Write rows with keys 00000, 00002, 00004 etc. 
for (int i = 0; i < NROWS; ++i) { char buf[100]; - snprintf(buf, sizeof(buf), "%05d", 2*i); + snprintf(buf, sizeof(buf), "%05d", 2 * i); std::string key(buf); std::string value("value"); ASSERT_OK(db_->Put(WriteOptions(), key, value)); @@ -539,7 +539,6 @@ TEST_P(DBTestTailingIterator, SeekWithUpperBoundBug) { const Slice upper_bound("cc", 3); read_options.iterate_upper_bound = &upper_bound; - // 1st L0 file ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN")); ASSERT_OK(Flush()); @@ -565,7 +564,6 @@ TEST_P(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) { const Slice upper_bound("cc", 3); read_options.iterate_upper_bound = &upper_bound; - // 1st L0 file ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN")); ASSERT_OK(Flush()); @@ -599,8 +597,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else - (void) argc; - (void) argv; + (void)argc; + (void)argv; return 0; #endif } diff --git a/db/db_test.cc b/db/db_test.cc index 00eafc3c222..79d56439432 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1137,7 +1137,7 @@ class DelayFilterFactory : public CompactionFilterFactory { private: DBTestBase* db_test; }; -} // namespace +} // anonymous namespace #ifndef ROCKSDB_LITE @@ -1205,6 +1205,8 @@ void CheckColumnFamilyMeta( file_meta_from_files.file_creation_time); ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_EQ(file_meta_from_cf.epoch_number, + file_meta_from_files.epoch_number); ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time); ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); // More from FileStorageInfo @@ -1255,6 +1257,7 @@ void CheckLiveFilesMeta( ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString()); ASSERT_EQ(meta.oldest_blob_file_number, expected_meta.oldest_blob_file_number); + ASSERT_EQ(meta.epoch_number, expected_meta.epoch_number); // More from FileStorageInfo ASSERT_EQ(meta.file_type, kTableFile); @@ -1492,7 +1495,7 @@ bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, } return true; } -} // namespace +} // anonymous namespace TEST_F(DBTest, MinLevelToCompress1) { Options options = CurrentOptions(); @@ -2845,7 +2848,7 @@ static void MTThreadBody(void* arg) { fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); } -} // namespace +} // anonymous namespace class MultiThreadedDBTest : public DBTest, @@ -2931,7 +2934,7 @@ static void GCThreadBody(void* arg) { t->done = true; } -} // namespace +} // anonymous namespace TEST_F(DBTest, GroupCommitTest) { do { @@ -3077,6 +3080,11 @@ class ModelDB : public DB { } return Write(o, &batch); } + Status Merge(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/, + const Slice& /*k*/, const Slice& /*ts*/, + const Slice& /*value*/) override { + return Status::NotSupported(); + } using DB::Get; Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/, const Slice& key, PinnableSlice* /*value*/) override { @@ -4667,7 +4675,7 @@ void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type, } ASSERT_EQ(op_count, expected_count); } -} // namespace +} // anonymous namespace TEST_F(DBTest, GetThreadStatus) { Options options; diff --git a/db/db_test2.cc b/db/db_test2.cc index 6e9cf59cf83..b4f1664f47b 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -14,6 +14,7 @@ #include "db/db_test_util.h" #include "db/read_callback.h" +#include "db/version_edit.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" @@ -33,6 +34,18 @@ namespace ROCKSDB_NAMESPACE { class DBTest2 : public DBTestBase { public: DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} + std::vector GetLevelFileMetadatas(int level, int cf = 0) { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = + versions->GetColumnFamilySet()->GetColumnFamily(cf); + assert(cfd); + Version* const current = cfd->current(); + assert(current); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + return storage_info->LevelFiles(level); + } }; #ifndef ROCKSDB_LITE @@ -669,33 +682,33 @@ TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) { } namespace { - void ValidateKeyExistence(DB* db, const std::vector& keys_must_exist, - const std::vector& keys_must_not_exist) { - // Ensure that expected keys exist - std::vector values; - if (keys_must_exist.size() > 0) { - std::vector status_list = +void ValidateKeyExistence(DB* db, const std::vector& keys_must_exist, + const std::vector& keys_must_not_exist) { + // Ensure that expected keys exist + std::vector values; + if (keys_must_exist.size() > 0) { + std::vector status_list = db->MultiGet(ReadOptions(), keys_must_exist, &values); - for (size_t i = 0; i < keys_must_exist.size(); i++) { - ASSERT_OK(status_list[i]); - } + for (size_t i = 0; i < keys_must_exist.size(); i++) { + ASSERT_OK(status_list[i]); } + } - // Ensure that given keys don't exist - if (keys_must_not_exist.size() > 0) { - std::vector status_list = + // Ensure that given keys don't exist + if (keys_must_not_exist.size() > 0) { + std::vector status_list = db->MultiGet(ReadOptions(), keys_must_not_exist, &values); - for (size_t i = 0; i < keys_must_not_exist.size(); i++) { - ASSERT_TRUE(status_list[i].IsNotFound()); - } + for (size_t i = 0; i < keys_must_not_exist.size(); i++) { + ASSERT_TRUE(status_list[i].IsNotFound()); } } +} -} // namespace +} // anonymous namespace TEST_F(DBTest2, WalFilterTest) { class TestWalFilter : public WalFilter { - private: + private: // Processing option that is requested to be applied at the given index WalFilter::WalProcessingOption wal_processing_option_; // Index at which to apply wal_processing_option_ @@ -705,12 +718,12 @@ TEST_F(DBTest2, WalFilterTest) { // Current 
record index, incremented with each record encountered. size_t current_record_index_; - public: + public: TestWalFilter(WalFilter::WalProcessingOption wal_processing_option, - size_t apply_option_for_record_index) - : wal_processing_option_(wal_processing_option), - apply_option_at_record_index_(apply_option_for_record_index), - current_record_index_(0) {} + size_t apply_option_for_record_index) + : wal_processing_option_(wal_processing_option), + apply_option_at_record_index_(apply_option_for_record_index), + current_record_index_(0) {} WalProcessingOption LogRecord(const WriteBatch& /*batch*/, WriteBatch* /*new_batch*/, @@ -719,8 +732,7 @@ TEST_F(DBTest2, WalFilterTest) { if (current_record_index_ == apply_option_at_record_index_) { option_to_return = wal_processing_option_; - } - else { + } else { option_to_return = WalProcessingOption::kContinueProcessing; } @@ -747,12 +759,12 @@ TEST_F(DBTest2, WalFilterTest) { // Test with all WAL processing options for (int option = 0; - option < static_cast( - WalFilter::WalProcessingOption::kWalProcessingOptionMax); - option++) { + option < static_cast( + WalFilter::WalProcessingOption::kWalProcessingOptionMax); + option++) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({ "pikachu" }, options); + CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys.size(); i++) { @@ -764,28 +776,27 @@ TEST_F(DBTest2, WalFilterTest) { } WalFilter::WalProcessingOption wal_processing_option = - static_cast(option); + static_cast(option); // Create a test filter that would apply wal_processing_option at the first // record size_t apply_option_for_record_index = 1; TestWalFilter test_wal_filter(wal_processing_option, - apply_option_for_record_index); + apply_option_for_record_index); // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter; Status status = - TryReopenWithColumnFamilies({ "default", "pikachu" }, options); + TryReopenWithColumnFamilies({"default", "pikachu"}, options); if (wal_processing_option == - WalFilter::WalProcessingOption::kCorruptedRecord) { + WalFilter::WalProcessingOption::kCorruptedRecord) { ASSERT_NOK(status); // In case of corruption we can turn off paranoid_checks to reopen // databse options.paranoid_checks = false; - ReopenWithColumnFamilies({ "default", "pikachu" }, options); - } - else { + ReopenWithColumnFamilies({"default", "pikachu"}, options); + } else { ASSERT_OK(status); } @@ -794,56 +805,54 @@ TEST_F(DBTest2, WalFilterTest) { std::vector keys_must_exist; std::vector keys_must_not_exist; switch (wal_processing_option) { - case WalFilter::WalProcessingOption::kCorruptedRecord: - case WalFilter::WalProcessingOption::kContinueProcessing: { - fprintf(stderr, "Testing with complete WAL processing\n"); - // we expect all records to be processed - for (size_t i = 0; i < batch_keys.size(); i++) { - for (size_t j = 0; j < batch_keys[i].size(); j++) { - keys_must_exist.push_back(Slice(batch_keys[i][j])); - } - } - break; - } - case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: { - fprintf(stderr, - "Testing with ignoring record %" ROCKSDB_PRIszt " only\n", - apply_option_for_record_index); - // We expect the record with apply_option_for_record_index to be not - // found. 
- for (size_t i = 0; i < batch_keys.size(); i++) { - for (size_t j = 0; j < batch_keys[i].size(); j++) { - if (i == apply_option_for_record_index) { - keys_must_not_exist.push_back(Slice(batch_keys[i][j])); - } - else { + case WalFilter::WalProcessingOption::kCorruptedRecord: + case WalFilter::WalProcessingOption::kContinueProcessing: { + fprintf(stderr, "Testing with complete WAL processing\n"); + // we expect all records to be processed + for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { keys_must_exist.push_back(Slice(batch_keys[i][j])); } } + break; } - break; - } - case WalFilter::WalProcessingOption::kStopReplay: { - fprintf(stderr, - "Testing with stopping replay from record %" ROCKSDB_PRIszt - "\n", - apply_option_for_record_index); - // We expect records beyond apply_option_for_record_index to be not - // found. - for (size_t i = 0; i < batch_keys.size(); i++) { - for (size_t j = 0; j < batch_keys[i].size(); j++) { - if (i >= apply_option_for_record_index) { - keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: { + fprintf(stderr, + "Testing with ignoring record %" ROCKSDB_PRIszt " only\n", + apply_option_for_record_index); + // We expect the record with apply_option_for_record_index to be not + // found. + for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + if (i == apply_option_for_record_index) { + keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + } else { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } } - else { - keys_must_exist.push_back(Slice(batch_keys[i][j])); + } + break; + } + case WalFilter::WalProcessingOption::kStopReplay: { + fprintf(stderr, + "Testing with stopping replay from record %" ROCKSDB_PRIszt + "\n", + apply_option_for_record_index); + // We expect records beyond apply_option_for_record_index to be not + // found. 
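+ // (Replay stops at the record that triggered kStopReplay, so the record at + // apply_option_for_record_index is dropped as well, as the check below + // confirms.)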
+ for (size_t i = 0; i < batch_keys.size(); i++) { + for (size_t j = 0; j < batch_keys[i].size(); j++) { + if (i >= apply_option_for_record_index) { + keys_must_not_exist.push_back(Slice(batch_keys[i][j])); + } else { + keys_must_exist.push_back(Slice(batch_keys[i][j])); + } } } + break; } - break; - } - default: - FAIL(); // unhandled case + default: + FAIL(); // unhandled case } bool checked_after_reopen = false; @@ -861,7 +870,7 @@ TEST_F(DBTest2, WalFilterTest) { //(even if they were skipped) // reopen database with option to use WAL filter options = OptionsForLogIterTest(); - ReopenWithColumnFamilies({ "default", "pikachu" }, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); checked_after_reopen = true; } @@ -870,7 +879,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { class ChangeBatchHandler : public WriteBatch::Handler { - private: + private: // Batch to insert keys in WriteBatch* new_write_batch_; // Number of keys to add in the new batch @@ -878,12 +887,12 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { // Number of keys added to new batch size_t num_keys_added_; - public: + public: ChangeBatchHandler(WriteBatch* new_write_batch, - size_t num_keys_to_add_in_new_batch) - : new_write_batch_(new_write_batch), - num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), - num_keys_added_(0) {} + size_t num_keys_to_add_in_new_batch) + : new_write_batch_(new_write_batch), + num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), + num_keys_added_(0) {} void Put(const Slice& key, const Slice& value) override { if (num_keys_added_ < num_keys_to_add_in_new_batch_) { ASSERT_OK(new_write_batch_->Put(key, value)); @@ -893,7 +902,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { }; class TestWalFilterWithChangeBatch : public WalFilter { - private: + private: // Index at which to start changing records size_t change_records_from_index_; // Number of keys to add in the new batch @@ -901,12 +910,12 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { // Current record index, incremented with each record encountered. size_t current_record_index_; - public: + public: TestWalFilterWithChangeBatch(size_t change_records_from_index, - size_t num_keys_to_add_in_new_batch) - : change_records_from_index_(change_records_from_index), - num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), - current_record_index_(0) {} + size_t num_keys_to_add_in_new_batch) + : change_records_from_index_(change_records_from_index), + num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), + current_record_index_(0) {} WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, @@ -925,7 +934,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { // object, however we modify it for our own purpose here and hence // cast the constness away.
(const_cast<TestWalFilterWithChangeBatch*>(this) - ->current_record_index_)++; + ->current_record_index_)++; return WalProcessingOption::kContinueProcessing; } @@ -944,7 +953,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({ "pikachu" }, options); + CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys.size(); i++) { @@ -960,12 +969,12 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { size_t change_records_from_index = 1; size_t num_keys_to_add_in_new_batch = 1; TestWalFilterWithChangeBatch test_wal_filter_with_change_batch( - change_records_from_index, num_keys_to_add_in_new_batch); + change_records_from_index, num_keys_to_add_in_new_batch); // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter_with_change_batch; - ReopenWithColumnFamilies({ "default", "pikachu" }, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); // Ensure that all keys exist before change_records_from_index_ // And after that index only single key exists @@ -977,8 +986,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) { keys_must_not_exist.push_back(Slice(batch_keys[i][j])); - } - else { + } else { keys_must_exist.push_back(Slice(batch_keys[i][j])); } } @@ -999,7 +1007,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { //(even if they were skipped) // reopen database with option to use WAL filter options = OptionsForLogIterTest(); - ReopenWithColumnFamilies({ "default", "pikachu" }, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); checked_after_reopen = true; } @@ -1007,22 +1015,23 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter { - public: - WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, - bool* batch_changed) const override { - *new_batch = batch; - Status s = new_batch->Put("key_extra", "value_extra"); - if (s.ok()) { - *batch_changed = true; - } else { - assert(false); - } - return WalProcessingOption::kContinueProcessing; - } - - const char* Name() const override { - return "WalFilterTestWithChangeBatchExtraKeys"; - } + public: + WalProcessingOption LogRecord(const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) const override { + *new_batch = batch; + Status s = new_batch->Put("key_extra", "value_extra"); + if (s.ok()) { + *batch_changed = true; + } else { + assert(false); + } + return WalProcessingOption::kContinueProcessing; + } + + const char* Name() const override { + return "WalFilterTestWithChangeBatchExtraKeys"; + } }; std::vector<std::vector<std::string>> batch_keys(3); @@ -1036,7 +1045,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({ "pikachu" }, options); + CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys.size(); i++) { @@ -1059,7 +1068,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { // Reopen without filter, now reopen should succeed - previous // attempt to open must not have altered the db.
options = OptionsForLogIterTest(); - ReopenWithColumnFamilies({ "default", "pikachu" }, options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); std::vector keys_must_exist; std::vector keys_must_not_exist; // empty vector @@ -1075,7 +1084,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { class TestWalFilterWithColumnFamilies : public WalFilter { - private: + private: // column_family_id -> log_number map (provided to WALFilter) std::map cf_log_number_map_; // column_family_name -> column_family_id map (provided to WALFilter) @@ -1085,31 +1094,34 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { // during recovery (i.e. aren't already flushed to SST file(s)) // for verification against the keys we expect. std::map> cf_wal_keys_; - public: - void ColumnFamilyLogNumberMap( - const std::map& cf_lognumber_map, - const std::map& cf_name_id_map) override { - cf_log_number_map_ = cf_lognumber_map; - cf_name_id_map_ = cf_name_id_map; - } - - WalProcessingOption LogRecordFound(unsigned long long log_number, - const std::string& /*log_file_name*/, - const WriteBatch& batch, - WriteBatch* /*new_batch*/, - bool* /*batch_changed*/) override { - class LogRecordBatchHandler : public WriteBatch::Handler { - private: - const std::map & cf_log_number_map_; - std::map> & cf_wal_keys_; + + public: + void ColumnFamilyLogNumberMap( + const std::map& cf_lognumber_map, + const std::map& cf_name_id_map) override { + cf_log_number_map_ = cf_lognumber_map; + cf_name_id_map_ = cf_name_id_map; + } + + WalProcessingOption LogRecordFound(unsigned long long log_number, + const std::string& /*log_file_name*/, + const WriteBatch& batch, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) override { + class LogRecordBatchHandler : public WriteBatch::Handler { + private: + const std::map& cf_log_number_map_; + std::map>& cf_wal_keys_; unsigned long long log_number_; - public: - LogRecordBatchHandler(unsigned long long current_log_number, - const std::map & cf_log_number_map, - std::map> & cf_wal_keys) : - cf_log_number_map_(cf_log_number_map), - cf_wal_keys_(cf_wal_keys), - log_number_(current_log_number){} + + public: + LogRecordBatchHandler( + unsigned long long current_log_number, + const std::map& cf_log_number_map, + std::map>& cf_wal_keys) + : cf_log_number_map_(cf_log_number_map), + cf_wal_keys_(cf_wal_keys), + log_number_(current_log_number) {} Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& /*value*/) override { @@ -1120,8 +1132,8 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { // (i.e. isn't flushed to SST file(s) for column_family_id) // add it to the cf_wal_keys_ map for verification. 
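// (cf_log_number_map_ gives, for each column family, the oldest WAL that // still contains unflushed data for it; records from older WALs are already // covered by flushed SST files, which is what the comparison below checks.)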
if (log_number_ >= log_number_for_cf) { - cf_wal_keys_[column_family_id].push_back(std::string(key.data(), - key.size())); + cf_wal_keys_[column_family_id].push_back( + std::string(key.data(), key.size())); } return Status::OK(); } @@ -1134,17 +1146,17 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { } return WalProcessingOption::kContinueProcessing; - } + } - const char* Name() const override { - return "WalFilterTestWithColumnFamilies"; - } + const char* Name() const override { + return "WalFilterTestWithColumnFamilies"; + } const std::map>& GetColumnFamilyKeys() { return cf_wal_keys_; } - const std::map & GetColumnFamilyNameIdMap() { + const std::map& GetColumnFamilyNameIdMap() { return cf_name_id_map_; } }; @@ -1160,7 +1172,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - CreateAndReopenWithCF({ "pikachu" }, options); + CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { @@ -1174,7 +1186,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } - //Flush default column-family + // Flush default column-family ASSERT_OK(db_->Flush(FlushOptions(), handles_[0])); // Do some more writes @@ -1208,8 +1220,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter_column_families; - Status status = - TryReopenWithColumnFamilies({ "default", "pikachu" }, options); + Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(status.ok()); // verify that handles_[0] only has post_flush keys @@ -1218,7 +1229,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap(); size_t index = 0; auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]]; - //default column-family, only post_flush keys are expected + // default column-family, only post_flush keys are expected for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); @@ -1230,7 +1241,7 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { index = 0; keys_cf = cf_wal_keys[name_id_map["pikachu"]]; - //pikachu column-family, all keys are expected + // pikachu column-family, all keys are expected for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); @@ -1280,7 +1291,7 @@ TEST_F(DBTest2, PresetCompressionDict) { #if LZ4_VERSION_NUMBER >= 10400 // r124+ compression_types.push_back(kLZ4Compression); compression_types.push_back(kLZ4HCCompression); -#endif // LZ4_VERSION_NUMBER >= 10400 +#endif // LZ4_VERSION_NUMBER >= 10400 if (ZSTD_Supported()) { compression_types.push_back(kZSTD); } @@ -1960,7 +1971,8 @@ TEST_F(DBTest2, CompressionOptions) { class CompactionStallTestListener : public EventListener { public: - CompactionStallTestListener() : compacting_files_cnt_(0), compacted_files_cnt_(0) {} + CompactionStallTestListener() + : compacting_files_cnt_(0), compacted_files_cnt_(0) {} void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); @@ -2039,7 +2051,8 @@ TEST_F(DBTest2, CompactionStall) { options.level0_file_num_compaction_trigger); 
ASSERT_GT(listener->compacted_files_cnt_.load(), 10 - options.level0_file_num_compaction_trigger); - ASSERT_EQ(listener->compacting_files_cnt_.load(), listener->compacted_files_cnt_.load()); + ASSERT_EQ(listener->compacting_files_cnt_.load(), + listener->compacted_files_cnt_.load()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -2664,7 +2677,7 @@ namespace { void CountSyncPoint() { TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */); } -} // namespace +} // anonymous namespace TEST_F(DBTest2, SyncPointMarker) { std::atomic sync_point_called(0); @@ -2797,7 +2810,7 @@ TEST_F(DBTest2, ReadAmpBitmap) { } } -#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented +#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { { const int kIdBufLen = 100; @@ -2899,7 +2912,6 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { size_t total_loaded_bytes_iter2 = options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES); - // Read amp is on average 100% since we read all what we loaded in memory if (k == 0) { ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2, @@ -2911,7 +2923,7 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { } } } -#endif // !OS_SOLARIS +#endif // !OS_SOLARIS #ifndef ROCKSDB_LITE TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) { @@ -4524,7 +4536,7 @@ TEST_F(DBTest2, TraceAndReplay) { ASSERT_OK(replayer->Prepare()); // Replay using 1 thread, 1x speed. ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb)); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 8); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); @@ -4550,7 +4562,7 @@ TEST_F(DBTest2, TraceAndReplay) { // Re-replay using 2 threads, 2x speed. ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb)); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 8); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); @@ -4560,7 +4572,7 @@ TEST_F(DBTest2, TraceAndReplay) { // Re-replay using 2 threads, 1/2 speed. ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb)); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 8); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); @@ -4758,7 +4770,7 @@ TEST_F(DBTest2, TraceAndManualReplay) { // end, or Prepare() was not called. 
ASSERT_TRUE(s.IsIncomplete()); ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 9); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 8); @@ -4792,7 +4804,7 @@ TEST_F(DBTest2, TraceAndManualReplay) { ASSERT_EQ("write1", value); ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value)); ASSERT_EQ("write2", value); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 1); ASSERT_EQ(res_handler.GetNumGets(), 0); ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); @@ -4817,7 +4829,7 @@ TEST_F(DBTest2, TraceAndManualReplay) { record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); ASSERT_TRUE(result == nullptr); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 0); ASSERT_EQ(res_handler.GetNumGets(), 2); ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); @@ -4846,7 +4858,7 @@ TEST_F(DBTest2, TraceAndManualReplay) { ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); ASSERT_TRUE(result == nullptr); } - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 0); ASSERT_EQ(res_handler.GetNumGets(), 0); ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations @@ -4901,7 +4913,7 @@ TEST_F(DBTest2, TraceAndManualReplay) { std::vector({"a"}), fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); ASSERT_TRUE(result == nullptr); - ASSERT_GT(res_handler.GetAvgLatency(), 0.0); + ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 0); ASSERT_EQ(res_handler.GetNumGets(), 0); ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); @@ -5192,7 +5204,7 @@ TEST_F(DBTest2, TraceWithFilter) { ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); handles.clear(); - DB* db3 = nullptr; + DB* db3 = nullptr; ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); @@ -5200,12 +5212,12 @@ TEST_F(DBTest2, TraceWithFilter) { ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound()); - //The tracer will not record the READ ops. + // The tracer will not record the READ ops. 
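// (kTraceFilterGet tells the tracer to skip Get() operations, so only the // writes below will appear in this trace.)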
trace_opts.filter = TraceFilterType::kTraceFilterGet; std::string trace_filename3 = dbname_ + "/rocksdb.trace_3"; std::unique_ptr trace_writer3; ASSERT_OK( - NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3)); + NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3)); ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3))); ASSERT_OK(db3->Put(wo, handles[0], "a", "1")); @@ -5227,7 +5239,7 @@ TEST_F(DBTest2, TraceWithFilter) { std::unique_ptr trace_reader3; ASSERT_OK( - NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3)); + NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3)); // Count the number of records in the trace file; int count = 0; @@ -5503,16 +5515,20 @@ TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { port::Thread user_thread1([&]() { auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); - TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1"); - TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"); + TEST_SYNC_POINT( + "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1"); + TEST_SYNC_POINT( + "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); }); port::Thread user_thread2([&]() { - TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"); + TEST_SYNC_POINT( + "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"); auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[1]->GetID()); - TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2"); + TEST_SYNC_POINT( + "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2"); ASSERT_EQ(cfh->GetID(), handles_[1]->GetID()); }); @@ -5666,7 +5682,7 @@ class DummyOldStats : public Statistics { std::atomic num_rt{0}; std::atomic num_mt{0}; }; -} // namespace +} // anonymous namespace TEST_F(DBTest2, OldStatsInterface) { DummyOldStats* dos = new DummyOldStats(); @@ -7322,6 +7338,218 @@ TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { ReopenWithColumnFamilies({"default", "test1", "test2"}, options); } +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, SortL0FilesByEpochNumber) { + Options options = CurrentOptions(); + options.num_levels = 1; + options.compaction_style = kCompactionStyleUniversal; + DestroyAndReopen(options); + + // Set up L0 files to be sorted by their epoch_number + ASSERT_OK(Put("key1", "seq1")); + + SstFileWriter sst_file_writer{EnvOptions(), options}; + std::string external_file1 = dbname_ + "/test_files1.sst"; + std::string external_file2 = dbname_ + "/test_files2.sst"; + ASSERT_OK(sst_file_writer.Open(external_file1)); + ASSERT_OK(sst_file_writer.Put("key2", "seq0")); + ASSERT_OK(sst_file_writer.Finish()); + ASSERT_OK(sst_file_writer.Open(external_file2)); + ASSERT_OK(sst_file_writer.Put("key3", "seq0")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK(Put("key4", "seq2")); + ASSERT_OK(Flush()); + + auto* handle = db_->DefaultColumnFamily(); + ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file2}, + IngestExternalFileOptions())); + + // To verify L0 files are sorted by epoch_number in descending order + // instead of largest_seqno + std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 3); + + EXPECT_EQ(level0_files[0]->epoch_number, 3); + 
EXPECT_EQ(level0_files[0]->fd.largest_seqno, 0); + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key3")); + + EXPECT_EQ(level0_files[1]->epoch_number, 2); + EXPECT_EQ(level0_files[1]->fd.largest_seqno, 0); + ASSERT_EQ(level0_files[1]->num_entries, 1); + ASSERT_TRUE(level0_files[1]->largest.user_key() == Slice("key2")); + + EXPECT_EQ(level0_files[2]->epoch_number, 1); + EXPECT_EQ(level0_files[2]->fd.largest_seqno, 2); + ASSERT_EQ(level0_files[2]->num_entries, 2); + ASSERT_TRUE(level0_files[2]->largest.user_key() == Slice("key4")); + ASSERT_TRUE(level0_files[2]->smallest.user_key() == Slice("key1")); + + // To verify compacted file is assigned with the minimum epoch_number + // among input files' + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + EXPECT_EQ(level0_files[0]->epoch_number, 1); + ASSERT_EQ(level0_files[0]->num_entries, 4); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key4")); + ASSERT_TRUE(level0_files[0]->smallest.user_key() == Slice("key1")); +} + +TEST_F(DBTest2, SameEpochNumberAfterCompactRangeChangeLevel) { + Options options = CurrentOptions(); + options.num_levels = 7; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Set up the file in L1 to be moved to L0 in later step of CompactRange() + ASSERT_OK(Put("key1", "seq1")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1, 0); + std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 0); + std::vector level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + std::vector level2_files = GetLevelFileMetadatas(2 /* level*/); + ASSERT_EQ(level2_files.size(), 0); + + ASSERT_EQ(level1_files[0]->epoch_number, 1); + + // To verify CompactRange() moving file to L0 still keeps the file's + // epoch_number + CompactRangeOptions croptions; + croptions.change_level = true; + croptions.target_level = 0; + ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); + level0_files = GetLevelFileMetadatas(0 /* level*/); + level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + ASSERT_EQ(level1_files.size(), 0); + + EXPECT_EQ(level0_files[0]->epoch_number, 1); + + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key1")); +} + +TEST_F(DBTest2, RecoverEpochNumber) { + for (bool allow_ingest_behind : {true, false}) { + Options options = CurrentOptions(); + options.allow_ingest_behind = allow_ingest_behind; + options.num_levels = 7; + options.compaction_style = kCompactionStyleLevel; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf1"}, options); + VersionSet* versions = dbfull()->GetVersionSet(); + assert(versions); + const ColumnFamilyData* default_cf = + versions->GetColumnFamilySet()->GetDefault(); + const ColumnFamilyData* cf1 = + versions->GetColumnFamilySet()->GetColumnFamily("cf1"); + + // Set up files in default CF to recover in later step + ASSERT_OK(Put("key1", "epoch1")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1 /* level*/, 0 /* cf*/); + ASSERT_OK(Put("key2", "epoch2")); + ASSERT_OK(Flush()); + + std::vector level0_files = + GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + ASSERT_EQ(level0_files[0]->epoch_number, + 
allow_ingest_behind + ? 2 + kReservedEpochNumberForFileIngestedBehind + : 2); + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2")); + + std::vector level1_files = + GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + ASSERT_EQ(level1_files[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level1_files[0]->num_entries, 1); + ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1")); + + // Set up files in cf1 to recover in later step + ASSERT_OK(Put(1 /* cf */, "cf1_key1", "epoch1")); + ASSERT_OK(Flush(1 /* cf */)); + + std::vector level0_files_cf1 = + GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/); + ASSERT_EQ(level0_files_cf1.size(), 1); + ASSERT_EQ(level0_files_cf1[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level0_files_cf1[0]->num_entries, 1); + ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1")); + + ASSERT_EQ(default_cf->GetNextEpochNumber(), + allow_ingest_behind + ? 3 + kReservedEpochNumberForFileIngestedBehind + : 3); + ASSERT_EQ(cf1->GetNextEpochNumber(), + allow_ingest_behind + ? 2 + kReservedEpochNumberForFileIngestedBehind + : 2); + + // To verify epoch_number of files of different levels/CFs are + // persisted and recovered correctly + ReopenWithColumnFamilies({"default", "cf1"}, options); + versions = dbfull()->GetVersionSet(); + assert(versions); + default_cf = versions->GetColumnFamilySet()->GetDefault(); + cf1 = versions->GetColumnFamilySet()->GetColumnFamily("cf1"); + + level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + EXPECT_EQ(level0_files[0]->epoch_number, + allow_ingest_behind + ? 2 + kReservedEpochNumberForFileIngestedBehind + : 2); + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2")); + + level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + EXPECT_EQ(level1_files[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level1_files[0]->num_entries, 1); + ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1")); + + level0_files_cf1 = GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/); + ASSERT_EQ(level0_files_cf1.size(), 1); + EXPECT_EQ(level0_files_cf1[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level0_files_cf1[0]->num_entries, 1); + ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1")); + + // To verify next epoch number is recovered correctly + EXPECT_EQ(default_cf->GetNextEpochNumber(), + allow_ingest_behind + ? 3 + kReservedEpochNumberForFileIngestedBehind + : 3); + EXPECT_EQ(cf1->GetNextEpochNumber(), + allow_ingest_behind + ? 
2 + kReservedEpochNumberForFileIngestedBehind + : 2); + } +} + +#endif // ROCKSDB_LITE + TEST_F(DBTest2, RenameDirectory) { Options options = CurrentOptions(); DestroyAndReopen(options); diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 45ac60fce19..9d76bbb91dd 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -28,10 +28,6 @@ #include "util/random.h" #include "logging/logging.h" -extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery() { - return true; -} - namespace ROCKSDB_NAMESPACE { namespace { @@ -40,7 +36,7 @@ int64_t MaybeCurrentTime(Env* env) { env->GetCurrentTime(&time).PermitUncheckedError(); return time; } -} // namespace +} // anonymous namespace // Special Env used to delay background operations @@ -171,54 +167,54 @@ DBTestBase::~DBTestBase() { bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { #ifdef ROCKSDB_LITE - // These options are not supported in ROCKSDB_LITE - if (option_config == kHashSkipList || - option_config == kPlainTableFirstBytePrefix || - option_config == kPlainTableCappedPrefix || - option_config == kPlainTableCappedPrefixNonMmap || - option_config == kPlainTableAllBytesPrefix || - option_config == kVectorRep || option_config == kHashLinkList || - option_config == kUniversalCompaction || - option_config == kUniversalCompactionMultiLevel || - option_config == kUniversalSubcompactions || - option_config == kFIFOCompaction || - option_config == kConcurrentSkipList) { - return true; - } + // These options are not supported in ROCKSDB_LITE + if (option_config == kHashSkipList || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap || + option_config == kPlainTableAllBytesPrefix || + option_config == kVectorRep || option_config == kHashLinkList || + option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions || + option_config == kFIFOCompaction || + option_config == kConcurrentSkipList) { + return true; + } #endif - if ((skip_mask & kSkipUniversalCompaction) && - (option_config == kUniversalCompaction || - option_config == kUniversalCompactionMultiLevel || - option_config == kUniversalSubcompactions)) { - return true; - } - if ((skip_mask & kSkipMergePut) && option_config == kMergePut) { - return true; - } - if ((skip_mask & kSkipNoSeekToLast) && - (option_config == kHashLinkList || option_config == kHashSkipList)) { - return true; - } - if ((skip_mask & kSkipPlainTable) && - (option_config == kPlainTableAllBytesPrefix || - option_config == kPlainTableFirstBytePrefix || - option_config == kPlainTableCappedPrefix || - option_config == kPlainTableCappedPrefixNonMmap)) { - return true; - } - if ((skip_mask & kSkipHashIndex) && - (option_config == kBlockBasedTableWithPrefixHashIndex || - option_config == kBlockBasedTableWithWholeKeyHashIndex)) { - return true; - } - if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) { - return true; - } - if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) { - return true; - } - return false; + if ((skip_mask & kSkipUniversalCompaction) && + (option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions)) { + return true; + } + if ((skip_mask & kSkipMergePut) && option_config == kMergePut) { + return true; + } + if ((skip_mask & kSkipNoSeekToLast) && + (option_config == kHashLinkList || 
option_config == kHashSkipList)) { + return true; + } + if ((skip_mask & kSkipPlainTable) && + (option_config == kPlainTableAllBytesPrefix || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap)) { + return true; + } + if ((skip_mask & kSkipHashIndex) && + (option_config == kBlockBasedTableWithPrefixHashIndex || + option_config == kBlockBasedTableWithWholeKeyHashIndex)) { + return true; + } + if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) { + return true; + } + if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) { + return true; + } + return false; } bool DBTestBase::ShouldSkipAwsOptions(int option_config) { @@ -491,13 +487,13 @@ Options DBTestBase::GetOptions( options.allow_concurrent_memtable_write = false; options.unordered_write = false; break; - case kDirectIO: { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - options.compaction_readahead_size = 2 * 1024 * 1024; - SetupSyncPointsToMockDirectIO(); - break; - } + case kDirectIO: { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + options.compaction_readahead_size = 2 * 1024 * 1024; + SetupSyncPointsToMockDirectIO(); + break; + } #endif // ROCKSDB_LITE case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); @@ -1441,12 +1437,14 @@ void DBTestBase::GetSstFiles(Env* env, std::string path, std::vector* files) { EXPECT_OK(env->GetChildren(path, files)); - files->erase( - std::remove_if(files->begin(), files->end(), [](std::string name) { - uint64_t number; - FileType type; - return !(ParseFileName(name, &number, &type) && type == kTableFile); - }), files->end()); + files->erase(std::remove_if(files->begin(), files->end(), + [](std::string name) { + uint64_t number; + FileType type; + return !(ParseFileName(name, &number, &type) && + type == kTableFile); + }), + files->end()); } int DBTestBase::GetSstFileCount(std::string path) { @@ -1716,8 +1714,8 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, iter_cnt++; total_reads++; } - ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / " - << true_data.size(); + ASSERT_EQ(data_iter, true_data.end()) + << iter_cnt << " / " << true_data.size(); delete iter; // Verify Iterator::Prev() @@ -1739,8 +1737,8 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, iter_cnt++; total_reads++; } - ASSERT_EQ(data_rev, true_data.rend()) << iter_cnt << " / " - << true_data.size(); + ASSERT_EQ(data_rev, true_data.rend()) + << iter_cnt << " / " << true_data.size(); // Verify Iterator::Seek() for (auto kv : true_data) { @@ -1770,8 +1768,8 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, iter_cnt++; total_reads++; } - ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / " - << true_data.size(); + ASSERT_EQ(data_iter, true_data.end()) + << iter_cnt << " / " << true_data.size(); // Verify ForwardIterator::Seek() for (auto kv : true_data) { @@ -1854,12 +1852,13 @@ TargetCacheChargeTrackingCache::TargetCacheChargeTrackingCache( cache_charge_increments_sum_(0) {} template -Status TargetCacheChargeTrackingCache::Insert( - const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), Handle** handle, - Priority priority) { - Status s = target_->Insert(key, value, charge, deleter, handle, priority); - if (deleter == kNoopDeleter) { +Status TargetCacheChargeTrackingCache::Insert(const Slice& key, + ObjectPtr value, + 
const CacheItemHelper* helper, + size_t charge, Handle** handle, + Priority priority) { + Status s = target_->Insert(key, value, helper, charge, handle, priority); + if (helper == kCrmHelper) { if (last_peak_tracked_) { cache_charge_peak_ = 0; cache_charge_increment_ = 0; @@ -1878,8 +1877,8 @@ Status TargetCacheChargeTrackingCache::Insert( template bool TargetCacheChargeTrackingCache::Release(Handle* handle, bool erase_if_last_ref) { - auto deleter = GetDeleter(handle); - if (deleter == kNoopDeleter) { + auto helper = GetCacheItemHelper(handle); + if (helper == kCrmHelper) { if (!last_peak_tracked_) { cache_charge_peaks_.push_back(cache_charge_peak_); cache_charge_increments_sum_ += cache_charge_increment_; @@ -1892,8 +1891,8 @@ bool TargetCacheChargeTrackingCache::Release(Handle* handle, } template -const Cache::DeleterFn TargetCacheChargeTrackingCache::kNoopDeleter = - CacheReservationManagerImpl::TEST_GetNoopDeleterForRole(); +const Cache::CacheItemHelper* TargetCacheChargeTrackingCache::kCrmHelper = + CacheReservationManagerImpl::TEST_GetCacheItemHelperForRole(); template class TargetCacheChargeTrackingCache< CacheEntryRole::kFilterConstruction>; diff --git a/db/db_test_util.h b/db/db_test_util.h index c81f73e805a..06212868a8a 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -49,7 +49,8 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" -extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery(); +// In case defined by Windows headers +#undef small namespace ROCKSDB_NAMESPACE { class MockEnv; @@ -219,9 +220,7 @@ class SpecialEnv : public EnvWrapper { Env::IOPriority GetIOPriority() override { return base_->GetIOPriority(); } - bool use_direct_io() const override { - return base_->use_direct_io(); - } + bool use_direct_io() const override { return base_->use_direct_io(); } Status Allocate(uint64_t offset, uint64_t len) override { return base_->Allocate(offset, len); } @@ -874,17 +873,34 @@ class FlushCounterListener : public EventListener { #endif // A test merge operator mimics put but also fails if one of merge operands is -// "corrupted". +// "corrupted", "corrupted_try_merge", or "corrupted_must_merge". 
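+// The operand suffix selects the MergeOperator::OpFailureScope reported back +// on failure (kDefault, kTryMerge, or kMustMerge), which bounds how much of +// the merge operation is considered failed.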
class TestPutOperator : public MergeOperator { public: virtual bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const override { + static const std::map + bad_operand_to_op_failure_scope = { + {"corrupted", MergeOperator::OpFailureScope::kDefault}, + {"corrupted_try_merge", MergeOperator::OpFailureScope::kTryMerge}, + {"corrupted_must_merge", + MergeOperator::OpFailureScope::kMustMerge}}; + auto check_operand = + [](Slice operand_val, + MergeOperator::OpFailureScope* op_failure_scope) -> bool { + auto iter = bad_operand_to_op_failure_scope.find(operand_val.ToString()); + if (iter != bad_operand_to_op_failure_scope.end()) { + *op_failure_scope = iter->second; + return false; + } + return true; + }; if (merge_in.existing_value != nullptr && - *(merge_in.existing_value) == "corrupted") { + !check_operand(*merge_in.existing_value, + &merge_out->op_failure_scope)) { return false; } for (auto value : merge_in.operand_list) { - if (value == "corrupted") { + if (!check_operand(value, &merge_out->op_failure_scope)) { return false; } } @@ -904,17 +920,18 @@ class CacheWrapper : public Cache { const char* Name() const override { return target_->Name(); } - using Cache::Insert; - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), + Status Insert(const Slice& key, ObjectPtr value, + const CacheItemHelper* helper, size_t charge, Handle** handle = nullptr, Priority priority = Priority::LOW) override { - return target_->Insert(key, value, charge, deleter, handle, priority); + return target_->Insert(key, value, helper, charge, handle, priority); } - using Cache::Lookup; - Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { - return target_->Lookup(key, stats); + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr) override { + return target_->Lookup(key, helper, create_context, priority, wait, stats); } bool Ref(Handle* handle) override { return target_->Ref(handle); } @@ -924,7 +941,7 @@ class CacheWrapper : public Cache { return target_->Release(handle, erase_if_last_ref); } - void* Value(Handle* handle) override { return target_->Value(handle); } + ObjectPtr Value(Handle* handle) override { return target_->Value(handle); } void Erase(const Slice& key) override { target_->Erase(key); } uint64_t NewId() override { return target_->NewId(); } @@ -953,18 +970,13 @@ class CacheWrapper : public Cache { return target_->GetCharge(handle); } - DeleterFn GetDeleter(Handle* handle) const override { - return target_->GetDeleter(handle); - } - - void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override { - target_->ApplyToAllCacheEntries(callback, thread_safe); + const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { + return target_->GetCacheItemHelper(handle); } void ApplyToAllEntries( - const std::function& callback, + const std::function& callback, const ApplyToAllEntriesOptions& opts) override { target_->ApplyToAllEntries(callback, opts); } @@ -992,9 +1004,8 @@ class TargetCacheChargeTrackingCache : public CacheWrapper { public: explicit TargetCacheChargeTrackingCache(std::shared_ptr target); - using Cache::Insert; - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), + Status Insert(const Slice& key, ObjectPtr value, + const CacheItemHelper* helper, size_t 
charge, Handle** handle = nullptr, Priority priority = Priority::LOW) override; @@ -1010,7 +1021,7 @@ class TargetCacheChargeTrackingCache : public CacheWrapper { } private: - static const Cache::DeleterFn kNoopDeleter; + static const Cache::CacheItemHelper* kCrmHelper; std::size_t cur_cache_charge_; std::size_t cache_charge_peak_; diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 9b63e6e16cb..f53c36f229a 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -39,8 +39,8 @@ class DBTestUniversalCompactionBase class DBTestUniversalCompaction : public DBTestUniversalCompactionBase { public: - DBTestUniversalCompaction() : - DBTestUniversalCompactionBase("/db_universal_compaction_test") {} + DBTestUniversalCompaction() + : DBTestUniversalCompactionBase("/db_universal_compaction_test") {} }; class DBTestUniversalCompaction2 : public DBTestBase { @@ -93,7 +93,7 @@ class KeepFilterFactory : public CompactionFilterFactory { std::atomic_bool expect_full_compaction_; std::atomic_bool expect_manual_compaction_; }; -} // namespace +} // anonymous namespace // Make sure we don't trigger a problem if the trigger condition is given // to be 0, which is invalid. @@ -563,8 +563,7 @@ TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) { } if (compaction_input_file_names.size() == 0) { - compaction_input_file_names.push_back( - cf_meta.levels[0].files[0].name); + compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name); } // expect fail since universal compaction only allows L0 output @@ -574,28 +573,23 @@ .ok()); // expect ok and verify the compacted files no longer exist. - ASSERT_OK(dbfull()->CompactFiles( - CompactionOptions(), handles_[1], - compaction_input_file_names, 0)); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); VerifyCompactionResult( - cf_meta, - std::set<std::string>(compaction_input_file_names.begin(), - compaction_input_file_names.end())); + cf_meta, std::set<std::string>(compaction_input_file_names.begin(), + compaction_input_file_names.end())); compaction_input_file_names.clear(); // Pick the first and the last file, expect everything is // compacted into one single file.
+ compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name); compaction_input_file_names.push_back( - cf_meta.levels[0].files[0].name); - compaction_input_file_names.push_back( - cf_meta.levels[0].files[ - cf_meta.levels[0].files.size() - 1].name); - ASSERT_OK(dbfull()->CompactFiles( - CompactionOptions(), handles_[1], - compaction_input_file_names, 0)); + cf_meta.levels[0].files[cf_meta.levels[0].files.size() - 1].name); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); ASSERT_EQ(cf_meta.levels[0].files.size(), 1U); @@ -604,7 +598,7 @@ TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) { TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; - options.write_buffer_size = 100 << 10; // 100KB + options.write_buffer_size = 100 << 10; // 100KB options.num_levels = 7; options.disable_auto_compactions = true; DestroyAndReopen(options); @@ -640,9 +634,9 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) { class DBTestUniversalCompactionMultiLevels : public DBTestUniversalCompactionBase { public: - DBTestUniversalCompactionMultiLevels() : - DBTestUniversalCompactionBase( - "/db_universal_compaction_multi_levels_test") {} + DBTestUniversalCompactionMultiLevels() + : DBTestUniversalCompactionBase( + "/db_universal_compaction_multi_levels_test") {} }; TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) { @@ -725,12 +719,11 @@ INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels, ::testing::Combine(::testing::Values(3, 20), ::testing::Bool())); -class DBTestUniversalCompactionParallel : - public DBTestUniversalCompactionBase { +class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase { public: - DBTestUniversalCompactionParallel() : - DBTestUniversalCompactionBase( - "/db_universal_compaction_prallel_test") {} + DBTestUniversalCompactionParallel() + : DBTestUniversalCompactionBase("/db_universal_compaction_prallel_test") { + } }; TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { @@ -919,8 +912,8 @@ INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel, TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; - options.write_buffer_size = 105 << 10; // 105KB - options.arena_block_size = 4 << 10; // 4KB + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; // 4KB options.target_file_size_base = 32 << 10; // 32KB options.level0_file_num_compaction_trigger = 4; options.num_levels = num_levels_; @@ -951,8 +944,8 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; - options.write_buffer_size = 105 << 10; // 105KB - options.arena_block_size = 4 << 10; // 4KB + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; // 4KB options.target_file_size_base = 32 << 10; // 32KB // trigger compaction if there are >= 4 files options.level0_file_num_compaction_trigger = 4; @@ -1353,7 +1346,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) { cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 
1024); cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024); option_vector.emplace_back(DBOptions(options), cf_opt1); - CreateColumnFamilies({"one"},option_vector[1]); + CreateColumnFamilies({"one"}, option_vector[1]); // Configure CF2 specific paths. cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024); @@ -1361,7 +1354,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) { cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024); cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024); option_vector.emplace_back(DBOptions(options), cf_opt2); - CreateColumnFamilies({"two"},option_vector[2]); + CreateColumnFamilies({"two"}, option_vector[2]); ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); @@ -1567,7 +1560,6 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { verify_func(max_key3); } - TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) { if (!Snappy_Supported()) { return; @@ -1829,9 +1821,9 @@ INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction, class DBTestUniversalManualCompactionOutputPathId : public DBTestUniversalCompactionBase { public: - DBTestUniversalManualCompactionOutputPathId() : - DBTestUniversalCompactionBase( - "/db_universal_compaction_manual_pid_test") {} + DBTestUniversalManualCompactionOutputPathId() + : DBTestUniversalCompactionBase( + "/db_universal_compaction_manual_pid_test") {} }; TEST_P(DBTestUniversalManualCompactionOutputPathId, @@ -2236,8 +2228,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else - (void) argc; - (void) argv; + (void)argc; + (void)argv; return 0; #endif } diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 5b5ec76afa5..99d0b3c4c8d 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -610,6 +610,52 @@ TEST_F(DBWALTest, WALWithChecksumHandoff) { #endif // ROCKSDB_ASSERT_STATUS_CHECKED } +#ifndef ROCKSDB_LITE +TEST_F(DBWALTest, LockWal) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBWALTest::LockWal:AfterGetSortedWal", + "DBWALTest::LockWal:BeforeFlush:1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("foo", "v")); + ASSERT_OK(Put("bar", "v")); + port::Thread worker([&]() { + TEST_SYNC_POINT("DBWALTest::LockWal:BeforeFlush:1"); + Status tmp_s = db_->Flush(FlushOptions()); + ASSERT_OK(tmp_s); + }); + + ASSERT_OK(db_->LockWAL()); + // Verify writes are stopped + WriteOptions wopts; + wopts.no_slowdown = true; + Status s = db_->Put(wopts, "foo", "dontcare"); + ASSERT_TRUE(s.IsIncomplete()); + { + VectorLogPtr wals; + ASSERT_OK(db_->GetSortedWalFiles(wals)); + ASSERT_FALSE(wals.empty()); + } + TEST_SYNC_POINT("DBWALTest::LockWal:AfterGetSortedWal"); + FlushOptions flush_opts; + flush_opts.wait = false; + s = db_->Flush(flush_opts); + ASSERT_TRUE(s.IsTryAgain()); + ASSERT_OK(db_->UnlockWAL()); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "dontcare")); + + worker.join(); + + SyncPoint::GetInstance()->DisableProcessing(); + } while (ChangeWalOptions()); +} +#endif //!
ROCKSDB_LITE + class DBRecoveryTestBlobError : public DBWALTest, public testing::WithParamInterface<std::string> { @@ -1599,6 +1645,89 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { delete db1; } +TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + DestroyAndReopen(options); + + // Accumulate memtable m1 and create the 1st wal (i.e., 4.log) + ASSERT_OK(Put(Key(1), "")); + ASSERT_OK(Put(Key(2), "")); + ASSERT_OK(Put(Key(3), "")); + + const std::string wal_file_path = db_->GetName() + "/000004.log"; + + // Coerce the following sequence of events: + // (1) Flush() marks 4.log to be obsoleted, 8.log to be the latest (i.e., + // active) log and releases the lock + // (2) SyncWAL() proceeds with the lock. It + // creates a new manifest and syncs all the inactive WALs before the latest + // (i.e., active log), which is 4.log. Note that SyncWAL() is not aware of + // the fact that 4.log has been marked as to be obsoleted. Prior to the fix, + // such a WAL sync will then add a WAL addition record of 4.log to the new + // manifest without any special treatment. + // (3) BackgroundFlush() will eventually purge 4.log. + bool wal_synced = false; + SyncPoint::GetInstance()->SetCallBack( + "FindObsoleteFiles::PostMutexUnlock", [&](void*) { + ASSERT_OK(env_->FileExists(wal_file_path)); + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:" + "PostDecidingCreateNewManifestOrNot", + [&](void* arg) { + bool* new_descriptor_log = (bool*)arg; + *new_descriptor_log = true; + }); + + ASSERT_OK(db_->SyncWAL()); + wal_synced = true; + }); + + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DeleteObsoleteFileImpl:AfterDeletion2", [&](void* arg) { + std::string* file_name = (std::string*)arg; + if (*file_name == wal_file_path) { + TEST_SYNC_POINT( + "DBWALTest::" + "FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" + "PostDeleteWAL"); + } + }); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:FilesFound", + "PreConfrimObsoletedWALSynced"}, + {"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" + "PostDeleteWAL", + "PreConfrimWALDeleted"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Flush()); + + TEST_SYNC_POINT("PreConfrimObsoletedWALSynced"); + ASSERT_TRUE(wal_synced); + + TEST_SYNC_POINT("PreConfrimWALDeleted"); + // BackgroundFlush() purged 4.log + // because the memtable associated with the WAL was flushed and a new WAL + // was created (i.e., 8.log) + ASSERT_TRUE(env_->FileExists(wal_file_path).IsNotFound()); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // To verify the corruption of "Missing WAL with log number: 4" under + // `options.track_and_verify_wals_in_manifest = true` is fixed. + // + // Before the fix, `db_->SyncWAL()` will sync and record a WAL addition of + // the obsoleted WAL 4.log in a new manifest without any special treatment. + // This will result in missing-wal corruption in DB::Reopen().
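+ // With the fix, syncing the already-obsoleted 4.log no longer leaves it + // recorded as a live WAL in the new manifest, so the reopen below is + // expected to succeed.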
+ Status s = TryReopen(options); + EXPECT_OK(s); +} + // Test scope: // - We expect to open data store under all circumstances // - We expect only data upto the point where the first error was encountered diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index 347f22951dc..4208169236d 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -18,6 +18,7 @@ #endif #include "test_util/testutil.h" #include "utilities/fault_injection_env.h" +#include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { @@ -50,7 +51,7 @@ TEST_F(DBBasicTestWithTimestamp, SanityChecks) { db_->Put(WriteOptions(), "key", dummy_ts, "value").IsInvalidArgument()); ASSERT_TRUE(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), "key", dummy_ts, "value") - .IsNotSupported()); + .IsInvalidArgument()); ASSERT_TRUE(db_->Delete(WriteOptions(), "key", dummy_ts).IsInvalidArgument()); ASSERT_TRUE( db_->SingleDelete(WriteOptions(), "key", dummy_ts).IsInvalidArgument()); @@ -96,7 +97,7 @@ TEST_F(DBBasicTestWithTimestamp, SanityChecks) { ASSERT_TRUE(db_->Put(WriteOptions(), handle, "key", wrong_ts, "value") .IsInvalidArgument()); ASSERT_TRUE(db_->Merge(WriteOptions(), handle, "key", wrong_ts, "value") - .IsNotSupported()); + .IsInvalidArgument()); ASSERT_TRUE( db_->Delete(WriteOptions(), handle, "key", wrong_ts).IsInvalidArgument()); ASSERT_TRUE(db_->SingleDelete(WriteOptions(), handle, "key", wrong_ts) @@ -3690,6 +3691,213 @@ TEST_F(DBBasicTestWithTimestamp, DeleteRangeGetIteratorWithSnapshot) { db_->ReleaseSnapshot(after_tombstone); Close(); } + +TEST_F(DBBasicTestWithTimestamp, MergeBasic) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.merge_operator = std::make_shared('.'); + DestroyAndReopen(options); + + const std::array write_ts_strs = { + Timestamp(100, 0), Timestamp(200, 0), Timestamp(300, 0)}; + constexpr size_t kNumOfUniqKeys = 100; + ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); + + for (size_t i = 0; i < write_ts_strs.size(); ++i) { + for (size_t j = 0; j < kNumOfUniqKeys; ++j) { + Status s; + if (i == 0) { + const std::string val = "v" + std::to_string(j) + "_0"; + s = db_->Put(WriteOptions(), Key1(j), write_ts_strs[i], val); + } else { + const std::string merge_op = std::to_string(i); + s = db_->Merge(WriteOptions(), default_cf, Key1(j), write_ts_strs[i], + merge_op); + } + ASSERT_OK(s); + } + } + + std::array read_ts_strs = { + Timestamp(150, 0), Timestamp(250, 0), Timestamp(350, 0)}; + + const auto verify_db_with_get = [&]() { + for (size_t i = 0; i < kNumOfUniqKeys; ++i) { + const std::string base_val = "v" + std::to_string(i) + "_0"; + const std::array expected_values = { + base_val, base_val + ".1", base_val + ".1.2"}; + const std::array& expected_ts = write_ts_strs; + ReadOptions read_opts; + for (size_t j = 0; j < read_ts_strs.size(); ++j) { + Slice read_ts = read_ts_strs[j]; + read_opts.timestamp = &read_ts; + std::string value; + std::string ts; + const Status s = db_->Get(read_opts, Key1(i), &value, &ts); + ASSERT_OK(s); + ASSERT_EQ(expected_values[j], value); + ASSERT_EQ(expected_ts[j], ts); + + // Do Seek/SeekForPrev + std::unique_ptr it(db_->NewIterator(read_opts)); + it->Seek(Key1(i)); + ASSERT_TRUE(it->Valid()); + 
+        ASSERT_EQ(expected_values[j], it->value());
+        ASSERT_EQ(expected_ts[j], it->timestamp());
+
+        it->SeekForPrev(Key1(i));
+        ASSERT_TRUE(it->Valid());
+        ASSERT_EQ(expected_values[j], it->value());
+        ASSERT_EQ(expected_ts[j], it->timestamp());
+      }
+    }
+  };
+
+  const auto verify_db_with_iterator = [&]() {
+    std::string value_suffix;
+    for (size_t i = 0; i < read_ts_strs.size(); ++i) {
+      ReadOptions read_opts;
+      Slice read_ts = read_ts_strs[i];
+      read_opts.timestamp = &read_ts;
+      std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+      size_t key_int_val = 0;
+      for (it->SeekToFirst(); it->Valid(); it->Next(), ++key_int_val) {
+        const std::string key = Key1(key_int_val);
+        const std::string value =
+            "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+        ASSERT_EQ(key, it->key());
+        ASSERT_EQ(value, it->value());
+        ASSERT_EQ(write_ts_strs[i], it->timestamp());
+      }
+      ASSERT_EQ(kNumOfUniqKeys, key_int_val);
+
+      key_int_val = kNumOfUniqKeys - 1;
+      for (it->SeekToLast(); it->Valid(); it->Prev(), --key_int_val) {
+        const std::string key = Key1(key_int_val);
+        const std::string value =
+            "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+        ASSERT_EQ(key, it->key());
+        ASSERT_EQ(value, it->value());
+        ASSERT_EQ(write_ts_strs[i], it->timestamp());
+      }
+      ASSERT_EQ(std::numeric_limits<size_t>::max(), key_int_val);
+
+      value_suffix = value_suffix + "." + std::to_string(i + 1);
+    }
+  };
+
+  verify_db_with_get();
+  verify_db_with_iterator();
+
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  verify_db_with_get();
+  verify_db_with_iterator();
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeAfterDeletion) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+  DestroyAndReopen(options);
+
+  ColumnFamilyHandle* const column_family = db_->DefaultColumnFamily();
+
+  const size_t num_keys_per_file = 10;
+  const size_t num_merges_per_key = 2;
+  for (size_t i = 0; i < num_keys_per_file; ++i) {
+    std::string ts = Timestamp(i + 10000, 0);
+    Status s = db_->Delete(WriteOptions(), Key1(i), ts);
+    ASSERT_OK(s);
+    for (size_t j = 1; j <= num_merges_per_key; ++j) {
+      ts = Timestamp(i + 10000 + j, 0);
+      s = db_->Merge(WriteOptions(), column_family, Key1(i), ts,
+                     std::to_string(j));
+      ASSERT_OK(s);
+    }
+  }
+
+  const auto verify_db = [&]() {
+    ReadOptions read_opts;
+    std::string read_ts_str = Timestamp(20000, 0);
+    Slice ts = read_ts_str;
+    read_opts.timestamp = &ts;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    size_t count = 0;
+    for (it->SeekToFirst(); it->Valid(); it->Next(), ++count) {
+      std::string key = Key1(count);
+      ASSERT_EQ(key, it->key());
+      std::string value;
+      for (size_t j = 1; j <= num_merges_per_key; ++j) {
+        value.append(std::to_string(j));
+        if (j < num_merges_per_key) {
+          value.push_back('.');
+        }
+      }
+      ASSERT_EQ(value, it->value());
+      std::string ts1 = Timestamp(count + 10000 + num_merges_per_key, 0);
+      ASSERT_EQ(ts1, it->timestamp());
+    }
+    ASSERT_OK(it->status());
+    ASSERT_EQ(num_keys_per_file, count);
+    for (it->SeekToLast(); it->Valid(); it->Prev(), --count) {
+      std::string key = Key1(count - 1);
+      ASSERT_EQ(key, it->key());
+      std::string value;
+      for (size_t j = 1; j <= num_merges_per_key; ++j) {
+        value.append(std::to_string(j));
+        if (j < num_merges_per_key) {
+          value.push_back('.');
+        }
+      }
+      ASSERT_EQ(value, it->value());
+      std::string ts1 = Timestamp(count - 1 + 10000 +
+                                  num_merges_per_key, 0);
+      ASSERT_EQ(ts1, it->timestamp());
+    }
+    ASSERT_OK(it->status());
+    ASSERT_EQ(0, count);
+  };
+
+  verify_db();
+
+  Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, RangeTombstoneApproximateSize) {
+  // Test code path for calculating range tombstone compensated size
+  // during flush and compaction.
+  Options options = CurrentOptions();
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+  DestroyAndReopen(options);
+  // So that the compaction below is non-bottommost and will calculate
+  // compensated range tombstone size.
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), Timestamp(1, 0), "val"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(5);
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+                             Key(1), Timestamp(1, 0)));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(2), Timestamp(2, 0)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+          ->cfd(),
+      0 /* input_level */, 1 /* output_level */, CompactRangeOptions(),
+      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+      "" /*trim_ts*/));
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc
index d28f67e05a7..403d9907c57 100644
--- a/db/db_with_timestamp_compaction_test.cc
+++ b/db/db_with_timestamp_compaction_test.cc
@@ -323,6 +323,27 @@ TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL1) {
         static_cast<int>(compaction_job_info.input_files.size()));
   }
 }
+
+TEST_F(TimestampCompatibleCompactionTest, EmptyCompactionOutput) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  DestroyAndReopen(options);
+
+  std::string ts_str = Timestamp(1);
+  WriteOptions wopts;
+  ASSERT_OK(
+      db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3", ts_str));
+  ASSERT_OK(Flush());
+
+  ts_str = Timestamp(3);
+  Slice ts = ts_str;
+  CompactRangeOptions cro;
+  // range tombstone will be dropped during compaction
+  cro.full_history_ts_low = &ts;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+}
 #endif  // !ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_write_test.cc b/db/db_write_test.cc
index 2aa45acfa96..1ce2b14b20e 100644
--- a/db/db_write_test.cc
+++ b/db/db_write_test.cc
@@ -170,7 +170,8 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) {
 
 TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
   Options options = GetOptions();
-  options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+      4;
   std::vector<port::Thread> threads;
   std::atomic<int> thread_num(0);
   port::Mutex mutex;
@@ -195,7 +196,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
     Status s = dbfull()->Put(wo, key, "bar");
     ASSERT_TRUE(s.ok() || s.IsIncomplete());
   };
-  std::function<void(void*)> unblock_main_thread_func = [&](void *) {
+  std::function<void(void*)> unblock_main_thread_func = [&](void*) {
    mutex.Lock();
    ++writers;
    cv.SignalAll();
@@ -254,8 +255,9 @@
   ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
   // This would have triggered a write
stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); - // The leader is going to create missing newer links. When the leader finishes, - // the next leader is going to delay writes and fail writers with no_slowdown + // The leader is going to create missing newer links. When the leader + // finishes, the next leader is going to delay writes and fail writers with + // no_slowdown TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3"); for (auto& t : threads) { @@ -623,42 +625,43 @@ TEST_P(DBWriteTest, LockWalInEffect) { } TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { - Options options = GetOptions(); - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.statistics->set_stats_level(StatsLevel::kAll); - Reopen(options); - std::string wal_key_prefix = "WAL_KEY_"; - std::string no_wal_key_prefix = "K_"; - // 100 KB value each for NO-WAL operation - std::string no_wal_value(1024 * 100, 'X'); - // 1B value each for WAL operation - std::string wal_value = "0"; - std::thread threads[10]; - for (int t = 0; t < 10; t++) { - threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix, no_wal_value, this] { - for(int i = 0; i < 10; i++) { - ROCKSDB_NAMESPACE::WriteOptions write_option_disable; - write_option_disable.disableWAL = true; - ROCKSDB_NAMESPACE::WriteOptions write_option_default; - std::string no_wal_key = no_wal_key_prefix + std::to_string(t) + - "_" + std::to_string(i); - ASSERT_OK( - this->Put(no_wal_key, no_wal_value, write_option_disable)); - std::string wal_key = - wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); - ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); - ASSERT_OK(dbfull()->SyncWAL()); - } - return; - }); - } - for (auto& t: threads) { - t.join(); - } - uint64_t bytes_num = options.statistics->getTickerCount( - ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES); - // written WAL size should less than 100KB (even included HEADER & FOOTER overhead) - ASSERT_LE(bytes_num, 1024 * 100); + Options options = GetOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + Reopen(options); + std::string wal_key_prefix = "WAL_KEY_"; + std::string no_wal_key_prefix = "K_"; + // 100 KB value each for NO-WAL operation + std::string no_wal_value(1024 * 100, 'X'); + // 1B value each for WAL operation + std::string wal_value = "0"; + std::thread threads[10]; + for (int t = 0; t < 10; t++) { + threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix, + no_wal_value, this] { + for (int i = 0; i < 10; i++) { + ROCKSDB_NAMESPACE::WriteOptions write_option_disable; + write_option_disable.disableWAL = true; + ROCKSDB_NAMESPACE::WriteOptions write_option_default; + std::string no_wal_key = + no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i); + ASSERT_OK(this->Put(no_wal_key, no_wal_value, write_option_disable)); + std::string wal_key = + wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); + ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); + ASSERT_OK(dbfull()->SyncWAL()); + } + return; + }); + } + for (auto& t : threads) { + t.join(); + } + uint64_t bytes_num = options.statistics->getTickerCount( + ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES); + // written WAL size should less than 100KB (even included HEADER & FOOTER + // overhead) + ASSERT_LE(bytes_num, 1024 * 100); } TEST_P(DBWriteTest, DisableWriteStall) { diff --git a/db/dbformat.cc b/db/dbformat.cc index 
b0ac6c3393b..2c3581ca005 100644
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -150,6 +150,31 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a,
   return r;
 }
 
+int InternalKeyComparator::Compare(const Slice& a,
+                                   const ParsedInternalKey& b) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_.Compare(ExtractUserKey(a), b.user_key);
+  if (r == 0) {
+    const uint64_t anum =
+        DecodeFixed64(a.data() + a.size() - kNumInternalBytes);
+    const uint64_t bnum = (b.sequence << 8) | b.type;
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+                                   const Slice& b) const {
+  return -Compare(b, a);
+}
+
 LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
                      const Slice* ts) {
   size_t usize = _user_key.size();
diff --git a/db/dbformat.h b/db/dbformat.h
index b3981fc74f0..d9fadea1ca9 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -129,6 +129,12 @@ struct ParsedInternalKey {
     const char* addr = user_key.data() + user_key.size() - ts.size();
     memcpy(const_cast<char*>(addr), ts.data(), ts.size());
   }
+
+  Slice GetTimestamp(size_t ts_sz) {
+    assert(ts_sz <= user_key.size());
+    const char* addr = user_key.data() + user_key.size() - ts_sz;
+    return Slice(const_cast<char*>(addr), ts_sz);
+  }
 };
 
 // Return the length of the encoding of "key".
@@ -277,6 +283,8 @@ class InternalKeyComparator
   int Compare(const InternalKey& a, const InternalKey& b) const;
   int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+  int Compare(const Slice& a, const ParsedInternalKey& b) const;
+  int Compare(const ParsedInternalKey& a, const Slice& b) const;
   // In this `Compare()` overload, the sequence numbers provided in
   // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
   // and `b`, respectively. To disable sequence number override(s), provide the
@@ -439,6 +447,8 @@ class IterKey {
   void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
 
   // Returns the key in whichever format that was provided to KeyIter
+  // If user-defined timestamp is enabled, then the timestamp is included in
+  // the returned result.
   Slice GetKey() const { return Slice(key_, key_size_); }
 
   Slice GetInternalKey() const {
@@ -446,6 +456,8 @@ class IterKey {
     return Slice(key_, key_size_);
   }
 
+  // If user-defined timestamp is enabled, then the timestamp is included in
+  // the result returned by GetUserKey().
   Slice GetUserKey() const {
     if (IsUserKey()) {
       return Slice(key_, key_size_);
@@ -495,6 +507,9 @@ class IterKey {
     return SetKeyImpl(key, copy);
   }
 
+  // If user-defined timestamp is enabled, then `key` includes the timestamp.
+  // TODO(yanqin) this is also used to set prefixes, which do not include
+  // timestamps; that case should be handled.
   Slice SetUserKey(const Slice& key, bool copy = true) {
     is_user_key_ = true;
     return SetKeyImpl(key, copy);
@@ -689,6 +704,8 @@ extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
 // slice they point to.
 // Tag is defined as ValueType.
 // input will be advanced to after the record.
+// If user-defined timestamp is enabled for a column family, then the `key`
+// resulting from this call will include the timestamp.
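+// For example (editorial sketch, using helpers declared in this header):
+// with a ts_sz-byte timestamp the decoded `key` is laid out as
+// |user key|timestamp|, so callers needing the raw user key strip the suffix:
+//
+//   Slice ts = ExtractTimestampFromUserKey(key, ts_sz);
+//   Slice user_key_without_ts = StripTimestampFromUserKey(key, ts_sz);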
extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, uint32_t* column_family, Slice* key, Slice* value, Slice* blob, Slice* xid); diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index b52b0192cb7..8dc3387df03 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -15,8 +15,7 @@ namespace ROCKSDB_NAMESPACE { -static std::string IKey(const std::string& user_key, - uint64_t seq, +static std::string IKey(const std::string& user_key, uint64_t seq, ValueType vt) { std::string encoded; AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); @@ -37,9 +36,7 @@ static std::string ShortSuccessor(const std::string& s) { return result; } -static void TestKey(const std::string& key, - uint64_t seq, - ValueType vt) { +static void TestKey(const std::string& key, uint64_t seq, ValueType vt) { std::string encoded = IKey(key, seq, vt); Slice in(encoded); @@ -56,13 +53,19 @@ static void TestKey(const std::string& key, class FormatTest : public testing::Test {}; TEST_F(FormatTest, InternalKey_EncodeDecode) { - const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; - const uint64_t seq[] = { - 1, 2, 3, - (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, - (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, - (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 - }; + const char* keys[] = {"", "k", "hello", "longggggggggggggggggggggg"}; + const uint64_t seq[] = {1, + 2, + 3, + (1ull << 8) - 1, + 1ull << 8, + (1ull << 8) + 1, + (1ull << 16) - 1, + 1ull << 16, + (1ull << 16) + 1, + (1ull << 32) - 1, + 1ull << 32, + (1ull << 32) + 1}; for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { TestKey(keys[k], seq[s], kTypeValue); @@ -74,27 +77,25 @@ TEST_F(FormatTest, InternalKey_EncodeDecode) { TEST_F(FormatTest, InternalKeyShortSeparator) { // When user keys are same ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 99, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 101, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeDeletion))); + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 99, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 101, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion))); // When user keys are misordered ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("bar", 99, kTypeValue))); + Shorten(IKey("foo", 100, kTypeValue), IKey("bar", 99, kTypeValue))); // When user keys are different, but correctly ordered - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - Shorten(IKey("foo", 100, kTypeValue), - IKey("hello", 200, kTypeValue))); + ASSERT_EQ( + IKey("g", kMaxSequenceNumber, kValueTypeForSeek), + Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue))); ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek), Shorten(IKey("ABC1AAAAA", 100, kTypeValue), @@ -121,14 +122,14 @@ TEST_F(FormatTest, InternalKeyShortSeparator) { Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 
200, kTypeValue))); // When start user key is prefix of limit user key - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foobar", 200, kTypeValue))); + ASSERT_EQ( + IKey("foo", 100, kTypeValue), + Shorten(IKey("foo", 100, kTypeValue), IKey("foobar", 200, kTypeValue))); // When limit user key is prefix of start user key - ASSERT_EQ(IKey("foobar", 100, kTypeValue), - Shorten(IKey("foobar", 100, kTypeValue), - IKey("foo", 200, kTypeValue))); + ASSERT_EQ( + IKey("foobar", 100, kTypeValue), + Shorten(IKey("foobar", 100, kTypeValue), IKey("foo", 200, kTypeValue))); } TEST_F(FormatTest, InternalKeyShortestSuccessor) { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 18f2577e978..34925e828b7 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -10,9 +10,11 @@ #ifndef ROCKSDB_LITE #include + #include #include #include + #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/version_set.h" @@ -55,7 +57,7 @@ class DeleteFileTest : public DBTestBase { WriteOptions options; options.sync = false; ReadOptions roptions; - for (int i = startkey; i < (numkeys + startkey) ; i++) { + for (int i = startkey; i < (numkeys + startkey); i++) { std::string temp = std::to_string(i); Slice key(temp); Slice value(temp); @@ -63,10 +65,8 @@ class DeleteFileTest : public DBTestBase { } } - int numKeysInLevels( - std::vector &metadata, - std::vector *keysperlevel = nullptr) { - + int numKeysInLevels(std::vector& metadata, + std::vector* keysperlevel = nullptr) { if (keysperlevel != nullptr) { keysperlevel->resize(numlevels_); } @@ -82,8 +82,7 @@ class DeleteFileTest : public DBTestBase { } fprintf(stderr, "level %d name %s smallest %s largest %s\n", metadata[i].level, metadata[i].name.c_str(), - metadata[i].smallestkey.c_str(), - metadata[i].largestkey.c_str()); + metadata[i].smallestkey.c_str(), metadata[i].largestkey.c_str()); } return numKeys; } @@ -214,7 +213,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { // this time, we keep an iterator alive Reopen(options); - Iterator *itr = nullptr; + Iterator* itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); ASSERT_OK(itr->status()); @@ -481,12 +480,12 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { } Status status = db_->DeleteFile(level2file); - fprintf(stdout, "Deletion status %s: %s\n", - level2file.c_str(), status.ToString().c_str()); + fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(), + status.ToString().c_str()); ASSERT_OK(status); it->SeekToFirst(); int numKeysIterated = 0; - while(it->Valid()) { + while (it->Valid()) { numKeysIterated++; it->Next(); } diff --git a/db/error_handler.cc b/db/error_handler.cc index 1df01267faa..7f68bb026c2 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -234,8 +234,8 @@ void ErrorHandler::CancelErrorRecovery() { // We'll release the lock before calling sfm, so make sure no new // recovery gets scheduled at that point auto_recovery_ = false; - SstFileManagerImpl* sfm = reinterpret_cast( - db_options_.sst_file_manager.get()); + SstFileManagerImpl* sfm = + reinterpret_cast(db_options_.sst_file_manager.get()); if (sfm) { // This may or may not cancel a pending recovery db_mutex_->Unlock(); @@ -292,8 +292,8 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, bool found = false; { - auto entry = ErrorSeverityMap.find(std::make_tuple(reason, bg_err.code(), - bg_err.subcode(), paranoid)); + auto entry = ErrorSeverityMap.find( + std::make_tuple(reason, bg_err.code(), 
bg_err.subcode(), paranoid)); if (entry != ErrorSeverityMap.end()) { sev = entry->second; found = true; @@ -301,8 +301,8 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, } if (!found) { - auto entry = DefaultErrorSeverityMap.find(std::make_tuple(reason, - bg_err.code(), paranoid)); + auto entry = DefaultErrorSeverityMap.find( + std::make_tuple(reason, bg_err.code(), paranoid)); if (entry != DefaultErrorSeverityMap.end()) { sev = entry->second; found = true; diff --git a/db/error_handler.h b/db/error_handler.h index e7c47b76370..34e08a525d7 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -26,100 +26,99 @@ struct DBRecoverContext { }; class ErrorHandler { - public: - ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, - InstrumentedMutex* db_mutex) - : db_(db), - db_options_(db_options), - cv_(db_mutex), - end_recovery_(false), - recovery_thread_(nullptr), - db_mutex_(db_mutex), - auto_recovery_(false), - recovery_in_prog_(false), - soft_error_no_bg_work_(false), - is_db_stopped_(false), - bg_error_stats_(db_options.statistics) { - // Clear the checked flag for uninitialized errors - bg_error_.PermitUncheckedError(); - recovery_error_.PermitUncheckedError(); - recovery_io_error_.PermitUncheckedError(); - } - - void EnableAutoRecovery() { auto_recovery_ = true; } - - Status::Severity GetErrorSeverity(BackgroundErrorReason reason, - Status::Code code, - Status::SubCode subcode); - - const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); - - Status GetBGError() const { return bg_error_; } - - Status GetRecoveryError() const { return recovery_error_; } - - Status ClearBGError(); - - bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); } - - bool IsBGWorkStopped() { - assert(db_mutex_); - db_mutex_->AssertHeld(); - return !bg_error_.ok() && - (bg_error_.severity() >= Status::Severity::kHardError || - !auto_recovery_ || soft_error_no_bg_work_); - } - - bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; } - - bool IsRecoveryInProgress() { return recovery_in_prog_; } - - Status RecoverFromBGError(bool is_manual = false); - void CancelErrorRecovery(); - - void EndAutoRecovery(); - - private: - DBImpl* db_; - const ImmutableDBOptions& db_options_; - Status bg_error_; - // A separate Status variable used to record any errors during the - // recovery process from hard errors - Status recovery_error_; - // A separate IO Status variable used to record any IO errors during - // the recovery process. At the same time, recovery_error_ is also set. - IOStatus recovery_io_error_; - // The condition variable used with db_mutex during auto resume for time - // wait. - InstrumentedCondVar cv_; - bool end_recovery_; - std::unique_ptr recovery_thread_; - - InstrumentedMutex* db_mutex_; - // A flag indicating whether automatic recovery from errors is enabled - bool auto_recovery_; - bool recovery_in_prog_; - // A flag to indicate that for the soft error, we should not allow any - // background work except the work is from recovery. - bool soft_error_no_bg_work_; - - // Used to store the context for recover, such as flush reason. - DBRecoverContext recover_context_; - std::atomic is_db_stopped_; - - // The pointer of DB statistics. 
- std::shared_ptr bg_error_stats_; - - const Status& HandleKnownErrors(const Status& bg_err, - BackgroundErrorReason reason); - Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); - void RecoverFromNoSpace(); - const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); - void RecoverFromRetryableBGIOError(); - // First, if it is in recovery and the recovery_error is ok. Set the - // recovery_error_ to bg_err. Second, if the severity is higher than the - // current bg_error_, overwrite it. - void CheckAndSetRecoveryAndBGError(const Status& bg_err); + public: + ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, + InstrumentedMutex* db_mutex) + : db_(db), + db_options_(db_options), + cv_(db_mutex), + end_recovery_(false), + recovery_thread_(nullptr), + db_mutex_(db_mutex), + auto_recovery_(false), + recovery_in_prog_(false), + soft_error_no_bg_work_(false), + is_db_stopped_(false), + bg_error_stats_(db_options.statistics) { + // Clear the checked flag for uninitialized errors + bg_error_.PermitUncheckedError(); + recovery_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); + } + + void EnableAutoRecovery() { auto_recovery_ = true; } + + Status::Severity GetErrorSeverity(BackgroundErrorReason reason, + Status::Code code, Status::SubCode subcode); + + const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); + + Status GetBGError() const { return bg_error_; } + + Status GetRecoveryError() const { return recovery_error_; } + + Status ClearBGError(); + + bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); } + + bool IsBGWorkStopped() { + assert(db_mutex_); + db_mutex_->AssertHeld(); + return !bg_error_.ok() && + (bg_error_.severity() >= Status::Severity::kHardError || + !auto_recovery_ || soft_error_no_bg_work_); + } + + bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; } + + bool IsRecoveryInProgress() { return recovery_in_prog_; } + + Status RecoverFromBGError(bool is_manual = false); + void CancelErrorRecovery(); + + void EndAutoRecovery(); + + private: + DBImpl* db_; + const ImmutableDBOptions& db_options_; + Status bg_error_; + // A separate Status variable used to record any errors during the + // recovery process from hard errors + Status recovery_error_; + // A separate IO Status variable used to record any IO errors during + // the recovery process. At the same time, recovery_error_ is also set. + IOStatus recovery_io_error_; + // The condition variable used with db_mutex during auto resume for time + // wait. + InstrumentedCondVar cv_; + bool end_recovery_; + std::unique_ptr recovery_thread_; + + InstrumentedMutex* db_mutex_; + // A flag indicating whether automatic recovery from errors is enabled + bool auto_recovery_; + bool recovery_in_prog_; + // A flag to indicate that for the soft error, we should not allow any + // background work except the work is from recovery. + bool soft_error_no_bg_work_; + + // Used to store the context for recover, such as flush reason. + DBRecoverContext recover_context_; + std::atomic is_db_stopped_; + + // The pointer of DB statistics. 
+ std::shared_ptr bg_error_stats_; + + const Status& HandleKnownErrors(const Status& bg_err, + BackgroundErrorReason reason); + Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); + void RecoverFromNoSpace(); + const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); + void RecoverFromRetryableBGIOError(); + // First, if it is in recovery and the recovery_error is ok. Set the + // recovery_error_ to bg_err. Second, if the severity is higher than the + // current bg_error_, overwrite it. + void CheckAndSetRecoveryAndBGError(const Status& bg_err); }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 6a5c93661ef..7987b8ec6a5 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -23,7 +23,7 @@ template inline T SafeDivide(T a, T b) { return b == 0 ? 0 : a / b; } -} // namespace +} // anonymous namespace void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) { *jwriter << "time_micros" diff --git a/db/event_helpers.h b/db/event_helpers.h index ad299670ff8..68d819fe6bd 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -39,9 +39,9 @@ class EventHelpers { const std::string& file_checksum, const std::string& file_checksum_func_name); static void LogAndNotifyTableFileDeletion( - EventLogger* event_logger, int job_id, - uint64_t file_number, const std::string& file_path, - const Status& status, const std::string& db_name, + EventLogger* event_logger, int job_id, uint64_t file_number, + const std::string& file_path, const Status& status, + const std::string& db_name, const std::vector>& listeners); static void NotifyOnErrorRecoveryEnd( const std::vector>& listeners, diff --git a/db/experimental.cc b/db/experimental.cc index d838ebde598..20b5daa970b 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -112,8 +112,9 @@ Status UpdateManifestForFilesState( lf->smallest, lf->largest, lf->fd.smallest_seqno, lf->fd.largest_seqno, lf->marked_for_compaction, temp, lf->oldest_blob_file_number, lf->oldest_ancester_time, - lf->file_creation_time, lf->file_checksum, - lf->file_checksum_func_name, lf->unique_id); + lf->file_creation_time, lf->epoch_number, lf->file_checksum, + lf->file_checksum_func_name, lf->unique_id, + lf->compensated_range_deletion_size); } } } else { diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 665c89869e2..c12313a4ffd 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -694,6 +694,7 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) { bool verify_checksums_before_ingest = std::get<1>(GetParam()); do { Options options = CurrentOptions(); + options.disable_auto_compactions = true; DestroyAndReopen(options); std::map true_data; @@ -800,6 +801,7 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) { bool verify_checksums_before_ingest = std::get<1>(GetParam()); do { Options options = CurrentOptions(); + options.disable_auto_compactions = true; options.merge_operator.reset(new TestPutOperator()); DestroyAndReopen(options); std::map true_data; @@ -927,6 +929,7 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) { bool verify_checksums_before_ingest = std::get<1>(GetParam()); do { Options options = CurrentOptions(); + options.disable_auto_compactions = true; options.merge_operator.reset(new TestPutOperator()); DestroyAndReopen(options); std::map true_data; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 
5a9d15388b9..80fd82dab98 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -106,9 +106,8 @@ Status ExternalSstFileIngestionJob::Prepare( for (IngestedFileInfo& f : files_to_ingest_) { f.copy_file = false; const std::string path_outside_db = f.external_file_path; - const std::string path_inside_db = - TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(), - f.fd.GetPathId()); + const std::string path_inside_db = TableFileName( + cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); if (ingestion_options_.move_files) { status = fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); @@ -472,14 +471,90 @@ Status ExternalSstFileIngestionJob::Run() { f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, - oldest_ancester_time, current_time, f.file_checksum, - f.file_checksum_func_name, f.unique_id); + oldest_ancester_time, current_time, + ingestion_options_.ingest_behind + ? kReservedEpochNumberForFileIngestedBehind + : cfd_->NewEpochNumber(), + f.file_checksum, f.file_checksum_func_name, f.unique_id, 0); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } + + CreateEquivalentFileIngestingCompactions(); return status; } +void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() { + // A map from output level to input of compactions equivalent to this + // ingestion job. + // TODO: simplify below logic to creating compaction per ingested file + // instead of per output level, once we figure out how to treat ingested files + // with adjacent range deletion tombstones to same output level in the same + // job as non-overlapping compactions. 
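+  //
+  // Editorial note: the Compaction objects built below are never executed;
+  // they exist only so the key ranges of the ingested files can be
+  // registered with the compaction picker (see RegisterRange()) and
+  // conflicts with other ongoing compactions/ingestions can be detected.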
+  std::map<int, CompactionInputFiles>
+      output_level_to_file_ingesting_compaction_input;
+
+  for (const auto& pair : edit_.GetNewFiles()) {
+    int output_level = pair.first;
+    const FileMetaData& f_metadata = pair.second;
+
+    CompactionInputFiles& input =
+        output_level_to_file_ingesting_compaction_input[output_level];
+    if (input.files.empty()) {
+      // Treat the source level of ingested files to be level 0
+      input.level = 0;
+    }
+
+    compaction_input_metdatas_.push_back(new FileMetaData(f_metadata));
+    input.files.push_back(compaction_input_metdatas_.back());
+  }
+
+  for (const auto& pair : output_level_to_file_ingesting_compaction_input) {
+    int output_level = pair.first;
+    const CompactionInputFiles& input = pair.second;
+
+    const auto& mutable_cf_options = *(cfd_->GetLatestMutableCFOptions());
+    file_ingesting_compactions_.push_back(new Compaction(
+        cfd_->current()->storage_info(), *cfd_->ioptions(), mutable_cf_options,
+        mutable_db_options_, {input}, output_level,
+        MaxFileSizeForLevel(
+            mutable_cf_options, output_level,
+            cfd_->ioptions()->compaction_style) /* output file size limit,
+                                                 * not applicable */
+        ,
+        LLONG_MAX /* max compaction bytes, not applicable */,
+        0 /* output path ID, not applicable */, mutable_cf_options.compression,
+        mutable_cf_options.compression_opts, Temperature::kUnknown,
+        0 /* max_subcompaction, not applicable */,
+        {} /* grandparents, not applicable */, false /* is manual */,
+        "" /* trim_ts */, -1 /* score, not applicable */,
+        false /* is deletion compaction, not applicable */,
+        files_overlap_ /* l0_files_might_overlap, not applicable */,
+        CompactionReason::kExternalSstIngestion));
+  }
+}
+
+void ExternalSstFileIngestionJob::RegisterRange() {
+  for (const auto& c : file_ingesting_compactions_) {
+    cfd_->compaction_picker()->RegisterCompaction(c);
+  }
+}
+
+void ExternalSstFileIngestionJob::UnregisterRange() {
+  for (const auto& c : file_ingesting_compactions_) {
+    cfd_->compaction_picker()->UnregisterCompaction(c);
+    delete c;
+  }
+  file_ingesting_compactions_.clear();
+
+  for (const auto& f : compaction_input_metdatas_) {
+    delete f;
+  }
+  compaction_input_metdatas_.clear();
+}
+
 void ExternalSstFileIngestionJob::UpdateStats() {
   // Update internal stats for new ingested files
   uint64_t total_keys = 0;
@@ -493,7 +568,8 @@ void ExternalSstFileIngestionJob::UpdateStats() {
 
   stream.StartArray();
   for (IngestedFileInfo& f : files_to_ingest_) {
-    InternalStats::CompactionStats stats(CompactionReason::kExternalSstIngestion, 1);
+    InternalStats::CompactionStats stats(
+        CompactionReason::kExternalSstIngestion, 1);
     stats.micros = total_time;
     // If actual copy occurred for this file, then we need to count the file
    // size as the actual bytes written.
If the file was linked, then we ignore @@ -593,8 +669,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( std::unique_ptr sst_file; std::unique_ptr sst_file_reader; - status = fs_->NewRandomAccessFile(external_file, env_options_, - &sst_file, nullptr); + status = + fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr); if (!status.ok()) { return status; } @@ -660,9 +736,9 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( assert(seqno_iter == uprops.end()); file_to_ingest->original_seqno = 0; if (ingestion_options_.allow_blocking_flush || - ingestion_options_.allow_global_seqno) { + ingestion_options_.allow_global_seqno) { return Status::InvalidArgument( - "External SST file V1 does not support global seqno"); + "External SST file V1 does not support global seqno"); } } else { return Status::InvalidArgument("External file version is not supported"); @@ -797,8 +873,16 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( if (lvl > 0 && lvl < vstorage->base_level()) { continue; } - - if (vstorage->NumLevelFiles(lvl) > 0) { + if (cfd_->RangeOverlapWithCompaction( + file_to_ingest->smallest_internal_key.user_key(), + file_to_ingest->largest_internal_key.user_key(), lvl)) { + // We must use L0 or any level higher than `lvl` to be able to overwrite + // the compaction output keys that we overlap with in this level, We also + // need to assign this file a seqno to overwrite the compaction output + // keys in level `lvl` + overlap_with_db = true; + break; + } else if (vstorage->NumLevelFiles(lvl) > 0) { bool overlap_with_level = false; status = sv->current->OverlapWithLevelIterator( ro, env_options_, file_to_ingest->smallest_internal_key.user_key(), @@ -855,11 +939,12 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( target_level < cfd_->NumberLevels() - 1) { status = Status::TryAgain( "Files cannot be ingested to Lmax. 
Please make sure key range of Lmax "
+        "and ongoing compaction's output to Lmax "
         "does not overlap with files to ingest.");
     return status;
   }
-   TEST_SYNC_POINT_CALLBACK(
+  TEST_SYNC_POINT_CALLBACK(
       "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
       &overlap_with_db);
   file_to_ingest->picked_level = target_level;
@@ -872,22 +957,22 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
 Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
     IngestedFileInfo* file_to_ingest) {
   auto* vstorage = cfd_->current()->storage_info();
-  // first check if new files fit in the bottommost level
+  // First, check if new files fit in the bottommost level
   int bottom_lvl = cfd_->NumberLevels() - 1;
-  if(!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+  if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
     return Status::InvalidArgument(
-      "Can't ingest_behind file as it doesn't fit "
-      "at the bottommost level!");
+        "Can't ingest_behind file as it doesn't fit "
+        "at the bottommost level!");
   }
 
-  // second check if despite allow_ingest_behind=true we still have 0 seqnums
+  // Second, check if despite allow_ingest_behind=true we still have 0 seqnums
   // at some upper level
   for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
     for (auto file : vstorage->LevelFiles(lvl)) {
       if (file->fd.smallest_seqno == 0) {
         return Status::InvalidArgument(
-          "Can't ingest_behind file as despite allow_ingest_behind=true "
-          "there are files with 0 seqno in database at upper levels!");
+            "Can't ingest_behind file as despite allow_ingest_behind=true "
+            "there are files with 0 seqno in database at upper levels!");
       }
     }
   }
@@ -914,9 +999,8 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
   // If the file system does not support random write, then we should not.
   // Otherwise we should.
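  // (Editorial sketch, not part of this patch: when random write is
  // supported, the pre-computed global_seqno_offset of the ingested file is
  // patched in place, roughly:
  //
  //   std::string seqno_val;
  //   PutFixed64(&seqno_val, file_to_ingest->assigned_seqno);
  //   s = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val,
  //                     IOOptions(), nullptr);
  // )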
std::unique_ptr rwfile; - Status status = - fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_, - &rwfile, nullptr); + Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path, + env_options_, &rwfile, nullptr); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile", &status); if (status.ok()) { @@ -997,14 +1081,8 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( // add it to this level return false; } - if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key, - file_largest_user_key, level)) { - // File overlap with a running compaction output that will be stored - // in this level, we cannot add this file to this level - return false; - } - // File did not overlap with level files, our compaction output + // File did not overlap with level files, nor compaction output return true; } diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index ce50ae86da1..49bb1e31e59 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -11,6 +11,7 @@ #include "db/column_family.h" #include "db/internal_stats.h" #include "db/snapshot_impl.h" +#include "db/version_edit.h" #include "env/file_system_tracer.h" #include "logging/event_logger.h" #include "options/db_options.h" @@ -78,7 +79,8 @@ class ExternalSstFileIngestionJob { public: ExternalSstFileIngestionJob( VersionSet* versions, ColumnFamilyData* cfd, - const ImmutableDBOptions& db_options, const EnvOptions& env_options, + const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, const IngestExternalFileOptions& ingestion_options, Directories* directories, EventLogger* event_logger, @@ -88,6 +90,7 @@ class ExternalSstFileIngestionJob { versions_(versions), cfd_(cfd), db_options_(db_options), + mutable_db_options_(mutable_db_options), env_options_(env_options), db_snapshots_(db_snapshots), ingestion_options_(ingestion_options), @@ -99,6 +102,17 @@ class ExternalSstFileIngestionJob { assert(directories != nullptr); } + ~ExternalSstFileIngestionJob() { + for (const auto& c : file_ingesting_compactions_) { + cfd_->compaction_picker()->UnregisterCompaction(c); + delete c; + } + + for (const auto& f : compaction_input_metdatas_) { + delete f; + } + } + // Prepare the job by copying external files into the DB. Status Prepare(const std::vector& external_files_paths, const std::vector& files_checksums, @@ -120,6 +134,15 @@ class ExternalSstFileIngestionJob { // REQUIRES: Mutex held Status Run(); + // Register key range involved in this ingestion job + // to prevent key range conflict with other ongoing compaction/file ingestion + // REQUIRES: Mutex held + void RegisterRange(); + + // Unregister key range registered for this ingestion job + // REQUIRES: Mutex held + void UnregisterRange(); + // Update column family stats. // REQUIRES: Mutex held void UpdateStats(); @@ -175,11 +198,17 @@ class ExternalSstFileIngestionJob { template Status SyncIngestedFile(TWritableFile* file); + // Create equivalent `Compaction` objects to this file ingestion job + // , which will be used to check range conflict with other ongoing + // compactions. 
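+  // Ownership note (editorial): the Compaction and FileMetaData objects it
+  // creates are owned by this job and are released either in
+  // UnregisterRange() or, if the job is abandoned first, in the destructor
+  // above.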
+ void CreateEquivalentFileIngestingCompactions(); + SystemClock* clock_; FileSystemPtr fs_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; + const MutableDBOptions& mutable_db_options_; const EnvOptions& env_options_; SnapshotList* db_snapshots_; autovector files_to_ingest_; @@ -196,6 +225,14 @@ class ExternalSstFileIngestionJob { // file_checksum_gen_factory is set, DB will generate checksum each file. bool need_generate_file_checksum_{true}; std::shared_ptr io_tracer_; + + // Below are variables used in (un)registering range for this ingestion job + // + // FileMetaData used in inputs of compactions equivalent to this ingestion + // job + std::vector compaction_input_metdatas_; + // Compactions equivalent to this ingestion job + std::vector file_ingesting_compactions_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index d3d22bccbf7..edbed9e9eba 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -301,7 +301,8 @@ TEST_F(ExternalSSTFileTest, Basic) { SstFileWriter sst_file_writer(EnvOptions(), options); - // Current file size should be 0 after sst_file_writer init and before open a file. + // Current file size should be 0 after sst_file_writer init and before open + // a file. ASSERT_EQ(sst_file_writer.FileSize(), 0); // file1.sst (0 => 99) @@ -972,7 +973,7 @@ TEST_F(ExternalSSTFileTest, MultiThreaded) { do { Options options = CurrentOptions(); - + options.disable_auto_compactions = true; std::atomic thread_num(0); std::function write_file_func = [&]() { int file_idx = thread_num.fetch_add(1); @@ -1248,8 +1249,9 @@ TEST_P(ExternalSSTFileTest, PickedLevel) { // This file overlaps with file 0 (L3), file 1 (L2) and the // output of compaction going to L1 - ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true, - false, false, &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, + true /* allow_global_seqno */, false, + true, false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "5,0,1,1"); // This file does not overlap with any file or with the running compaction @@ -1269,106 +1271,6 @@ TEST_P(ExternalSSTFileTest, PickedLevel) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(ExternalSSTFileTest, PickedLevelBug) { - env_->skip_fsync_ = true; - Options options = CurrentOptions(); - options.disable_auto_compactions = false; - options.level0_file_num_compaction_trigger = 3; - options.num_levels = 2; - DestroyAndReopen(options); - - std::vector file_keys; - - // file #1 in L0 - file_keys = {0, 5, 7}; - for (int k : file_keys) { - ASSERT_OK(Put(Key(k), Key(k))); - } - ASSERT_OK(Flush()); - - // file #2 in L0 - file_keys = {4, 6, 8, 9}; - for (int k : file_keys) { - ASSERT_OK(Put(Key(k), Key(k))); - } - ASSERT_OK(Flush()); - - // We have 2 overlapping files in L0 - EXPECT_EQ(FilesPerLevel(), "2"); - - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", - "ExternalSSTFileTest::PickedLevelBug:0"}, - {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"}, - {"ExternalSSTFileTest::PickedLevelBug:2", - "DBImpl::RunManualCompaction:0"}, - {"ExternalSSTFileTest::PickedLevelBug:3", - "DBImpl::RunManualCompaction:1"}}); - - std::atomic bg_compact_started(false); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:Start", - [&](void* 
/*arg*/) { bg_compact_started.store(true); }); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Status bg_compact_status; - Status bg_addfile_status; - - { - // While writing the MANIFEST start a thread that will ask for compaction - ThreadGuard bg_compact(port::Thread([&]() { - bg_compact_status = - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - })); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); - - // Start a thread that will ingest a new file - ThreadGuard bg_addfile(port::Thread([&]() { - file_keys = {1, 2, 3}; - bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1); - })); - - // Wait for AddFile to start picking levels and writing MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); - - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); - - // We need to verify that no compactions can run while AddFile is - // ingesting the files into the levels it find suitable. So we will - // wait for 2 seconds to give a chance for compactions to run during - // this period, and then make sure that no compactions where able to run - env_->SleepForMicroseconds(1000000 * 2); - bool bg_compact_started_tmp = bg_compact_started.load(); - - // Hold AddFile from finishing writing the MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); - - // check the status at the end, so even if the ASSERT fails the threads - // could be joined and return. - ASSERT_FALSE(bg_compact_started_tmp); - } - - ASSERT_OK(bg_addfile_status); - ASSERT_OK(bg_compact_status); - - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - - int total_keys = 0; - Iterator* iter = db_->NewIterator(ReadOptions()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - total_keys++; - } - ASSERT_EQ(total_keys, 10); - - delete iter; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); -} - TEST_F(ExternalSSTFileTest, IngestNonExistingFile) { Options options = CurrentOptions(); DestroyAndReopen(options); @@ -1419,7 +1321,8 @@ TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { int range_id = 0; std::vector file_keys; std::function bg_addfile = [&]() { - ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id)); + ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id, + true /* allow_global_seqno */)); }; const int num_of_ranges = 1000; @@ -1502,8 +1405,9 @@ TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { // This file overlaps with the output of the compaction (going to L3) // so the file will be added to L0 since L3 is the base level - ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false, - false, true, false, false, &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, + true /* allow_global_seqno */, false, + true, false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "5"); // This file does not overlap with the current running compactiong @@ -1641,14 +1545,15 @@ TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():Start", [&](void* /*arg*/) { - // fit in L3 but will overlap with compaction so will be added - // to L2 but a compaction will trivially move it to L3 - // and break LSM consistency + // Fit in L3 but will overlap with the compaction output so will be + // added to L2. 
Prior to the fix, a compaction will then trivially move + // this file to L3 and break LSM consistency static std::atomic called = {false}; if (!called) { called = true; ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}})); - ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7)); + ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7, + true /* allow_global_seqno */)); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -2318,7 +2223,6 @@ TEST_F(ExternalSSTFileTest, SkipBloomFilter) { table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - // Create external SST file and include bloom filters options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); DestroyAndReopen(options); diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index b2c7870d972..ddd4b47cc59 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -338,8 +338,7 @@ class FaultInjectionTest FaultInjectionTest::kValExpectNoError)); } - void NoWriteTestPreFault() { - } + void NoWriteTestPreFault() {} void NoWriteTestReopenWithFault(ResetMethod reset_method) { CloseDB(); diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 523cb3c1602..608f1cb28da 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -8,8 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/file_indexer.h" + #include #include + #include "db/version_edit.h" #include "rocksdb/comparator.h" diff --git a/db/file_indexer.h b/db/file_indexer.h index fd889b03146..45cb1361501 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -12,6 +12,7 @@ #include #include #include + #include "memory/arena.h" #include "port/port.h" #include "util/autovector.h" @@ -66,7 +67,7 @@ class FileIndexer { struct IndexUnit { IndexUnit() - : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {} + : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {} // During file search, a key is compared against smallest and largest // from a FileMetaData. It can have 3 possible outcomes: // (1) key is smaller than smallest, implying it is also smaller than diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 99ce9399368..5c82189ef95 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -8,7 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/file_indexer.h" + #include + #include "db/dbformat.h" #include "db/version_edit.h" #include "port/stack_trace.h" @@ -73,8 +75,8 @@ class FileIndexerTest : public testing::Test { } void GetNextLevelIndex(const uint32_t level, const uint32_t file_index, - const int cmp_smallest, const int cmp_largest, int32_t* left_index, - int32_t* right_index) { + const int cmp_smallest, const int cmp_largest, + int32_t* left_index, int32_t* right_index) { *left_index = 100; *right_index = 100; indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest, diff --git a/db/filename_test.cc b/db/filename_test.cc index bba275dc07b..04c81b33330 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -69,35 +69,33 @@ TEST_F(FileNameTest, Parse) { } // Errors - static const char* errors[] = { - "", - "foo", - "foo-dx-100.log", - ".log", - "", - "manifest", - "CURREN", - "CURRENTX", - "MANIFES", - "MANIFEST", - "MANIFEST-", - "XMANIFEST-3", - "MANIFEST-3x", - "META", - "METADB", - "METADB-", - "XMETADB-3", - "METADB-3x", - "LOC", - "LOCKx", - "LO", - "LOGx", - "18446744073709551616.log", - "184467440737095516150.log", - "100", - "100.", - "100.lop" - }; + static const char* errors[] = {"", + "foo", + "foo-dx-100.log", + ".log", + "", + "manifest", + "CURREN", + "CURRENTX", + "MANIFES", + "MANIFEST", + "MANIFEST-", + "XMANIFEST-3", + "MANIFEST-3x", + "META", + "METADB", + "METADB-", + "XMETADB-3", + "METADB-3x", + "LOC", + "LOCKx", + "LO", + "LOGx", + "18446744073709551616.log", + "184467440737095516150.log", + "100", + "100.", + "100.lop"}; for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { std::string f = errors[i]; ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; diff --git a/db/flush_job.cc b/db/flush_job.cc index 9598f42235a..649e1e1fe1b 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -23,6 +23,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" #include "db/version_set.h" #include "file/file_util.h" #include "file/filename.h" @@ -48,7 +49,7 @@ namespace ROCKSDB_NAMESPACE { -const char* GetFlushReasonString (FlushReason flush_reason) { +const char* GetFlushReasonString(FlushReason flush_reason) { switch (flush_reason) { case FlushReason::kOthers: return "Other Reasons"; @@ -90,7 +91,7 @@ FlushJob::FlushJob( std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, SnapshotChecker* snapshot_checker, JobContext* job_context, - LogBuffer* log_buffer, FSDirectory* db_directory, + FlushReason flush_reason, LogBuffer* log_buffer, FSDirectory* db_directory, FSDirectory* output_file_directory, CompressionType output_compression, Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, @@ -113,6 +114,7 @@ FlushJob::FlushJob( earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), job_context_(job_context), + flush_reason_(flush_reason), log_buffer_(log_buffer), db_directory_(db_directory), output_file_directory_(output_file_directory), @@ -136,17 +138,14 @@ FlushJob::FlushJob( TEST_SYNC_POINT("FlushJob::FlushJob()"); } -FlushJob::~FlushJob() { - ThreadStatusUtil::ResetThreadStatus(); -} +FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); } void FlushJob::ReportStartedFlush() { ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, db_options_.enable_thread_tracking); 
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); - ThreadStatusUtil::SetThreadOperationProperty( - ThreadStatus::COMPACTION_JOB_ID, - job_context_->job_id); + ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, + job_context_->job_id); IOSTATS_RESET(bytes_written); } @@ -156,8 +155,7 @@ void FlushJob::ReportFlushInputSize(const autovector& mems) { input_size += mem->ApproximateMemoryUsage(); } ThreadStatusUtil::IncreaseThreadOperationProperty( - ThreadStatus::FLUSH_BYTES_MEMTABLES, - input_size); + ThreadStatus::FLUSH_BYTES_MEMTABLES, input_size); } void FlushJob::RecordFlushIOStats() { @@ -207,6 +205,7 @@ void FlushJob::PickMemTable() { // path 0 for level 0 file. meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + meta_.epoch_number = cfd_->NewEpochNumber(); base_ = cfd_->current(); base_->Ref(); // it is likely that we do not need this reference @@ -224,8 +223,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, double mempurge_threshold = mutable_cf_options_.experimental_mempurge_threshold; - AutoThreadOperationStageUpdater stage_run( - ThreadStatus::STAGE_FLUSH_RUN); + AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN); if (mems_.empty()) { ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush", cfd_->GetName().c_str()); @@ -252,9 +250,8 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, } Status mempurge_s = Status::NotFound("No MemPurge."); if ((mempurge_threshold > 0.0) && - (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) && - (!mems_.empty()) && MemPurgeDecider(mempurge_threshold) && - !(db_options_.atomic_flush)) { + (flush_reason_ == FlushReason::kWriteBufferFull) && (!mems_.empty()) && + MemPurgeDecider(mempurge_threshold) && !(db_options_.atomic_flush)) { cfd_->SetMempurgeUsed(); mempurge_s = MemPurge(); if (!mempurge_s.ok()) { @@ -891,7 +888,7 @@ Status FlushJob::WriteLevel0Table() { << total_num_deletes << "total_data_size" << total_data_size << "memory_usage" << total_memory_usage << "flush_reason" - << GetFlushReasonString(cfd_->GetFlushReason()); + << GetFlushReasonString(flush_reason_); { ScopedArenaIterator iter( @@ -916,8 +913,7 @@ Status FlushJob::WriteLevel0Table() { } const uint64_t current_time = static_cast(_current_time); - uint64_t oldest_key_time = - mems_.front()->ApproximateOldestKeyTime(); + uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime(); // It's not clear whether oldest_key_time is always available. In case // it is not available, use current_time. 
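Editorial aside: the recurring change in this file pins the flush reason at
job creation (the new `flush_reason_` member) instead of re-reading
`cfd_->GetFlushReason()` at each use site, so a job keeps reporting the reason
it was created for even if the column family has since started another flush
for a different reason. A minimal before/after sketch, using names from this
diff:

  // before: reason re-read at each use site, racy across concurrent flushes
  info->flush_reason = cfd_->GetFlushReason();
  // after: reason pinned when the FlushJob is constructed
  info->flush_reason = flush_reason_;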
@@ -955,7 +951,7 @@ Status FlushJob::WriteLevel0Table() { cfd_->internal_stats(), &io_s, io_tracer_, BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_, job_context_->job_id, io_priority, &table_properties_, write_hint, - full_history_ts_low, blob_callback_, &num_input_entries, + full_history_ts_low, blob_callback_, base_, &num_input_entries, &memtable_payload_bytes, &memtable_garbage_bytes); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); @@ -1015,9 +1011,9 @@ Status FlushJob::WriteLevel0Table() { meta_.fd.smallest_seqno, meta_.fd.largest_seqno, meta_.marked_for_compaction, meta_.temperature, meta_.oldest_blob_file_number, meta_.oldest_ancester_time, - meta_.file_creation_time, meta_.file_checksum, - meta_.file_checksum_func_name, meta_.unique_id); - + meta_.file_creation_time, meta_.epoch_number, + meta_.file_checksum, meta_.file_checksum_func_name, + meta_.unique_id, meta_.compensated_range_deletion_size); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } #ifndef ROCKSDB_LITE @@ -1090,7 +1086,7 @@ std::unique_ptr FlushJob::GetFlushJobInfo() const { info->smallest_seqno = meta_.fd.smallest_seqno; info->largest_seqno = meta_.fd.largest_seqno; info->table_properties = table_properties_; - info->flush_reason = cfd_->GetFlushReason(); + info->flush_reason = flush_reason_; info->blob_compression_type = mutable_cf_options_.blob_compression_type; // Update BlobFilesInfo. diff --git a/db/flush_job.h b/db/flush_job.h index 60c272aec3b..062ef299760 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -67,8 +67,8 @@ class FlushJob { std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, SnapshotChecker* snapshot_checker, JobContext* job_context, - LogBuffer* log_buffer, FSDirectory* db_directory, - FSDirectory* output_file_directory, + FlushReason flush_reason, LogBuffer* log_buffer, + FSDirectory* db_directory, FSDirectory* output_file_directory, CompressionType output_compression, Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, @@ -150,6 +150,7 @@ class FlushJob { SequenceNumber earliest_write_conflict_snapshot_; SnapshotChecker* snapshot_checker_; JobContext* job_context_; + FlushReason flush_reason_; LogBuffer* log_buffer_; FSDirectory* db_directory_; FSDirectory* output_file_directory_; diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index c3275b85611..003a1a6570c 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -164,15 +164,15 @@ TEST_F(FlushJobTest, Empty) { auto cfd = versions_->GetColumnFamilySet()->GetDefault(); EventLogger event_logger(db_options_.info_log.get()); SnapshotChecker* snapshot_checker = nullptr; // not relavant - FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), - db_options_, *cfd->GetLatestMutableCFOptions(), - std::numeric_limits::max() /* memtable_id */, - env_options_, versions_.get(), &mutex_, &shutting_down_, - {}, kMaxSequenceNumber, snapshot_checker, &job_context, - nullptr, nullptr, nullptr, kNoCompression, nullptr, - &event_logger, false, true /* sync_output_directory */, - true /* write_manifest */, Env::Priority::USER, - nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); + FlushJob flush_job( + dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, + *cfd->GetLatestMutableCFOptions(), + std::numeric_limits::max() /* memtable_id */, env_options_, + versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + 
snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, nullptr, &event_logger, false, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); { InstrumentedMutexLock l(&mutex_); flush_job.PickMemTable(); @@ -214,8 +214,8 @@ TEST_F(FlushJobTest, NonEmpty) { // Note: the first two blob references will not be considered when resolving // the oldest blob file referenced (the first one is inlined TTL, while the // second one is TTL and thus points to a TTL blob file). - constexpr std::array blob_file_numbers{{ - kInvalidBlobFileNumber, 5, 103, 17, 102, 101}}; + constexpr std::array blob_file_numbers{ + {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}}; for (size_t i = 0; i < blob_file_numbers.size(); ++i) { std::string key(std::to_string(i + 10001)); std::string blob_index; @@ -255,9 +255,9 @@ TEST_F(FlushJobTest, NonEmpty) { *cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, true /* write_manifest */, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); HistogramData hist; @@ -318,9 +318,9 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, true /* write_manifest */, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); HistogramData hist; FileMetaData file_meta; @@ -391,8 +391,8 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, - &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, + &job_context, FlushReason::kTest, nullptr, nullptr, nullptr, + kNoCompression, db_options_.statistics.get(), &event_logger, true, false /* sync_output_directory */, false /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_)); @@ -520,9 +520,9 @@ TEST_F(FlushJobTest, Snapshots) { *cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, true /* write_manifest */, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, 
nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); mutex_.Lock(); flush_job.PickMemTable(); @@ -576,9 +576,9 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, true /* write_manifest */, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_); // When the state from WriteController is normal. @@ -656,9 +656,9 @@ TEST_F(FlushJobTimestampTest, AllKeysExpired) { dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, true /* write_manifest */, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_, /*db_id=*/"", /*db_session_id=*/"", full_history_ts_low); @@ -709,9 +709,9 @@ TEST_F(FlushJobTimestampTest, NoKeyExpired) { dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, - snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, - db_options_.statistics.get(), &event_logger, true, - true /* sync_output_directory */, true /* write_manifest */, + snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr, + nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, + true, true /* sync_output_directory */, true /* write_manifest */, Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_, /*db_id=*/"", /*db_session_id=*/"", full_history_ts_low); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 13a94cb813f..3fbc2cf4706 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -104,9 +104,7 @@ class ForwardLevelIterator : public InternalIterator { status_ = Status::NotSupported("ForwardLevelIterator::Prev()"); valid_ = false; } - bool Valid() const override { - return valid_; - } + bool Valid() const override { return valid_; } void SeekToFirst() override { assert(file_iter_ != nullptr); if (!status_.ok()) { @@ -249,9 +247,7 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, immutable_status_.PermitUncheckedError(); } -ForwardIterator::~ForwardIterator() { - Cleanup(true); -} +ForwardIterator::~ForwardIterator() { Cleanup(true); } void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv, bool background_purge_on_iterator_cleanup) { @@ -284,13 +280,13 
@@ struct SVCleanupParams { SuperVersion* sv; bool background_purge_on_iterator_cleanup; }; -} +} // anonymous namespace // Used in PinnedIteratorsManager to release pinned SuperVersion void ForwardIterator::DeferredSVCleanup(void* arg) { auto d = reinterpret_cast(arg); - ForwardIterator::SVCleanup( - d->db, d->sv, d->background_purge_on_iterator_cleanup); + ForwardIterator::SVCleanup(d->db, d->sv, + d->background_purge_on_iterator_cleanup); delete d; } @@ -547,8 +543,7 @@ void ForwardIterator::Next() { assert(valid_); bool update_prev_key = false; - if (sv_ == nullptr || - sv_->version_number != cfd_->GetSuperVersionNumber()) { + if (sv_ == nullptr || sv_->version_number != cfd_->GetSuperVersionNumber()) { std::string current_key = key().ToString(); Slice old_key(current_key.data(), current_key.size()); @@ -578,7 +573,6 @@ void ForwardIterator::Next() { update_prev_key = true; } - if (update_prev_key) { prev_key_.SetInternalKey(current_->key()); is_prev_set_ = true; @@ -635,7 +629,7 @@ bool ForwardIterator::PrepareValue() { assert(!current_->Valid()); assert(!current_->status().ok()); - assert(current_ != mutable_iter_); // memtable iterator can't fail + assert(current_ != mutable_iter_); // memtable iterator can't fail assert(immutable_status_.ok()); valid_ = false; @@ -950,11 +944,11 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { } Slice prev_key = prev_key_.GetInternalKey(); if (prefix_extractor_ && prefix_extractor_->Transform(target).compare( - prefix_extractor_->Transform(prev_key)) != 0) { + prefix_extractor_->Transform(prev_key)) != 0) { return true; } if (cfd_->internal_comparator().InternalKeyComparator::Compare( - prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) { + prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) { return true; } @@ -963,8 +957,8 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { return false; } if (cfd_->internal_comparator().InternalKeyComparator::Compare( - target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() - : current_->key()) > 0) { + target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() + : current_->key()) > 0) { return true; } return false; @@ -1040,11 +1034,11 @@ uint32_t ForwardIterator::FindFileInRange( uint32_t left, uint32_t right) { auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool { return cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), k) < 0; + f->largest.Encode(), k) < 0; }; - const auto &b = files.begin(); - return static_cast(std::lower_bound(b + left, - b + right, internal_key, cmp) - b); + const auto& b = files.begin(); + return static_cast( + std::lower_bound(b + left, b + right, internal_key, cmp) - b); } void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) { diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 21cbd70017f..5a5c6f0f376 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -7,9 +7,9 @@ #include "rocksdb/comparator.h" #ifndef ROCKSDB_LITE +#include #include #include -#include #include "memory/arena.h" #include "rocksdb/db.h" @@ -35,6 +35,7 @@ class MinIterComparator { bool operator()(InternalIterator* a, InternalIterator* b) { return comparator_->Compare(a->key(), b->key()) > 0; } + private: const CompareInterface* comparator_; }; @@ -92,8 +93,8 @@ class ForwardIterator : public InternalIterator { // either done immediately or deferred until this iterator is unpinned by // PinnedIteratorsManager. 
void SVCleanup(); - static void SVCleanup( - DBImpl* db, SuperVersion* sv, bool background_purge_on_iterator_cleanup); + static void SVCleanup(DBImpl* db, SuperVersion* sv, + bool background_purge_on_iterator_cleanup); static void DeferredSVCleanup(void* arg); void RebuildIterators(bool refresh_sv); @@ -107,9 +108,9 @@ class ForwardIterator : public InternalIterator { void UpdateCurrent(); bool NeedToSeekImmutable(const Slice& internal_key); void DeleteCurrentIter(); - uint32_t FindFileInRange( - const std::vector& files, const Slice& internal_key, - uint32_t left, uint32_t right); + uint32_t FindFileInRange(const std::vector& files, + const Slice& internal_key, uint32_t left, + uint32_t right); bool IsOverUpperBound(const Slice& internal_key) const; diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index f03c734d63a..325661cef34 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -14,6 +14,7 @@ int main() { int main() { return 0; } #else #include + #include #include #include @@ -281,8 +282,9 @@ struct StatsThread { } auto now = std::chrono::steady_clock::now(); double elapsed = - std::chrono::duration_cast >( - now - tlast).count(); + std::chrono::duration_cast >(now - + tlast) + .count(); uint64_t w = ::stats.written.load(); uint64_t r = ::stats.read.load(); fprintf(stderr, diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 0832ff571a6..17ad044a7e7 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -4,6 +4,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "db/version_builder.h" #ifndef ROCKSDB_LITE #include "db/import_column_family_job.h" @@ -45,40 +46,6 @@ Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, auto num_files = files_to_import_.size(); if (num_files == 0) { return Status::InvalidArgument("The list of files is empty"); - } else if (num_files > 1) { - // Verify that passed files don't have overlapping ranges in any particular - // level. - int min_level = 1; // Check for overlaps in Level 1 and above. - int max_level = -1; - for (const auto& file_metadata : metadata_) { - if (file_metadata.level > max_level) { - max_level = file_metadata.level; - } - } - for (int level = min_level; level <= max_level; ++level) { - autovector sorted_files; - for (size_t i = 0; i < num_files; i++) { - if (metadata_[i].level == level) { - sorted_files.push_back(&files_to_import_[i]); - } - } - - std::sort( - sorted_files.begin(), sorted_files.end(), - [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { - return cfd_->internal_comparator().Compare( - info1->smallest_internal_key, - info2->smallest_internal_key) < 0; - }); - - for (size_t i = 0; i + 1 < sorted_files.size(); i++) { - if (cfd_->internal_comparator().Compare( - sorted_files[i]->largest_internal_key, - sorted_files[i + 1]->smallest_internal_key) >= 0) { - return Status::InvalidArgument("Files have overlapping ranges"); - } - } - } } for (const auto& f : files_to_import_) { @@ -143,9 +110,6 @@ Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, // REQUIRES: we have become the only writer by entering both write_thread_ and // nonmem_write_thread_ Status ImportColumnFamilyJob::Run() { - Status status; - edit_.SetColumnFamily(cfd_->GetID()); - // We use the import time as the ancester time. This is the time the data // is written to the database. 
int64_t temp_current_time = 0; @@ -156,27 +120,67 @@ Status ImportColumnFamilyJob::Run() { static_cast(temp_current_time); } - for (size_t i = 0; i < files_to_import_.size(); ++i) { + // Recover files' epoch number using dummy VersionStorageInfo + VersionBuilder dummy_version_builder( + cfd_->current()->version_set()->file_options(), cfd_->ioptions(), + cfd_->table_cache(), cfd_->current()->storage_info(), + cfd_->current()->version_set(), + cfd_->GetFileMetadataCacheReservationManager()); + VersionStorageInfo dummy_vstorage( + &cfd_->internal_comparator(), cfd_->user_comparator(), + cfd_->NumberLevels(), cfd_->ioptions()->compaction_style, + nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks, + EpochNumberRequirement::kMightMissing); + Status s; + for (size_t i = 0; s.ok() && i < files_to_import_.size(); ++i) { const auto& f = files_to_import_[i]; const auto& file_metadata = metadata_[i]; - edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), - f.fd.GetFileSize(), f.smallest_internal_key, - f.largest_internal_key, file_metadata.smallest_seqno, - file_metadata.largest_seqno, false, file_metadata.temperature, - kInvalidBlobFileNumber, oldest_ancester_time, current_time, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - f.unique_id); - - // If incoming sequence number is higher, update local sequence number. - if (file_metadata.largest_seqno > versions_->LastSequence()) { - versions_->SetLastAllocatedSequence(file_metadata.largest_seqno); - versions_->SetLastPublishedSequence(file_metadata.largest_seqno); - versions_->SetLastSequence(file_metadata.largest_seqno); + VersionEdit dummy_version_edit; + dummy_version_edit.AddFile( + file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), + f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, + file_metadata.smallest_seqno, file_metadata.largest_seqno, false, + file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, + current_time, file_metadata.epoch_number, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, f.unique_id, 0); + s = dummy_version_builder.Apply(&dummy_version_edit); + } + if (s.ok()) { + s = dummy_version_builder.SaveTo(&dummy_vstorage); + } + if (s.ok()) { + dummy_vstorage.RecoverEpochNumbers(cfd_); + } + + // Record changes from this CF import in VersionEdit, including files with + // recovered epoch numbers + if (s.ok()) { + edit_.SetColumnFamily(cfd_->GetID()); + + for (int level = 0; level < dummy_vstorage.num_levels(); level++) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + edit_.AddFile(level, *file_meta); + // If incoming sequence number is higher, update local sequence number. 
+ if (file_meta->fd.largest_seqno > versions_->LastSequence()) { + versions_->SetLastAllocatedSequence(file_meta->fd.largest_seqno); + versions_->SetLastPublishedSequence(file_meta->fd.largest_seqno); + versions_->SetLastSequence(file_meta->fd.largest_seqno); + } + } } } - return status; + // Release resources occupied by the dummy VersionStorageInfo + for (int level = 0; level < dummy_vstorage.num_levels(); level++) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + file_meta->refs--; + if (file_meta->refs <= 0) { + delete file_meta; + } + } + } + return s; } void ImportColumnFamilyJob::Cleanup(const Status& status) { @@ -228,8 +232,8 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( std::unique_ptr sst_file; std::unique_ptr sst_file_reader; - status = fs_->NewRandomAccessFile(external_file, env_options_, - &sst_file, nullptr); + status = + fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr); if (!status.ok()) { return status; } @@ -306,7 +310,6 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( return status; } - } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index 2847ea8da49..0c07ee2a8bf 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -556,10 +556,9 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19)); metadata.db_comparator_name = options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("Files have overlapping ranges")); + ASSERT_NOK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_)); ASSERT_EQ(import_cfh_, nullptr); } diff --git a/db/internal_stats.cc b/db/internal_stats.cc index dfe7e6e70bb..bc7f315d953 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -423,6 +423,10 @@ const std::string DB::Properties::kBlobCacheUsage = const std::string DB::Properties::kBlobCachePinnedUsage = rocksdb_prefix + blob_cache_pinned_usage; +const std::string InternalStats::kPeriodicCFStats = + DB::Properties::kCFStats + ".periodic"; +const int InternalStats::kMaxNoChangePeriodSinceDump = 8; + const UnorderedMap InternalStats::ppt_name_to_info = { {DB::Properties::kNumFilesAtLevelPrefix, @@ -438,6 +442,9 @@ const UnorderedMap {DB::Properties::kCFStats, {false, &InternalStats::HandleCFStats, nullptr, &InternalStats::HandleCFMapStats, nullptr}}, + {InternalStats::kPeriodicCFStats, + {false, &InternalStats::HandleCFStatsPeriodic, nullptr, nullptr, + nullptr}}, {DB::Properties::kCFStatsNoFileHistogram, {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr, nullptr}}, @@ -605,6 +612,7 @@ InternalStats::InternalStats(int num_levels, SystemClock* clock, comp_stats_(num_levels), comp_stats_by_pri_(Env::Priority::TOTAL), file_read_latency_(num_levels), + has_cf_change_since_dump_(true), bg_error_count_(0), number_levels_(num_levels), clock_(clock), @@ -651,17 +659,18 @@ void InternalStats::CollectCacheEntryStats(bool foreground) { min_interval_factor); } -std::function +std::function InternalStats::CacheEntryRoleStats::GetEntryCallback() { - return [&](const Slice& /*key*/, void* /*value*/, size_t charge, - Cache::DeleterFn deleter) { - auto e =
role_map_.find(deleter); - size_t role_idx; - if (e == role_map_.end()) { - role_idx = static_cast(CacheEntryRole::kMisc); - } else { - role_idx = static_cast(e->second); - } + return [&](const Slice& /*key*/, Cache::ObjectPtr /*value*/, size_t charge, + const Cache::CacheItemHelper* helper) -> void { + size_t role_idx = + static_cast(helper ? helper->role : CacheEntryRole::kMisc); entry_counts[role_idx]++; total_charges[role_idx] += charge; }; @@ -672,7 +681,6 @@ void InternalStats::CacheEntryRoleStats::BeginCollection( Clear(); last_start_time_micros_ = start_time_micros; ++collection_count; - role_map_ = CopyCacheDeleterRoleMap(); std::ostringstream str; str << cache->Name() << "@" << static_cast(cache) << "#" << port::GetProcessID(); @@ -1041,9 +1049,41 @@ bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) { return true; } +bool InternalStats::HandleCFStatsPeriodic(std::string* value, + Slice /*suffix*/) { + bool has_change = has_cf_change_since_dump_; + if (!has_change) { + // If file histogram changes, there is activity in this period too. + uint64_t new_histogram_num = 0; + for (int level = 0; level < number_levels_; level++) { + new_histogram_num += file_read_latency_[level].num(); + } + new_histogram_num += blob_file_read_latency_.num(); + if (new_histogram_num != last_histogram_num) { + has_change = true; + last_histogram_num = new_histogram_num; + } + } + if (has_change) { + no_cf_change_period_since_dump_ = 0; + has_cf_change_since_dump_ = false; + } else if (no_cf_change_period_since_dump_++ > 0) { + // Not ready to sync + if (no_cf_change_period_since_dump_ == kMaxNoChangePeriodSinceDump) { + // Next periodic, we need to dump stats even if there is no change. + no_cf_change_period_since_dump_ = 0; + } + return true; + } + + DumpCFStatsNoFileHistogram(/*is_periodic=*/true, value); + DumpCFFileHistogram(value); + return true; +} + bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value, Slice /*suffix*/) { - DumpCFStatsNoFileHistogram(value); + DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value); return true; } @@ -1708,11 +1748,12 @@ void InternalStats::DumpCFMapStatsIOStalls( } void InternalStats::DumpCFStats(std::string* value) { - DumpCFStatsNoFileHistogram(value); + DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value); DumpCFFileHistogram(value); } -void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { +void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, + std::string* value) { char buf[2000]; // Per-ColumnFamily stats PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level"); @@ -1864,9 +1905,11 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001), interval_compact_micros / kMicrosInSec); value->append(buf); - cf_stats_snapshot_.compact_bytes_write = compact_bytes_write; - cf_stats_snapshot_.compact_bytes_read = compact_bytes_read; - cf_stats_snapshot_.compact_micros = compact_micros; + if (is_periodic) { + cf_stats_snapshot_.compact_bytes_write = compact_bytes_write; + cf_stats_snapshot_.compact_bytes_read = compact_bytes_read; + cf_stats_snapshot_.compact_micros = compact_micros; + } snprintf(buf, sizeof(buf), "Stalls(count): %" PRIu64 @@ -1897,14 +1940,16 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { total_stall_count - cf_stats_snapshot_.stall_count); value->append(buf); - cf_stats_snapshot_.seconds_up = seconds_up; - cf_stats_snapshot_.ingest_bytes_flush = flush_ingest; - 
cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest; - cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile; - cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile; - cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; - cf_stats_snapshot_.comp_stats = compaction_stats_sum; - cf_stats_snapshot_.stall_count = total_stall_count; + if (is_periodic) { + cf_stats_snapshot_.seconds_up = seconds_up; + cf_stats_snapshot_.ingest_bytes_flush = flush_ingest; + cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest; + cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile; + cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile; + cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; + cf_stats_snapshot_.comp_stats = compaction_stats_sum; + cf_stats_snapshot_.stall_count = total_stall_count; + } // Do not gather cache entry stats during CFStats because DB // mutex is held. Only dump last cached collection (rely on DB diff --git a/db/internal_stats.h b/db/internal_stats.h index 386eef42ddf..a833240637f 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -353,7 +353,7 @@ class InternalStats { this->num_output_records += c.num_output_records; this->count += c.count; int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); - for (int i = 0; i< num_of_reasons; i++) { + for (int i = 0; i < num_of_reasons; i++) { counts[i] += c.counts[i]; } } @@ -472,7 +472,8 @@ class InternalStats { } void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); - std::function + std::function GetEntryCallback(); void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); void SkippedCollection(); @@ -482,7 +483,6 @@ class InternalStats { SystemClock* clock) const; private: - UnorderedMap role_map_; uint64_t GetLastDurationMicros() const; }; @@ -506,6 +506,7 @@ class InternalStats { db_stats_snapshot_.Clear(); bg_error_count_ = 0; started_at_ = clock_->NowMicros(); + has_cf_change_since_dump_ = true; } void AddCompactionStats(int level, Env::Priority thread_pri, @@ -528,6 +529,7 @@ class InternalStats { } void AddCFStats(InternalCFStatsType type, uint64_t value) { + has_cf_change_since_dump_ = true; cf_stats_value_[type] += value; ++cf_stats_count_[type]; } @@ -593,6 +595,8 @@ class InternalStats { // DBPropertyInfo struct used internally for retrieving properties. static const UnorderedMap ppt_name_to_info; + static const std::string kPeriodicCFStats; + private: void DumpDBMapStats(std::map* db_stats); void DumpDBStats(std::string* value); @@ -605,7 +609,11 @@ class InternalStats { std::map>* priorities_stats); void DumpCFMapStatsIOStalls(std::map* cf_stats); void DumpCFStats(std::string* value); - void DumpCFStatsNoFileHistogram(std::string* value); + // if is_periodic = true, it is an internal call by RocksDB periodically to + // dump the status. + void DumpCFStatsNoFileHistogram(bool is_periodic, std::string* value); + // if is_periodic = true, it is an internal call by RocksDB periodically to + // dump the status. void DumpCFFileHistogram(std::string* value); Cache* GetBlockCacheForStats(); @@ -629,13 +637,19 @@ class InternalStats { CompactionStats per_key_placement_comp_stats_; std::vector file_read_latency_; HistogramImpl blob_file_read_latency_; + bool has_cf_change_since_dump_; + // How many periods of no change since the last time stats are dumped for + // a periodic dump. 
+ int no_cf_change_period_since_dump_ = 0; + uint64_t last_histogram_num = std::numeric_limits::max(); + static const int kMaxNoChangePeriodSinceDump; // Used to compute per-interval statistics struct CFStatsSnapshot { // ColumnFamily-level stats CompactionStats comp_stats; - uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) - uint64_t stall_count; // Stall count + uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) + uint64_t stall_count; // Stall count // Stats from compaction jobs - bytes written, bytes read, duration. uint64_t compact_bytes_write; uint64_t compact_bytes_read; @@ -677,10 +691,10 @@ class InternalStats { struct DBStatsSnapshot { // DB-level stats - uint64_t ingest_bytes; // Bytes written by user - uint64_t wal_bytes; // Bytes written to WAL - uint64_t wal_synced; // Number of times WAL is synced - uint64_t write_with_wal; // Number of writes that request WAL + uint64_t ingest_bytes; // Bytes written by user + uint64_t wal_bytes; // Bytes written to WAL + uint64_t wal_synced; // Number of times WAL is synced + uint64_t write_with_wal; // Number of writes that request WAL // These count the number of writes processed by the calling thread or // another thread. uint64_t write_other; @@ -729,6 +743,7 @@ class InternalStats { bool HandleCFStats(std::string* value, Slice suffix); bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); + bool HandleCFStatsPeriodic(std::string* value, Slice suffix); bool HandleDBMapStats(std::map* compaction_stats, Slice suffix); bool HandleDBStats(std::string* value, Slice suffix); @@ -965,13 +980,14 @@ class InternalStats { return false; } - bool GetIntProperty(const DBPropertyInfo& /*property_info*/, uint64_t* /*value*/, - DBImpl* /*db*/) const { + bool GetIntProperty(const DBPropertyInfo& /*property_info*/, + uint64_t* /*value*/, DBImpl* /*db*/) const { return false; } bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/, - Version* /*version*/, uint64_t* /*value*/) const { + Version* /*version*/, + uint64_t* /*value*/) const { return false; } }; diff --git a/db/job_context.h b/db/job_context.h index d7d05b11a08..352c58e8239 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -35,7 +35,7 @@ struct SuperVersionContext { new_superversion; // if nullptr no new superversion explicit SuperVersionContext(bool create_superversion = false) - : new_superversion(create_superversion ? new SuperVersion() : nullptr) {} + : new_superversion(create_superversion ? 
new SuperVersion() : nullptr) {} explicit SuperVersionContext(SuperVersionContext&& other) noexcept : superversions_to_free(std::move(other.superversions_to_free)), @@ -54,8 +54,7 @@ struct SuperVersionContext { inline bool HaveSomethingToDelete() const { #ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION - return !superversions_to_free.empty() || - !write_stall_notifications.empty(); + return !superversions_to_free.empty() || !write_stall_notifications.empty(); #else return !superversions_to_free.empty(); #endif @@ -77,7 +76,8 @@ struct SuperVersionContext { (void)new_cond; (void)name; (void)ioptions; -#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +#endif // !defined(ROCKSDB_LITE) && + // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) } void Clean() { @@ -139,8 +139,7 @@ struct JobContext { CandidateFileInfo(std::string name, std::string path) : file_name(std::move(name)), file_path(std::move(path)) {} bool operator==(const CandidateFileInfo& other) const { - return file_name == other.file_name && - file_path == other.file_path; + return file_name == other.file_name && file_path == other.file_path; } }; diff --git a/db/listener_test.cc b/db/listener_test.cc index 0d88c8f55a8..160866bb774 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -89,7 +89,7 @@ class TestCompactionListener : public EventListener { public: explicit TestCompactionListener(EventListenerTest* test) : test_(test) {} - void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override { + void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { std::lock_guard lock(mutex_); compacted_dbs_.push_back(db); ASSERT_GT(ci.input_files.size(), 0U); @@ -172,9 +172,9 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { TestCompactionListener* listener = new TestCompactionListener(this); options.listeners.emplace_back(listener); - std::vector cf_names = { - "pikachu", "ilya", "muromec", "dobrynia", - "nikitich", "alyosha", "popovich"}; + std::vector cf_names = {"pikachu", "ilya", "muromec", + "dobrynia", "nikitich", "alyosha", + "popovich"}; CreateAndReopenWithCF(cf_names, options); ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); @@ -214,8 +214,7 @@ class TestFlushListener : public EventListener { virtual ~TestFlushListener() { prev_fc_info_.status.PermitUncheckedError(); // Ignore the status } - void OnTableFileCreated( - const TableFileCreationInfo& info) override { + void OnTableFileCreated(const TableFileCreationInfo& info) override { // remember the info for later checking the FlushJobInfo. 
prev_fc_info_ = info; ASSERT_GT(info.db_name.size(), 0U); @@ -250,8 +249,7 @@ class TestFlushListener : public EventListener { #endif // ROCKSDB_USING_THREAD_STATUS } - void OnFlushCompleted( - DB* db, const FlushJobInfo& info) override { + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { flushed_dbs_.push_back(db); flushed_column_family_names_.push_back(info.cf_name); if (info.triggered_writes_slowdown) { @@ -317,9 +315,9 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { #endif // ROCKSDB_USING_THREAD_STATUS TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); - std::vector cf_names = { - "pikachu", "ilya", "muromec", "dobrynia", - "nikitich", "alyosha", "popovich"}; + std::vector cf_names = {"pikachu", "ilya", "muromec", + "dobrynia", "nikitich", "alyosha", + "popovich"}; options.table_properties_collector_factories.push_back( std::make_shared()); CreateAndReopenWithCF(cf_names, options); @@ -421,9 +419,9 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { listeners.emplace_back(new TestFlushListener(options.env, this)); } - std::vector cf_names = { - "pikachu", "ilya", "muromec", "dobrynia", - "nikitich", "alyosha", "popovich"}; + std::vector cf_names = {"pikachu", "ilya", "muromec", + "dobrynia", "nikitich", "alyosha", + "popovich"}; options.create_if_missing = true; for (int i = 0; i < kNumListeners; ++i) { @@ -433,7 +431,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { ColumnFamilyOptions cf_opts(options); std::vector dbs; - std::vector> vec_handles; + std::vector> vec_handles; for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options)); @@ -452,8 +450,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { for (int d = 0; d < kNumDBs; ++d) { for (size_t c = 0; c < cf_names.size(); ++c) { - ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], - cf_names[c], cf_names[c])); + ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], cf_names[c], + cf_names[c])); } } @@ -483,7 +481,6 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { } } - for (auto handles : vec_handles) { for (auto h : handles) { delete h; @@ -887,16 +884,17 @@ TEST_F(EventListenerTest, TableFileCreationListenersTest) { } class MemTableSealedListener : public EventListener { -private: + private: SequenceNumber latest_seq_number_; -public: + + public: MemTableSealedListener() {} void OnMemTableSealed(const MemTableInfo& info) override { latest_seq_number_ = info.first_seqno; } void OnFlushCompleted(DB* /*db*/, - const FlushJobInfo& flush_job_info) override { + const FlushJobInfo& flush_job_info) override { ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_); } }; @@ -911,8 +909,8 @@ TEST_F(EventListenerTest, MemTableSealedListenerTest) { for (unsigned int i = 0; i < 10; i++) { std::string tag = std::to_string(i); - ASSERT_OK(Put("foo"+tag, "aaa")); - ASSERT_OK(Put("bar"+tag, "bbb")); + ASSERT_OK(Put("foo" + tag, "aaa")); + ASSERT_OK(Put("bar" + tag, "bbb")); ASSERT_OK(Flush()); } diff --git a/db/log_reader.cc b/db/log_reader.cc index eb5c88d254a..575a7d75891 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -21,8 +21,7 @@ namespace ROCKSDB_NAMESPACE { namespace log { -Reader::Reporter::~Reporter() { -} +Reader::Reporter::~Reporter() {} Reader::Reader(std::shared_ptr info_log, std::unique_ptr&& _file, @@ -241,9 +240,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, FALLTHROUGH_INTENDED; case kBadRecordChecksum: - if (recycled_ && - wal_recovery_mode == - 
WALRecoveryMode::kTolerateCorruptedTailRecords) { + if (recycled_ && wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords) { scratch->clear(); return false; } @@ -297,9 +295,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, return false; } -uint64_t Reader::LastRecordOffset() { - return last_record_offset_; -} +uint64_t Reader::LastRecordOffset() { return last_record_offset_; } uint64_t Reader::LastRecordEnd() { return end_of_buffer_offset_ - buffer_.size(); @@ -361,11 +357,11 @@ void Reader::UnmarkEOFInternal() { if (read_buffer.data() != backing_store_ + eof_offset_) { // Read did not write to backing_store_ memmove(backing_store_ + eof_offset_, read_buffer.data(), - read_buffer.size()); + read_buffer.size()); } buffer_ = Slice(backing_store_ + consumed_bytes, - eof_offset_ + added - consumed_bytes); + eof_offset_ + added - consumed_bytes); if (added < remaining) { eof_ = true; @@ -385,7 +381,7 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) { } } -bool Reader::ReadMore(size_t* drop_size, int *error) { +bool Reader::ReadMore(size_t* drop_size, int* error) { if (!eof_ && !read_error_) { // Last read was a full read, so this is a trailer to skip buffer_.clear(); @@ -519,10 +515,11 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size, size_t uncompressed_size = 0; int remaining = 0; + const char* input = header + header_size; do { - remaining = uncompress_->Uncompress(header + header_size, length, - uncompressed_buffer_.get(), - &uncompressed_size); + remaining = uncompress_->Uncompress( + input, length, uncompressed_buffer_.get(), &uncompressed_size); + input = nullptr; if (remaining < 0) { buffer_.clear(); return kBadRecord; @@ -834,10 +831,11 @@ bool FragmentBufferedReader::TryReadFragment( uncompressed_record_.clear(); size_t uncompressed_size = 0; int remaining = 0; + const char* input = header + header_size; do { - remaining = uncompress_->Uncompress(header + header_size, length, - uncompressed_buffer_.get(), - &uncompressed_size); + remaining = uncompress_->Uncompress( + input, length, uncompressed_buffer_.get(), &uncompressed_size); + input = nullptr; if (remaining < 0) { buffer_.clear(); *fragment_type_or_err = kBadRecord; diff --git a/db/log_reader.h b/db/log_reader.h index 2ebeaaca92e..e3be1570e37 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -85,9 +85,7 @@ class Reader { uint64_t LastRecordEnd(); // returns true if the reader has encountered an eof condition. - bool IsEOF() { - return eof_; - } + bool IsEOF() { return eof_; } // returns true if the reader has encountered read error. bool hasReadError() const { return read_error_; } @@ -122,8 +120,8 @@ class Reader { // Internal state variables used for reading records Slice buffer_; - bool eof_; // Last Read() indicated EOF by returning < kBlockSize - bool read_error_; // Error occurred while reading from file + bool eof_; // Last Read() indicated EOF by returning < kBlockSize + bool read_error_; // Error occurred while reading from file // Offset of the file position indicator within the last block when an // EOF was detected. 
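Both decompression sites above (Reader::ReadPhysicalRecord and FragmentBufferedReader::TryReadFragment) now pass the compressed payload only to the first Uncompress() call and nullptr on every retry, signalling the streaming decompressor to keep draining its buffered output rather than re-consuming the same input. A condensed sketch of that calling convention, with a stand-in interface and a hypothetical Consume() sink:

    #include <cstddef>

    struct StreamingUncompress {  // stand-in for the decompressor above
      // Returns bytes still pending in internal state, negative on error.
      int Uncompress(const char* input, size_t input_size, char* output,
                     size_t* output_pos);
    };
    void Consume(const char* data, size_t len);  // hypothetical sink

    bool DrainAll(StreamingUncompress* uncompress, const char* payload,
                  size_t payload_len, char* out_buf) {
      const char* input = payload;  // real input on the first call only
      size_t out_pos = 0;
      int remaining = 0;
      do {
        remaining =
            uncompress->Uncompress(input, payload_len, out_buf, &out_pos);
        input = nullptr;  // from now on, only drain internal state
        if (remaining < 0) {
          return false;  // corrupt fragment
        }
        if (out_pos > 0) {
          Consume(out_buf, out_pos);  // hand over the produced chunk
        }
      } while (remaining > 0);
      return true;
    }
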
@@ -182,7 +180,7 @@ class Reader { uint64_t* fragment_checksum = nullptr); // Read some more - bool ReadMore(size_t* drop_size, int *error); + bool ReadMore(size_t* drop_size, int* error); void UnmarkEOFInternal(); diff --git a/db/log_test.cc b/db/log_test.cc index a055d72f6cb..f4d388f41b0 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -128,7 +128,7 @@ class LogTest size_t dropped_bytes_; std::string message_; - ReportCollector() : dropped_bytes_(0) { } + ReportCollector() : dropped_bytes_(0) {} void Corruption(size_t bytes, const Status& status) override { dropped_bytes_ += bytes; message_.append(status.ToString()); @@ -185,9 +185,7 @@ class LogTest ASSERT_OK(writer_->AddRecord(Slice(msg))); } - size_t WrittenBytes() const { - return dest_contents().size(); - } + size_t WrittenBytes() const { return dest_contents().size(); } std::string Read(const WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords) { @@ -235,13 +233,9 @@ class LogTest source_->force_error_position_ = position; } - size_t DroppedBytes() const { - return report_.dropped_bytes_; - } + size_t DroppedBytes() const { return report_.dropped_bytes_; } - std::string ReportMessage() const { - return report_.message_; - } + std::string ReportMessage() const { return report_.message_; } void ForceEOF(size_t position = 0) { source_->force_eof_ = true; @@ -389,7 +383,7 @@ TEST_P(LogTest, BadRecordType) { TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { Write("foo"); - ShrinkSize(4); // Drop all payload as well as a header byte + ShrinkSize(4); // Drop all payload as well as a header byte ASSERT_EQ("EOF", Read()); // Truncated last record is ignored, not treated as an error ASSERT_EQ(0U, DroppedBytes()); @@ -581,7 +575,7 @@ TEST_P(LogTest, ErrorJoinsRecords) { Write("correct"); // Wipe the middle block - for (unsigned int offset = kBlockSize; offset < 2*kBlockSize; offset++) { + for (unsigned int offset = kBlockSize; offset < 2 * kBlockSize; offset++) { SetByte(offset, 'x'); } @@ -985,6 +979,38 @@ TEST_P(CompressionLogTest, Fragmentation) { ASSERT_EQ("EOF", Read()); } +TEST_P(CompressionLogTest, AlignedFragmentation) { + CompressionType compression_type = std::get<2>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + ASSERT_OK(SetupTestEnv()); + Random rnd(301); + int num_filler_records = 0; + // Keep writing small records until the next record will be aligned at the + // beginning of the block. + while ((WrittenBytes() & (kBlockSize - 1)) >= kHeaderSize) { + char entry = 'a'; + ASSERT_OK(writer_->AddRecord(Slice(&entry, 1))); + num_filler_records++; + } + const std::vector wal_entries = { + rnd.RandomBinaryString(3 * kBlockSize), + }; + for (const std::string& wal_entry : wal_entries) { + Write(wal_entry); + } + + for (int i = 0; i < num_filler_records; ++i) { + ASSERT_EQ("a", Read()); + } + for (const std::string& wal_entry : wal_entries) { + ASSERT_EQ(wal_entry, Read()); + } + ASSERT_EQ("EOF", Read()); +} + INSTANTIATE_TEST_CASE_P( Compression, CompressionLogTest, ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(), @@ -1032,10 +1058,11 @@ TEST_P(StreamingCompressionTest, Basic) { for (int i = 0; i < (int)compressed_buffers.size(); i++) { // Call uncompress till either the entire input is consumed or the output // buffer size is equal to the allocated output buffer size. 
+ const char* input = compressed_buffers[i].c_str(); do { - ret_val = uncompress->Uncompress(compressed_buffers[i].c_str(), - compressed_buffers[i].size(), + ret_val = uncompress->Uncompress(input, compressed_buffers[i].size(), uncompressed_output_buffer, &output_pos); + input = nullptr; if (output_pos > 0) { std::string uncompressed_fragment; uncompressed_fragment.assign(uncompressed_output_buffer, output_pos); diff --git a/db/log_writer.h b/db/log_writer.h index 4d0d49a8649..5d266e43431 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -100,7 +100,7 @@ class Writer { private: std::unique_ptr dest_; - size_t block_offset_; // Current offset in block + size_t block_offset_; // Current offset in block uint64_t log_number_; bool recycle_log_files_; diff --git a/db/logs_with_prep_tracker.h b/db/logs_with_prep_tracker.h index 7f9ece76bca..f72f0ca0787 100644 --- a/db/logs_with_prep_tracker.h +++ b/db/logs_with_prep_tracker.h @@ -58,6 +58,5 @@ class LogsWithPrepTracker { // both logs_with_prep_ and prepared_section_completed_. std::unordered_map prepared_section_completed_; std::mutex prepared_section_completed_mutex_; - }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/lookup_key.h b/db/lookup_key.h index 609d08daf8b..68851bddd16 100644 --- a/db/lookup_key.h +++ b/db/lookup_key.h @@ -10,6 +10,7 @@ #pragma once #include #include + #include "rocksdb/slice.h" #include "rocksdb/types.h" @@ -35,7 +36,9 @@ class LookupKey { return Slice(kstart_, static_cast(end_ - kstart_)); } - // Return the user key + // Return the user key. + // If user-defined timestamp is enabled, then timestamp is included in the + // result. Slice user_key() const { return Slice(kstart_, static_cast(end_ - kstart_ - 8)); } diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index 8f58ab2cfef..52f2e6e0f28 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -10,9 +10,10 @@ #include "db/malloc_stats.h" #ifndef ROCKSDB_LITE -#include #include +#include + #include "port/jemalloc_helper.h" namespace ROCKSDB_NAMESPACE { diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index d41eca589b3..b92cb794b99 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -42,9 +42,7 @@ std::string Key1(int i) { return buf; } -std::string Key2(int i) { - return Key1(i) + "_xxx"; -} +std::string Key2(int i) { return Key1(i) + "_xxx"; } class ManualCompactionTest : public testing::Test { public: @@ -102,10 +100,10 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { for (int iter = 0; iter < 2; ++iter) { DB* db; Options options; - if (iter == 0) { // level compaction + if (iter == 0) { // level compaction options.num_levels = 3; options.compaction_style = CompactionStyle::kCompactionStyleLevel; - } else { // universal compaction + } else { // universal compaction options.compaction_style = CompactionStyle::kCompactionStyleUniversal; } options.create_if_missing = true; diff --git a/db/memtable.cc b/db/memtable.cc index f83df13b201..fc7540a7325 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -76,7 +76,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, : comparator_(cmp), moptions_(ioptions, mutable_cf_options), refs_(0), - kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), + kArenaBlockSize(Arena::OptimizeBlockSize(moptions_.arena_block_size)), mem_tracker_(write_buffer_manager), arena_(moptions_.arena_block_size, (write_buffer_manager != nullptr && @@ -347,9 +347,8 @@ int MemTable::KeyComparator::operator()(const char* prefix_len_key1, return 
comparator.CompareKeySeq(k1, k2); } -int MemTable::KeyComparator::operator()(const char* prefix_len_key, - const KeyComparator::DecodedType& key) - const { +int MemTable::KeyComparator::operator()( + const char* prefix_len_key, const KeyComparator::DecodedType& key) const { // Internal keys are encoded as length-prefixed strings. Slice a = GetLengthPrefixedSlice(prefix_len_key); return comparator.CompareKeySeq(a, key); @@ -604,10 +603,9 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( auto* unfragmented_iter = new MemTableIterator(*this, read_options, nullptr /* arena */, true /* use_range_del_table */); - cache->tombstones = std::make_unique( - FragmentedRangeTombstoneList( - std::unique_ptr(unfragmented_iter), - comparator_.comparator)); + cache->tombstones.reset(new FragmentedRangeTombstoneList( + std::unique_ptr(unfragmented_iter), + comparator_.comparator)); cache->initialized.store(true, std::memory_order_release); } cache->reader_mutex.unlock(); @@ -802,7 +800,8 @@ Status MemTable::Add(SequenceNumber s, ValueType type, std::memory_order_relaxed); data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len, std::memory_order_relaxed); - if (type == kTypeDeletion) { + if (type == kTypeDeletion || type == kTypeSingleDeletion || + type == kTypeDeletionWithTimestamp) { num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); } @@ -931,7 +930,7 @@ struct Saver { return true; } }; -} // namespace +} // anonymous namespace static bool SaveValue(void* arg, const char* entry) { TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry); @@ -1075,6 +1074,8 @@ static bool SaveValue(void* arg, const char* entry) { if (!s->do_merge) { // Preserve the value with the goal of returning it as part of // raw merge operands to the user + // TODO(yanqin) update MergeContext so that timestamps information + // can also be retained. merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); @@ -1083,10 +1084,15 @@ static bool SaveValue(void* arg, const char* entry) { if (s->value || s->columns) { std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. 
*(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), &v, merge_context->GetOperands(), &result, s->logger, s->statistics, - s->clock, nullptr /* result_operand */, true); + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); if (s->status->ok()) { if (s->value) { @@ -1116,20 +1122,6 @@ static bool SaveValue(void* arg, const char* entry) { return false; } case kTypeWideColumnEntity: { - if (!s->do_merge) { - *(s->status) = Status::NotSupported( - "GetMergeOperands not supported for wide-column entities"); - *(s->found_final_value) = true; - return false; - } - - if (*(s->merge_in_progress)) { - *(s->status) = Status::NotSupported( - "Merge not supported for wide-column entities"); - *(s->found_final_value) = true; - return false; - } - if (s->inplace_update_support) { s->mem->GetLock(s->key->user_key())->ReadLock(); } @@ -1138,7 +1130,53 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = Status::OK(); - if (s->value) { + if (!s->do_merge) { + // Preserve the value with the goal of returning it as part of + // raw merge operands to the user + + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + + if (s->status->ok()) { + merge_context->PushOperand( + value_of_default, + s->inplace_update_support == false /* operand_pinned */); + } + } else if (*(s->merge_in_progress)) { + assert(s->do_merge); + + if (s->value) { + Slice value_of_default; + *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( + v, value_of_default); + if (s->status->ok()) { + // `op_failure_scope` (an output parameter) is not provided (set + // to nullptr) since a failure must be propagated regardless of + // its value. + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &value_of_default, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + } + } else if (s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. + *(s->status) = MergeHelper::TimedFullMergeWithEntity( + merge_operator, s->key->user_key(), v, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + *(s->status) = s->columns->SetWideColumnValue(result); + } + } + } else if (s->value) { Slice value_of_default; *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( v, value_of_default); @@ -1166,11 +1204,31 @@ static bool SaveValue(void* arg, const char* entry) { case kTypeSingleDeletion: case kTypeRangeDeletion: { if (*(s->merge_in_progress)) { - if (s->value != nullptr) { + if (s->value || s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. 
*(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, - merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->clock, nullptr /* result_operand */, true); + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(result); + } + } + } else { + // We have found a final value (a base deletion) and have newer + // merge operands that we do not intend to merge. Nothing remains + // to be done so assign status to OK. + *(s->status) = Status::OK(); } } else { *(s->status) = Status::NotFound(); @@ -1195,10 +1253,28 @@ static bool SaveValue(void* arg, const char* entry) { v, s->inplace_update_support == false /* operand_pinned */); if (s->do_merge && merge_operator->ShouldMerge( merge_context->GetOperandsDirectionBackward())) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), nullptr, - merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->clock, nullptr /* result_operand */, true); + if (s->value || s->columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), &result, s->logger, s->statistics, + s->clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + + if (s->status->ok()) { + if (s->value) { + *(s->value) = std::move(result); + } else { + assert(s->columns); + s->columns->SetPlainValue(result); + } + } + } + *(s->found_final_value) = true; return false; } diff --git a/db/memtable.h b/db/memtable.h index d8a5521b898..1fa8c9ca699 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -88,7 +88,7 @@ class MemTable { public: struct KeyComparator : public MemTableRep::KeyComparator { const InternalKeyComparator comparator; - explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } + explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {} virtual int operator()(const char* prefix_len_key1, const char* prefix_len_key2) const override; virtual int operator()(const char* prefix_len_key, @@ -455,9 +455,7 @@ class MemTable { // persisted. // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - void MarkFlushed() { - table_->MarkFlushed(); - } + void MarkFlushed() { table_->MarkFlushed(); } // return true if the current MemTableRep supports merge operator. bool IsMergeOperatorSupported() const { @@ -572,8 +570,8 @@ class MemTable { std::atomic write_buffer_size_; // These are used to manage memtable flushes to storage - bool flush_in_progress_; // started the flush - bool flush_completed_; // finished the flush + bool flush_in_progress_; // started the flush + bool flush_completed_; // finished the flush uint64_t file_number_; // filled up after flush is complete // The updates to be applied to the transaction log when this diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 0b90cd437d3..8242061afb0 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -4,9 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "db/memtable_list.h" + #include #include #include + #include "db/merge_context.h" #include "db/version_set.h" #include "db/write_controller.h" diff --git a/db/merge_context.h b/db/merge_context.h index 925bfc0e068..8a7b0729020 100644 --- a/db/merge_context.h +++ b/db/merge_context.h @@ -8,6 +8,7 @@ #include #include #include + #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 5430509ebf9..e29d9c5badb 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -12,6 +12,8 @@ #include "db/blob/prefetch_buffer_collection.h" #include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" +#include "db/wide/wide_column_serialization.h" +#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/likely.h" @@ -54,16 +56,15 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, } } -Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, - const Slice& key, const Slice* value, - const std::vector& operands, - std::string* result, Logger* logger, - Statistics* statistics, SystemClock* clock, - Slice* result_operand, - bool update_num_ops_stats) { +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, const Slice* value, + const std::vector& operands, std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, Slice* result_operand, + bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope) { assert(merge_operator != nullptr); - if (operands.size() == 0) { + if (operands.empty()) { assert(value != nullptr && result != nullptr); result->assign(value->data(), value->size()); return Status::OK(); @@ -74,7 +75,7 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, static_cast(operands.size())); } - bool success; + bool success = false; Slice tmp_result_operand(nullptr, 0); const MergeOperator::MergeOperationInput merge_in(key, value, operands, logger); @@ -102,6 +103,14 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, statistics ? 
timer.ElapsedNanos() : 0); } + if (op_failure_scope != nullptr) { + *op_failure_scope = merge_out.op_failure_scope; + // Apply default per merge_operator.h + if (*op_failure_scope == MergeOperator::OpFailureScope::kDefault) { + *op_failure_scope = MergeOperator::OpFailureScope::kTryMerge; + } + } + if (!success) { RecordTick(statistics, NUMBER_MERGE_FAILURES); return Status::Corruption("Error: Could not perform merge."); @@ -110,6 +119,59 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, return Status::OK(); } +Status MergeHelper::TimedFullMergeWithEntity( + const MergeOperator* merge_operator, const Slice& key, Slice base_entity, + const std::vector<Slice>& operands, std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope) { + WideColumns base_columns; + + { + const Status s = + WideColumnSerialization::Deserialize(base_entity, base_columns); + if (!s.ok()) { + return s; + } + } + + const bool has_default_column = + !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName; + + Slice value_of_default; + if (has_default_column) { + value_of_default = base_columns[0].value(); + } + + std::string merge_result; + + { + const Status s = TimedFullMerge(merge_operator, key, &value_of_default, + operands, &merge_result, logger, statistics, + clock, nullptr /* result_operand */, + update_num_ops_stats, op_failure_scope); + if (!s.ok()) { + return s; + } + } + + if (has_default_column) { + base_columns[0].value() = merge_result; + + const Status s = WideColumnSerialization::Serialize(base_columns, *result); + if (!s.ok()) { + return s; + } + } else { + const Status s = + WideColumnSerialization::Serialize(merge_result, base_columns, *result); + if (!s.ok()) { + return s; + } + } + + return Status::OK(); +} + // PRE: iter points to the first merge type entry // POST: iter points to the first entry beyond the merge process (or the end) // keys_, operands_ are updated to reflect the merge result.
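// ---------------------------------------------------------------------------
// Illustrative sketch of how a merge operator can use the failure-scope
// plumbing above: returning false with op_failure_scope = kMustMerge asks
// callers such as MergeUntil() to keep the operands as operands (surfaced as
// Status::MergeInProgress()) rather than failing the whole operation with
// Corruption. Assumes fixed64-encoded values and the PutFixed64 /
// DecodeFixed64 helpers from util/coding.h; the class itself is hypothetical.
class StrictCounterOperator : public MergeOperator {
 public:
  bool FullMergeV2(const MergeOperationInput& merge_in,
                   MergeOperationOutput* merge_out) const override {
    if (merge_in.existing_value &&
        merge_in.existing_value->size() != sizeof(uint64_t)) {
      // The base value is malformed; only entries that must be merged with
      // it are affected, so scope the failure accordingly.
      merge_out->op_failure_scope = OpFailureScope::kMustMerge;
      return false;
    }
    uint64_t sum = merge_in.existing_value
                       ? DecodeFixed64(merge_in.existing_value->data())
                       : 0;
    for (const Slice& operand : merge_in.operand_list) {
      sum += DecodeFixed64(operand.data());  // assumes fixed64 operands
    }
    merge_out->new_value.clear();
    PutFixed64(&merge_out->new_value, sum);
    return true;
  }
  const char* Name() const override { return "StrictCounterOperator"; }
};
// ---------------------------------------------------------------------------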
@@ -125,6 +187,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, const bool at_bottom, const bool allow_data_in_errors, const BlobFetcher* blob_fetcher, + const std::string* const full_history_ts_low, PrefetchBufferCollection* prefetch_buffers, CompactionIterationStats* c_iter_stats) { // Get a copy of the internal key, before it's invalidated by iter->Next() @@ -134,6 +197,12 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, merge_context_.Clear(); has_compaction_filter_skip_until_ = false; assert(user_merge_operator_); + assert(user_comparator_); + const size_t ts_sz = user_comparator_->timestamp_size(); + if (full_history_ts_low) { + assert(ts_sz > 0); + assert(ts_sz == full_history_ts_low->size()); + } bool first_key = true; // We need to parse the internal key again as the parsed key is @@ -153,7 +222,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, assert(s.ok()); if (!s.ok()) return s; + assert(kTypeMerge == orig_ikey.type); + bool hit_the_next_user_key = false; + int cmp_with_full_history_ts_low = 0; for (; iter->Valid(); iter->Next(), original_key_is_iter = false) { if (IsShuttingDown()) { s = Status::ShutdownInProgress(); @@ -165,6 +237,14 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, Status pik_status = ParseInternalKey(iter->key(), &ikey, allow_data_in_errors); + Slice ts; + if (pik_status.ok()) { + ts = ExtractTimestampFromUserKey(ikey.user_key, ts_sz); + if (full_history_ts_low) { + cmp_with_full_history_ts_low = + user_comparator_->CompareTimestamp(ts, *full_history_ts_low); + } + } if (!pik_status.ok()) { // stop at corrupted key if (assert_valid_internal_key_) { @@ -172,10 +252,18 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, } break; } else if (first_key) { + // If user-defined timestamp is enabled, we expect both user key and + // timestamps are equal, as a sanity check. assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)); first_key = false; - } else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) { - // hit a different user key, stop right here + } else if (!user_comparator_->EqualWithoutTimestamp(ikey.user_key, + orig_ikey.user_key) || + (ts_sz > 0 && + !user_comparator_->Equal(ikey.user_key, orig_ikey.user_key) && + cmp_with_full_history_ts_low >= 0)) { + // 1) hit a different user key, or + // 2) user-defined timestamp is enabled, and hit a version of user key NOT + // eligible for GC, then stop right here. hit_the_next_user_key = true; break; } else if (stop_before > 0 && ikey.sequence <= stop_before && @@ -205,78 +293,97 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, return s; } - // TODO(noetzli) If the merge operator returns false, we are currently - // (almost) silently dropping the put/delete. That's probably not what we - // want. Also if we're in compaction and it's a put, it would be nice to - // run compaction filter on it. 
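// ---------------------------------------------------------------------------
// The stopping rule introduced above, restated as a standalone predicate
// (sketch; the function name is illustrative): with user-defined timestamps,
// older versions of a key may only be folded into the merge once their
// timestamps drop below full_history_ts_low, i.e. once they are eligible for
// history GC. Otherwise MergeUntil() must stop at the version boundary so
// each timestamped version stays individually readable.
bool EligibleForMergeFold(const Comparator* user_comparator, const Slice& ts,
                          const std::string* full_history_ts_low) {
  if (full_history_ts_low == nullptr) {
    // No cutoff configured: cmp_with_full_history_ts_low stays 0 above, and
    // versions with distinct timestamps are never folded together.
    return false;
  }
  return user_comparator->CompareTimestamp(ts, *full_history_ts_low) < 0;
}
// ---------------------------------------------------------------------------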
- const Slice val = iter->value(); - PinnableSlice blob_value; - const Slice* val_ptr; - if ((kTypeValue == ikey.type || kTypeBlobIndex == ikey.type || - kTypeWideColumnEntity == ikey.type) && - (range_del_agg == nullptr || - !range_del_agg->ShouldDelete( - ikey, RangeDelPositioningMode::kForwardTraversal))) { - if (ikey.type == kTypeWideColumnEntity) { - // TODO: support wide-column entities - return Status::NotSupported( - "Merge currently not supported for wide-column entities"); - } else if (ikey.type == kTypeBlobIndex) { - BlobIndex blob_index; - - s = blob_index.DecodeFrom(val); - if (!s.ok()) { - return s; - } - - FilePrefetchBuffer* prefetch_buffer = - prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer( - blob_index.file_number()) - : nullptr; - - uint64_t bytes_read = 0; - - assert(blob_fetcher); - - s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, - prefetch_buffer, &blob_value, - &bytes_read); - if (!s.ok()) { - return s; - } - - val_ptr = &blob_value; - - if (c_iter_stats) { - ++c_iter_stats->num_blobs_read; - c_iter_stats->total_blob_bytes_read += bytes_read; - } - } else { - val_ptr = &val; + // TODO: if we're in compaction and it's a put, it would be nice to run + // compaction filter on it. + std::string merge_result; + MergeOperator::OpFailureScope op_failure_scope; + + if (range_del_agg && + range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal)) { + s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } else if (ikey.type == kTypeValue) { + const Slice val = iter->value(); + + s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } else if (ikey.type == kTypeBlobIndex) { + BlobIndex blob_index; + + s = blob_index.DecodeFrom(iter->value()); + if (!s.ok()) { + return s; + } + + FilePrefetchBuffer* prefetch_buffer = + prefetch_buffers ? 
prefetch_buffers->GetOrCreatePrefetchBuffer( + blob_index.file_number()) + : nullptr; + + uint64_t bytes_read = 0; + + assert(blob_fetcher); + + PinnableSlice blob_value; + s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, prefetch_buffer, + &blob_value, &bytes_read); + if (!s.ok()) { + return s; } + + if (c_iter_stats) { + ++c_iter_stats->num_blobs_read; + c_iter_stats->total_blob_bytes_read += bytes_read; + } + + s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); + } else if (ikey.type == kTypeWideColumnEntity) { + s = TimedFullMergeWithEntity( + user_merge_operator_, ikey.user_key, iter->value(), + merge_context_.GetOperands(), &merge_result, logger_, stats_, + clock_, /* update_num_ops_stats */ false, &op_failure_scope); } else { - val_ptr = nullptr; + s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr, + merge_context_.GetOperands(), &merge_result, logger_, + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); } - std::string merge_result; - s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, - merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) if (s.ok()) { // The original key encountered original_key = std::move(keys_.back()); - orig_ikey.type = kTypeValue; + orig_ikey.type = ikey.type == kTypeWideColumnEntity + ? kTypeWideColumnEntity + : kTypeValue; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); keys_.clear(); merge_context_.Clear(); keys_.emplace_front(std::move(original_key)); merge_context_.PushOperand(merge_result); - } - // move iter to the next entry - iter->Next(); + // move iter to the next entry + iter->Next(); + } else if (op_failure_scope == + MergeOperator::OpFailureScope::kMustMerge) { + // Change to `Status::MergeInProgress()` to denote output consists of + // merge operands only. Leave `iter` at the non-merge entry so it will + // be output after. + s = Status::MergeInProgress(); + } return s; } else { // hit a merge @@ -308,9 +415,9 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, filter == CompactionFilter::Decision::kChangeValue) { if (original_key_is_iter) { // this is just an optimization that saves us one memcpy - keys_.push_front(std::move(original_key)); + keys_.emplace_front(original_key); } else { - keys_.push_front(iter->key().ToString()); + keys_.emplace_front(iter->key().ToString()); } if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by @@ -323,7 +430,8 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (filter == CompactionFilter::Decision::kKeep) { merge_context_.PushOperand( value_slice, iter->IsValuePinned() /* operand_pinned */); - } else { // kChangeValue + } else { + assert(filter == CompactionFilter::Decision::kChangeValue); // Compaction filter asked us to change the operand from value_slice // to compaction_filter_value_. 
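// ---------------------------------------------------------------------------
// For context, the kKeep / kChangeValue decisions handled here come from a
// compaction filter invoked on each merge operand. A minimal filter that
// exercises the kChangeValue branch below (sketch; the class and the
// "secret:" convention are hypothetical):
class RedactingMergeOperandFilter : public CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType value_type,
                    const Slice& existing_value, std::string* new_value,
                    std::string* /*skip_until*/) const override {
    if (value_type == ValueType::kMergeOperand &&
        existing_value.starts_with("secret:")) {
      *new_value = "[redacted]";  // stored as compaction_filter_value_
      return Decision::kChangeValue;
    }
    return Decision::kKeep;
  }
  const char* Name() const override { return "RedactingMergeOperandFilter"; }
};
// ---------------------------------------------------------------------------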
merge_context_.PushOperand(compaction_filter_value_, false); @@ -339,6 +447,18 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, } } + if (cmp_with_full_history_ts_low >= 0) { + size_t num_merge_operands = merge_context_.GetNumOperands(); + if (ts_sz && num_merge_operands > 1) { + // We do not merge merge operands with different timestamps if they are + // not eligible for GC. + ROCKS_LOG_ERROR(logger_, "ts_sz=%d, %d merge operands", + static_cast<int>(ts_sz), + static_cast<int>(num_merge_operands)); + assert(false); + } + } + if (merge_context_.GetNumOperands() == 0) { // we filtered out all the merge operands return s; @@ -352,6 +472,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // AND // we have either encountered another key or end of key history on this // layer. + // Note that if user-defined timestamp is enabled, we need some extra caution + // here: if full_history_ts_low is nullptr, or it's not null but the key's + // timestamp is greater than or equal to full_history_ts_low, it means this + // key cannot be dropped. We may not have seen the beginning of the key. // // When these conditions are true we are able to merge all the keys // using full merge. @@ -361,7 +485,8 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // sure that all merge-operands on the same level get compacted together, // this will simply lead to these merge operands moving to the next level. bool surely_seen_the_beginning = - (hit_the_next_user_key || !iter->Valid()) && at_bottom; + (hit_the_next_user_key || !iter->Valid()) && at_bottom && + (ts_sz == 0 || cmp_with_full_history_ts_low < 0); if (surely_seen_the_beginning) { // do a final merge with nullptr as the existing value and say // bye to the merge type (it's now converted to a Put) @@ -369,9 +494,12 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, assert(merge_context_.GetNumOperands() >= 1); assert(merge_context_.GetNumOperands() == keys_.size()); std::string merge_result; + MergeOperator::OpFailureScope op_failure_scope; s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_); + stats_, clock_, + /* result_operand */ nullptr, + /* update_num_ops_stats */ false, &op_failure_scope); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of @@ -383,6 +511,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, merge_context_.Clear(); keys_.emplace_front(std::move(original_key)); merge_context_.PushOperand(merge_result); + } else if (op_failure_scope == MergeOperator::OpFailureScope::kMustMerge) { + // Change to `Status::MergeInProgress()` to denote output consists of + // merge operands only. + s = Status::MergeInProgress(); } } else { // We haven't seen the beginning of the key nor a Put/Delete. diff --git a/db/merge_helper.h b/db/merge_helper.h index ae426280664..7f624b74328 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -16,6 +16,7 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" +#include "rocksdb/wide_columns.h" #include "util/stop_watch.h" namespace ROCKSDB_NAMESPACE { @@ -47,22 +48,36 @@ class MergeHelper { // the latency is sensitive. // Returns one of the following statuses: // - OK: Entries were successfully merged. - // - Corruption: Merge operator reported unsuccessful merge. + // - Corruption: Merge operator reported unsuccessful merge.
The scope of the + // damage will be stored in `*op_failure_scope` when `op_failure_scope` is + // not nullptr. static Status TimedFullMerge(const MergeOperator* merge_operator, const Slice& key, const Slice* value, const std::vector<Slice>& operands, std::string* result, Logger* logger, Statistics* statistics, SystemClock* clock, - Slice* result_operand = nullptr, - bool update_num_ops_stats = false); + Slice* result_operand, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope); - // Merge entries until we hit + static Status TimedFullMergeWithEntity( + const MergeOperator* merge_operator, const Slice& key, Slice base_entity, + const std::vector<Slice>& operands, std::string* result, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope); + + // During compaction, merge entries until we hit // - a corrupted key // - a Put/Delete, // - a different user key, // - a specific sequence number (snapshot boundary), // - REMOVE_AND_SKIP_UNTIL returned from compaction filter, // or - the end of iteration + // + // The result(s) of the merge can be accessed in `MergeHelper::keys()` and + // `MergeHelper::values()`, which are invalidated the next time `MergeUntil()` + // is called. `MergeOutputIterator` is specially designed to iterate the + // results of a `MergeHelper`'s most recent `MergeUntil()`. + // // iter: (IN) points to the first merge type entry // (OUT) points to the first entry not included in the merge process // range_del_agg: (IN) filters merge operands covered by range tombstones. @@ -79,8 +94,7 @@ class MergeHelper { // // Returns one of the following statuses: // - OK: Entries were successfully merged. - // - MergeInProgress: Put/Delete not encountered, and didn't reach the start - // of key's history. Output consists of merge operands only. + // - MergeInProgress: Output consists of merge operands only. // - Corruption: Merge operator reported unsuccessful merge or a corrupted // key has been encountered and not expected (applies only when compiling // with asserts removed). @@ -92,6 +106,7 @@ class MergeHelper { const SequenceNumber stop_before, const bool at_bottom, const bool allow_data_in_errors, const BlobFetcher* blob_fetcher, + const std::string* const full_history_ts_low, PrefetchBufferCollection* prefetch_buffers, CompactionIterationStats* c_iter_stats); @@ -99,6 +114,7 @@ class MergeHelper { // in the constructor. Returns the decision that the filter made. // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the // optional outputs of compaction filter. + // user_key includes timestamp if user-defined timestamp is enabled. CompactionFilter::Decision FilterMerge(const Slice& user_key, const Slice& value_slice); @@ -156,7 +172,7 @@ class MergeHelper { const CompactionFilter* compaction_filter_; const std::atomic<bool>* shutting_down_; Logger* logger_; - bool assert_valid_internal_key_; // enforce no internal key corruption? + bool assert_valid_internal_key_; // enforce no internal key corruption?
bool allow_single_operand_; SequenceNumber latest_snapshot_; const SnapshotChecker* const snapshot_checker_; diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index b3fd9a07413..05408d5b96d 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -35,7 +35,8 @@ class MergeHelperTest : public testing::Test { return merge_helper_->MergeUntil( iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom, false /* allow_data_in_errors */, nullptr /* blob_fetcher */, - nullptr /* prefetch_buffers */, nullptr /* c_iter_stats */); + nullptr /* full_history_ts_low */, nullptr /* prefetch_buffers */, + nullptr /* c_iter_stats */); } void AddKeyVal(const std::string& user_key, const SequenceNumber& seq, diff --git a/db/merge_operator.cc b/db/merge_operator.cc index 75dea432cad..d325856406f 100644 --- a/db/merge_operator.cc +++ b/db/merge_operator.cc @@ -74,12 +74,11 @@ bool AssociativeMergeOperator::FullMergeV2( // Call the user defined simple merge on the operands; // NOTE: It is assumed that the client's merge-operator will handle any errors. -bool AssociativeMergeOperator::PartialMerge( - const Slice& key, - const Slice& left_operand, - const Slice& right_operand, - std::string* new_value, - Logger* logger) const { +bool AssociativeMergeOperator::PartialMerge(const Slice& key, + const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* logger) const { return Merge(key, &left_operand, right_operand, new_value, logger); } diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 8fc47e3f5ce..8e9f28f65aa 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -10,10 +10,12 @@ #ifndef ROCKSDB_LITE #include + #include #include #include #include + #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/version_set.h" @@ -28,7 +30,6 @@ #include "test_util/testutil.h" #include "util/string_util.h" - namespace ROCKSDB_NAMESPACE { class ObsoleteFilesTest : public DBTestBase { @@ -40,7 +41,7 @@ class ObsoleteFilesTest : public DBTestBase { void AddKeys(int numkeys, int startkey) { WriteOptions options; options.sync = false; - for (int i = startkey; i < (numkeys + startkey) ; i++) { + for (int i = startkey; i < (numkeys + startkey); i++) { std::string temp = std::to_string(i); Slice key(temp); Slice value(temp); @@ -117,7 +118,7 @@ TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"}, {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"}, - }); + }); SyncPoint::GetInstance()->SetCallBack( "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) { Status* p_status = reinterpret_cast(arg); diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 283d1934404..eb02e6ca4f1 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -59,7 +59,7 @@ void VerifyOptionsFileName( } } } -} // namespace +} // anonymous namespace TEST_F(OptionsFileTest, NumberOfOptionsFiles) { const int kReopenCount = 20; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 0416dee08bd..454d12dc584 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -39,31 +39,31 @@ const std::string kDbName = namespace ROCKSDB_NAMESPACE { std::shared_ptr OpenDb(bool read_only = false) { - DB* db; - Options options; - options.create_if_missing = true; - options.max_open_files = -1; - options.write_buffer_size = FLAGS_write_buffer_size; - options.max_write_buffer_number = 
FLAGS_max_write_buffer_number; - options.min_write_buffer_number_to_merge = + DB* db; + Options options; + options.create_if_missing = true; + options.max_open_files = -1; + options.write_buffer_size = FLAGS_write_buffer_size; + options.max_write_buffer_number = FLAGS_max_write_buffer_number; + options.min_write_buffer_number_to_merge = FLAGS_min_write_buffer_number_to_merge; - if (FLAGS_use_set_based_memetable) { + if (FLAGS_use_set_based_memetable) { #ifndef ROCKSDB_LITE - options.prefix_extractor.reset( - ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0)); - options.memtable_factory.reset(NewHashSkipListRepFactory()); + options.prefix_extractor.reset( + ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); #endif // ROCKSDB_LITE - } + } - Status s; - if (!read_only) { - s = DB::Open(options, kDbName, &db); - } else { - s = DB::OpenForReadOnly(options, kDbName, &db); - } - EXPECT_OK(s); - return std::shared_ptr(db); + Status s; + if (!read_only) { + s = DB::Open(options, kDbName, &db); + } else { + s = DB::OpenForReadOnly(options, kDbName, &db); + } + EXPECT_OK(s); + return std::shared_ptr(db); } class PerfContextTest : public testing::Test {}; @@ -81,7 +81,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { ASSERT_OK(db->Put(write_options, key, value)); } - for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { + for (int i = 0; i < FLAGS_total_keys - 1; ++i) { std::string key = "k" + std::to_string(i); ASSERT_OK(db->Delete(write_options, key)); } @@ -103,8 +103,9 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { } if (FLAGS_verbose) { - std::cout << "Get user key comparison: \n" << hist_get.ToString() - << "Get time: \n" << hist_get_time.ToString(); + std::cout << "Get user key comparison: \n" + << hist_get.ToString() << "Get time: \n" + << hist_get_time.ToString(); } { @@ -139,7 +140,8 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { hist_seek.Add(get_perf_context()->user_key_comparison_count); if (FLAGS_verbose) { std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count - << " ikey skipped " << get_perf_context()->internal_key_skipped_count + << " ikey skipped " + << get_perf_context()->internal_key_skipped_count << " idelete skipped " << get_perf_context()->internal_delete_skipped_count << " elapsed: " << elapsed_nanos << "ns\n"; @@ -322,7 +324,8 @@ void ProfileQueries(bool enabled_time = false) { hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); - hist_mget_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count); + hist_mget_num_memtable_checked.Add( + get_perf_context()->get_from_memtable_count); hist_mget_post_process.Add(get_perf_context()->get_post_process_time); hist_mget.Add(get_perf_context()->user_key_comparison_count); } @@ -337,12 +340,14 @@ void ProfileQueries(bool enabled_time = false) { << hist_write_wal_time.ToString() << "\n" << " Writing Mem Table time: \n" << hist_write_memtable_time.ToString() << "\n" - << " Write Delay: \n" << hist_write_delay_time.ToString() << "\n" + << " Write Delay: \n" + << hist_write_delay_time.ToString() << "\n" << " Waiting for Batch time: \n" << hist_write_thread_wait_nanos.ToString() << "\n" << " Scheduling Flushes and Compactions Time: \n" << hist_write_scheduling_time.ToString() << "\n" - << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n"; + << " Total DB mutex nanos: \n" + << total_db_mutex_nanos << "\n"; 
std::cout << "Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString() @@ -352,8 +357,8 @@ void ProfileQueries(bool enabled_time = false) { << hist_get_files.ToString() << "\n" << " Number of memtables checked: \n" << hist_num_memtable_checked.ToString() << "\n" - << " Time to post process: \n" << hist_get_post_process.ToString() - << "\n"; + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; std::cout << "MultiGet(): Time to get snapshot: \n" << hist_mget_snapshot.ToString() @@ -440,7 +445,8 @@ void ProfileQueries(bool enabled_time = false) { hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); - hist_mget_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count); + hist_mget_num_memtable_checked.Add( + get_perf_context()->get_from_memtable_count); hist_mget_post_process.Add(get_perf_context()->get_post_process_time); hist_mget.Add(get_perf_context()->user_key_comparison_count); } @@ -459,8 +465,8 @@ void ProfileQueries(bool enabled_time = false) { << hist_get_files.ToString() << "\n" << " Number of memtables checked: \n" << hist_num_memtable_checked.ToString() << "\n" - << " Time to post process: \n" << hist_get_post_process.ToString() - << "\n"; + << " Time to post process: \n" + << hist_get_post_process.ToString() << "\n"; std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n" << hist_mget_snapshot.ToString() @@ -556,7 +562,8 @@ TEST_F(PerfContextTest, SeekKeyComparison) { } if (FLAGS_verbose) { - std::cout << "Put time:\n" << hist_put_time.ToString() << "WAL time:\n" + std::cout << "Put time:\n" + << hist_put_time.ToString() << "WAL time:\n" << hist_wal_time.ToString() << "time diff:\n" << hist_time_diff.ToString(); } @@ -584,7 +591,8 @@ TEST_F(PerfContextTest, SeekKeyComparison) { } ASSERT_OK(iter->status()); if (FLAGS_verbose) { - std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n" + std::cout << "Seek:\n" + << hist_seek.ToString() << "Next:\n" << hist_next.ToString(); } } @@ -614,7 +622,7 @@ TEST_F(PerfContextTest, DBMutexLockCounter) { SystemClock::Default()->SleepForMicroseconds(100); mutex.Unlock(); child_thread.join(); - } + } } } @@ -806,14 +814,18 @@ TEST_F(PerfContextTest, PerfContextByLevelGetSet) { .bloom_filter_full_positive); ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2] .bloom_filter_full_true_positive); - ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0] - .block_cache_hit_count); - ASSERT_EQ(5, (*(get_perf_context()->level_to_perf_context))[2] - .block_cache_hit_count); - ASSERT_EQ(2, (*(get_perf_context()->level_to_perf_context))[3] - .block_cache_miss_count); - ASSERT_EQ(4, (*(get_perf_context()->level_to_perf_context))[1] - .block_cache_miss_count); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[0].block_cache_hit_count); + ASSERT_EQ( + 5, + (*(get_perf_context()->level_to_perf_context))[2].block_cache_hit_count); + ASSERT_EQ( + 2, + (*(get_perf_context()->level_to_perf_context))[3].block_cache_miss_count); + ASSERT_EQ( + 4, + (*(get_perf_context()->level_to_perf_context))[1].block_cache_miss_count); std::string zero_excluded = get_perf_context()->ToString(true); ASSERT_NE(std::string::npos, zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7")); diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc index 4abea4d5ec7..73c13fa1384 100644 --- 
a/db/periodic_task_scheduler_test.cc +++ b/db/periodic_task_scheduler_test.cc @@ -65,7 +65,7 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); ASSERT_GT(kPeriodSec, 1u); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); }); @@ -77,14 +77,14 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { ASSERT_EQ(1, pst_st_counter); ASSERT_EQ(1, flush_info_log_counter); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(2, dump_st_counter); ASSERT_EQ(2, pst_st_counter); ASSERT_EQ(2, flush_info_log_counter); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(3, dump_st_counter); @@ -98,7 +98,7 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec); // Info log flush should still run. - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(3, dump_st_counter); ASSERT_EQ(3, pst_st_counter); @@ -113,7 +113,7 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) { ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum()); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(4, dump_st_counter); ASSERT_EQ(3, pst_st_counter); @@ -154,19 +154,19 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) { ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum()); int expected_run = kInstanceNum; - dbi->TEST_WaitForPeridicTaskRun( + dbi->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); expected_run += kInstanceNum; - dbi->TEST_WaitForPeridicTaskRun( + dbi->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); expected_run += kInstanceNum; - dbi->TEST_WaitForPeridicTaskRun( + dbi->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); @@ -178,9 +178,9 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) { expected_run += (kInstanceNum - half) * 2; - dbi->TEST_WaitForPeridicTaskRun( + dbi->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); - dbi->TEST_WaitForPeridicTaskRun( + dbi->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index a38ed8742c9..755b639b07f 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -38,7 +38,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" - namespace ROCKSDB_NAMESPACE { class PlainTableKeyDecoderTest : public testing::Test {}; @@ -148,9 +147,7 @@ class PlainTableDBTest : public testing::Test, DBImpl* dbfull() { return static_cast_with_check(db_); } - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); - } + void Reopen(Options* options = 
nullptr) { ASSERT_OK(TryReopen(options)); } void Close() { delete db_; @@ -160,7 +157,7 @@ class PlainTableDBTest : public testing::Test, bool mmap_mode() const { return mmap_mode_; } void DestroyAndReopen(Options* options = nullptr) { - //Destroy using last options + // Destroy using last options Destroy(&last_options_); ASSERT_OK(TryReopen(options)); } @@ -200,9 +197,7 @@ class PlainTableDBTest : public testing::Test, return db_->Put(WriteOptions(), k, v); } - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } + Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); } std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { ReadOptions options; @@ -217,7 +212,6 @@ class PlainTableDBTest : public testing::Test, return result; } - int NumTableFilesAtLevel(int level) { std::string property; EXPECT_TRUE(db_->GetProperty( @@ -448,99 +442,100 @@ TEST_P(PlainTableDBTest, Flush) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { for (EncodingType encoding_type : {kPlain, kPrefix}) { - for (int bloom = -1; bloom <= 117; bloom += 117) { - const int bloom_bits = std::max(bloom, 0); - const bool full_scan_mode = bloom < 0; - for (int total_order = 0; total_order <= 1; total_order++) { - for (int store_index_in_file = 0; store_index_in_file <= 1; - ++store_index_in_file) { - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. - // Test index interval for the same prefix to be 1, 2 and 4 - if (total_order) { - options.prefix_extractor.reset(); - - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 0; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.hash_table_ratio = 0; - plain_table_options.index_sparseness = 2; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = full_scan_mode; - plain_table_options.store_index_in_file = store_index_in_file; - - options.table_factory.reset( - NewPlainTableFactory(plain_table_options)); - } else { - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 0; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.hash_table_ratio = 0.75; - plain_table_options.index_sparseness = 16; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - plain_table_options.full_scan_mode = full_scan_mode; - plain_table_options.store_index_in_file = store_index_in_file; - - options.table_factory.reset( - NewPlainTableFactory(plain_table_options)); - } - DestroyAndReopen(&options); - uint64_t int_num; - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); - - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("1000000000000foo", "v3")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_GT(int_num, 0U); - - TablePropertiesCollection ptc; - ASSERT_OK( - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); - ASSERT_EQ(1U, ptc.size()); - auto row = ptc.begin(); - auto tp = row->second; - - if (full_scan_mode) { - // Does not support Get/Seek - std::unique_ptr iter(dbfull()->NewIterator(ReadOptions())); - 
iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("0000000000000bar", iter->key().ToString()); - ASSERT_EQ("v2", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000000foo", iter->key().ToString()); - ASSERT_EQ("v3", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - ASSERT_TRUE(iter->status().ok()); - } else { - if (!store_index_in_file) { - ASSERT_EQ(total_order ? "4" : "12", - (tp->user_collected_properties) - .at("plain_table_hash_table_size")); - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_sub_index_size")); + for (int bloom = -1; bloom <= 117; bloom += 117) { + const int bloom_bits = std::max(bloom, 0); + const bool full_scan_mode = bloom < 0; + for (int total_order = 0; total_order <= 1; total_order++) { + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor.reset(); + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.full_scan_mode = full_scan_mode; + plain_table_options.store_index_in_file = store_index_in_file; + + options.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } else { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 0; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.full_scan_mode = full_scan_mode; + plain_table_options.store_index_in_file = store_index_in_file; + + options.table_factory.reset( + NewPlainTableFactory(plain_table_options)); + } + DestroyAndReopen(&options); + uint64_t int_num; + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(Put("0000000000000bar", "v2")); + ASSERT_OK(Put("1000000000000foo", "v3")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + TablePropertiesCollection ptc; + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables( + &ptc)); + ASSERT_EQ(1U, ptc.size()); + auto row = ptc.begin(); + auto tp = row->second; + + if (full_scan_mode) { + // Does not support Get/Seek + std::unique_ptr iter( + dbfull()->NewIterator(ReadOptions())); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("0000000000000bar", iter->key().ToString()); + ASSERT_EQ("v2", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000000foo", iter->key().ToString()); + ASSERT_EQ("v3", iter->value().ToString()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().ok()); } else { - ASSERT_EQ("0", (tp->user_collected_properties) - .at("plain_table_hash_table_size")); - ASSERT_EQ("0", 
(tp->user_collected_properties) - .at("plain_table_sub_index_size")); + if (!store_index_in_file) { + ASSERT_EQ(total_order ? "4" : "12", + (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } else { + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_hash_table_size")); + ASSERT_EQ("0", (tp->user_collected_properties) + .at("plain_table_sub_index_size")); + } + ASSERT_EQ("v3", Get("1000000000000foo")); + ASSERT_EQ("v2", Get("0000000000000bar")); } - ASSERT_EQ("v3", Get("1000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); } } - } } } } @@ -550,79 +545,79 @@ TEST_P(PlainTableDBTest, Flush2) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { for (EncodingType encoding_type : {kPlain, kPrefix}) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { - for (int total_order = 0; total_order <= 1; total_order++) { - for (int store_index_in_file = 0; store_index_in_file <= 1; - ++store_index_in_file) { - if (encoding_type == kPrefix && total_order) { - continue; - } - if (!bloom_bits && store_index_in_file) { - continue; - } - if (total_order && store_index_in_file) { - continue; - } - bool expect_bloom_not_match = false; - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. - // Test index interval for the same prefix to be 1, 2 and 4 - PlainTableOptions plain_table_options; - if (total_order) { - options.prefix_extractor = nullptr; - plain_table_options.hash_table_ratio = 0; - plain_table_options.index_sparseness = 2; - } else { - plain_table_options.hash_table_ratio = 0.75; - plain_table_options.index_sparseness = 16; - } - plain_table_options.user_key_len = kPlainTableVariableLength; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - plain_table_options.store_index_in_file = store_index_in_file; - options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options, - 0 /* column_family_id */, kDefaultColumnFamilyName)); - - DestroyAndReopen(&options); - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("1000000000000foo", "v1")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - - ASSERT_OK(Put("1000000000000foo", "v2")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ("v2", Get("1000000000000foo")); - - ASSERT_OK(Put("0000000000000eee", "v3")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ("v3", Get("0000000000000eee")); - - ASSERT_OK(Delete("0000000000000bar")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - - ASSERT_OK(Put("0000000000000eee", "v5")); - ASSERT_OK(Put("9000000000000eee", "v5")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ("v5", Get("0000000000000eee")); - - // Test Bloom Filter - if (bloom_bits > 0) { - // Neither key nor value should exist. - expect_bloom_not_match = true; - ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); - // Key doesn't exist any more but prefix exists. 
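// ---------------------------------------------------------------------------
// Context for the bloom assertions in this test (sketch): with
// bloom_bits_per_key > 0, PlainTable consults a bloom filter (over full keys
// in total-order mode, over prefixes otherwise) before touching the data, so
// a negative Get() can be answered by the filter alone; expect_bloom_not_match
// flips to true wherever such a filter-only miss is expected.
//
//   PlainTableOptions pto;
//   pto.bloom_bits_per_key = bloom_bits;  // > 0 enables the filter
//   options.table_factory.reset(NewPlainTableFactory(pto));
// ---------------------------------------------------------------------------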
- if (total_order) { - ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); - ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + for (int store_index_in_file = 0; store_index_in_file <= 1; + ++store_index_in_file) { + if (encoding_type == kPrefix && total_order) { + continue; + } + if (!bloom_bits && store_index_in_file) { + continue; + } + if (total_order && store_index_in_file) { + continue; + } + bool expect_bloom_not_match = false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + if (total_order) { + options.prefix_extractor = nullptr; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + } else { + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + } + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + plain_table_options.store_index_in_file = store_index_in_file; + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + + ASSERT_OK(Put("1000000000000foo", "v2")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v2", Get("1000000000000foo")); + + ASSERT_OK(Put("0000000000000eee", "v3")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v3", Get("0000000000000eee")); + + ASSERT_OK(Delete("0000000000000bar")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); + + ASSERT_OK(Put("0000000000000eee", "v5")); + ASSERT_OK(Put("9000000000000eee", "v5")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v5", Get("0000000000000eee")); + + // Test Bloom Filter + if (bloom_bits > 0) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar")); + // Key doesn't exist any more but prefix exists. + if (total_order) { + ASSERT_EQ("NOT_FOUND", Get("1000000000000not")); + ASSERT_EQ("NOT_FOUND", Get("0000000000000not")); + } + expect_bloom_not_match = false; + } } - expect_bloom_not_match = false; } } - } - } } } } @@ -675,129 +670,129 @@ TEST_P(PlainTableDBTest, Iterator) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { for (EncodingType encoding_type : {kPlain, kPrefix}) { - for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { - for (int total_order = 0; total_order <= 1; total_order++) { - if (encoding_type == kPrefix && total_order == 1) { - continue; - } - bool expect_bloom_not_match = false; - Options options = CurrentOptions(); - options.create_if_missing = true; - // Set only one bucket to force bucket conflict. 
- // Test index interval for the same prefix to be 1, 2 and 4 - if (total_order) { - options.prefix_extractor = nullptr; - - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 16; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.hash_table_ratio = 0; - plain_table_options.index_sparseness = 2; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - - options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options, - 0 /* column_family_id */, kDefaultColumnFamilyName)); - } else { - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = 16; - plain_table_options.bloom_bits_per_key = bloom_bits; - plain_table_options.hash_table_ratio = 0.75; - plain_table_options.index_sparseness = 16; - plain_table_options.huge_page_tlb_size = huge_page_tlb_size; - plain_table_options.encoding_type = encoding_type; - - options.table_factory.reset(new TestPlainTableFactory( - &expect_bloom_not_match, plain_table_options, - 0 /* column_family_id */, kDefaultColumnFamilyName)); - } - DestroyAndReopen(&options); - - ASSERT_OK(Put("1000000000foo002", "v_2")); - ASSERT_OK(Put("0000000000000bar", "random")); - ASSERT_OK(Put("1000000000foo001", "v1")); - ASSERT_OK(Put("3000000000000bar", "bar_v")); - ASSERT_OK(Put("1000000000foo003", "v__3")); - ASSERT_OK(Put("1000000000foo004", "v__4")); - ASSERT_OK(Put("1000000000foo005", "v__5")); - ASSERT_OK(Put("1000000000foo007", "v__7")); - ASSERT_OK(Put("1000000000foo008", "v__8")); - ASSERT_OK(dbfull()->TEST_FlushMemTable()); - ASSERT_EQ("v1", Get("1000000000foo001")); - ASSERT_EQ("v__3", Get("1000000000foo003")); - Iterator* iter = dbfull()->NewIterator(ReadOptions()); - iter->Seek("1000000000foo000"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo002", iter->key().ToString()); - ASSERT_EQ("v_2", iter->value().ToString()); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo003", iter->key().ToString()); - ASSERT_EQ("v__3", iter->value().ToString()); - - iter->Next(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo004", iter->key().ToString()); - ASSERT_EQ("v__4", iter->value().ToString()); - - iter->Seek("3000000000000bar"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("3000000000000bar", iter->key().ToString()); - ASSERT_EQ("bar_v", iter->value().ToString()); - - iter->Seek("1000000000foo000"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo001", iter->key().ToString()); - ASSERT_EQ("v1", iter->value().ToString()); - - iter->Seek("1000000000foo005"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo005", iter->key().ToString()); - ASSERT_EQ("v__5", iter->value().ToString()); - - iter->Seek("1000000000foo006"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo007", iter->key().ToString()); - ASSERT_EQ("v__7", iter->value().ToString()); - - iter->Seek("1000000000foo008"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("1000000000foo008", iter->key().ToString()); - ASSERT_EQ("v__8", iter->value().ToString()); - - if (total_order == 0) { - iter->Seek("1000000000foo009"); + for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) { + for (int total_order = 0; total_order <= 1; total_order++) { + if (encoding_type == kPrefix && total_order == 1) { + continue; + } + bool expect_bloom_not_match = 
false; + Options options = CurrentOptions(); + options.create_if_missing = true; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + if (total_order) { + options.prefix_extractor = nullptr; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 2; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + } else { + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = 16; + plain_table_options.bloom_bits_per_key = bloom_bits; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.huge_page_tlb_size = huge_page_tlb_size; + plain_table_options.encoding_type = encoding_type; + + options.table_factory.reset(new TestPlainTableFactory( + &expect_bloom_not_match, plain_table_options, + 0 /* column_family_id */, kDefaultColumnFamilyName)); + } + DestroyAndReopen(&options); + + ASSERT_OK(Put("1000000000foo002", "v_2")); + ASSERT_OK(Put("0000000000000bar", "random")); + ASSERT_OK(Put("1000000000foo001", "v1")); + ASSERT_OK(Put("3000000000000bar", "bar_v")); + ASSERT_OK(Put("1000000000foo003", "v__3")); + ASSERT_OK(Put("1000000000foo004", "v__4")); + ASSERT_OK(Put("1000000000foo005", "v__5")); + ASSERT_OK(Put("1000000000foo007", "v__7")); + ASSERT_OK(Put("1000000000foo008", "v__8")); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_EQ("v1", Get("1000000000foo001")); + ASSERT_EQ("v__3", Get("1000000000foo003")); + Iterator* iter = dbfull()->NewIterator(ReadOptions()); + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo002", iter->key().ToString()); + ASSERT_EQ("v_2", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo003", iter->key().ToString()); + ASSERT_EQ("v__3", iter->value().ToString()); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo004", iter->key().ToString()); + ASSERT_EQ("v__4", iter->value().ToString()); + + iter->Seek("3000000000000bar"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("3000000000000bar", iter->key().ToString()); - } + ASSERT_EQ("bar_v", iter->value().ToString()); - // Test Bloom Filter - if (bloom_bits > 0) { - if (!total_order) { - // Neither key nor value should exist. 
- expect_bloom_not_match = true; - iter->Seek("2not000000000bar"); - ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); - expect_bloom_not_match = false; - } else { - expect_bloom_not_match = true; - ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); - expect_bloom_not_match = false; + iter->Seek("1000000000foo000"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo001", iter->key().ToString()); + ASSERT_EQ("v1", iter->value().ToString()); + + iter->Seek("1000000000foo005"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo005", iter->key().ToString()); + ASSERT_EQ("v__5", iter->value().ToString()); + + iter->Seek("1000000000foo006"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo007", iter->key().ToString()); + ASSERT_EQ("v__7", iter->value().ToString()); + + iter->Seek("1000000000foo008"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("1000000000foo008", iter->key().ToString()); + ASSERT_EQ("v__8", iter->value().ToString()); + + if (total_order == 0) { + iter->Seek("1000000000foo009"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("3000000000000bar", iter->key().ToString()); + } + + // Test Bloom Filter + if (bloom_bits > 0) { + if (!total_order) { + // Neither key nor value should exist. + expect_bloom_not_match = true; + iter->Seek("2not000000000bar"); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } else { + expect_bloom_not_match = true; + ASSERT_EQ("NOT_FOUND", Get("2not000000000bar")); + expect_bloom_not_match = false; + } } + ASSERT_OK(iter->status()); + delete iter; } - ASSERT_OK(iter->status()); - delete iter; } } - } } } @@ -863,7 +858,7 @@ namespace { std::string MakeLongKey(size_t length, char c) { return std::string(length, c); } -} // namespace +} // anonymous namespace TEST_P(PlainTableDBTest, IteratorLargeKeys) { Options options = CurrentOptions(); @@ -878,15 +873,10 @@ TEST_P(PlainTableDBTest, IteratorLargeKeys) { options.prefix_extractor.reset(); DestroyAndReopen(&options); - std::string key_list[] = { - MakeLongKey(30, '0'), - MakeLongKey(16, '1'), - MakeLongKey(32, '2'), - MakeLongKey(60, '3'), - MakeLongKey(90, '4'), - MakeLongKey(50, '5'), - MakeLongKey(26, '6') - }; + std::string key_list[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'), + MakeLongKey(32, '2'), MakeLongKey(60, '3'), + MakeLongKey(90, '4'), MakeLongKey(50, '5'), + MakeLongKey(26, '6')}; for (size_t i = 0; i < 7; i++) { ASSERT_OK(Put(key_list[i], std::to_string(i))); @@ -913,7 +903,7 @@ namespace { std::string MakeLongKeyWithPrefix(size_t length, char c) { return "00000000" + std::string(length - 8, c); } -} // namespace +} // anonymous namespace TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) { Options options = CurrentOptions(); @@ -1275,7 +1265,7 @@ TEST_P(PlainTableDBTest, CompactionTrigger) { Random rnd(301); for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; - num++) { + num++) { std::vector values; // Write 120KB (10 values, each 12K) for (int i = 0; i < 10; i++) { @@ -1287,7 +1277,7 @@ TEST_P(PlainTableDBTest, CompactionTrigger) { ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); } - //generate one more file in level-0, and should trigger level-0 compaction + // generate one more file in level-0, and should trigger level-0 compaction std::vector values; for (int i = 0; i < 12; i++) { values.push_back(rnd.RandomString(10000)); @@ -1315,8 +1305,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) { options.create_if_missing = false; std::shared_ptr block_based_factory( 
NewBlockBasedTableFactory()); - std::shared_ptr plain_table_factory( - NewPlainTableFactory()); + std::shared_ptr plain_table_factory(NewPlainTableFactory()); std::shared_ptr dummy_factory; options.table_factory.reset(NewAdaptiveTableFactory( block_based_factory, block_based_factory, plain_table_factory)); diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 74f0ce6be9a..8592b8f313c 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -69,7 +69,7 @@ struct TestKey { }; // return a slice backed by test_key -inline Slice TestKeyToSlice(std::string &s, const TestKey& test_key) { +inline Slice TestKeyToSlice(std::string& s, const TestKey& test_key) { s.clear(); PutFixed64(&s, test_key.prefix); PutFixed64(&s, test_key.sorted); @@ -77,20 +77,18 @@ inline Slice TestKeyToSlice(std::string &s, const TestKey& test_key) { } inline const TestKey SliceToTestKey(const Slice& slice) { - return TestKey(DecodeFixed64(slice.data()), - DecodeFixed64(slice.data() + 8)); + return TestKey(DecodeFixed64(slice.data()), DecodeFixed64(slice.data() + 8)); } class TestKeyComparator : public Comparator { public: - // Compare needs to be aware of the possibility of a and/or b is // prefix only int Compare(const Slice& a, const Slice& b) const override { const TestKey kkey_a = SliceToTestKey(a); const TestKey kkey_b = SliceToTestKey(b); - const TestKey *key_a = &kkey_a; - const TestKey *key_b = &kkey_b; + const TestKey* key_a = &kkey_a; + const TestKey* key_b = &kkey_b; if (key_a->prefix != key_b->prefix) { if (key_a->prefix < key_b->prefix) return -1; if (key_a->prefix > key_b->prefix) return 1; @@ -215,7 +213,7 @@ class SamePrefixTransform : public SliceTransform { bool FullLengthEnabled(size_t* /*len*/) const override { return false; } }; -} // namespace +} // anonymous namespace class PrefixTest : public testing::Test { public: @@ -226,7 +224,7 @@ class PrefixTest : public testing::Test { options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = - FLAGS_min_write_buffer_number_to_merge; + FLAGS_min_write_buffer_number_to_merge; options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_prefix_bloom_size_ratio; @@ -239,21 +237,19 @@ class PrefixTest : public testing::Test { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.allow_concurrent_memtable_write = false; - Status s = DB::Open(options, kDbName, &db); + Status s = DB::Open(options, kDbName, &db); EXPECT_OK(s); return std::shared_ptr(db); } - void FirstOption() { - option_config_ = kBegin; - } + void FirstOption() { option_config_ = kBegin; } bool NextOptions(int bucket_count) { // skip some options option_config_++; if (option_config_ < kEnd) { options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - switch(option_config_) { + switch (option_config_) { case kHashSkipList: options.memtable_factory.reset( NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height)); @@ -350,8 +346,7 @@ TEST_F(PrefixTest, TestResult) { FirstOption(); while (NextOptions(num_buckets)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() - << " number of buckets: " << num_buckets - << std::endl; + << " number of buckets: " << num_buckets << std::endl; ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; @@ -581,7 +576,7 @@ TEST_F(PrefixTest, PrefixValid) { TEST_F(PrefixTest, DynamicPrefixIterator) { while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << 
options.memtable_factory->Name() - << std::endl; + << std::endl; ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; @@ -600,7 +595,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { HistogramImpl hist_put_comparison; // insert x random prefix, each with y continuous element. for (auto prefix : prefixes) { - for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { + for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { TestKey test_key(prefix, sorted); std::string s; @@ -615,8 +610,9 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { } } - std::cout << "Put key comparison: \n" << hist_put_comparison.ToString() - << "Put time: \n" << hist_put_time.ToString(); + std::cout << "Put key comparison: \n" + << hist_put_comparison.ToString() << "Put time: \n" + << hist_put_time.ToString(); // test seek existing keys HistogramImpl hist_seek_time; @@ -635,8 +631,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { auto key_prefix = options.prefix_extractor->Transform(key); uint64_t total_keys = 0; for (iter->Seek(key); - iter->Valid() && iter->key().starts_with(key_prefix); - iter->Next()) { + iter->Valid() && iter->key().starts_with(key_prefix); iter->Next()) { if (FLAGS_trigger_deadlock) { std::cout << "Behold the deadlock!\n"; db->Delete(write_options, iter->key()); @@ -645,12 +640,12 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { } hist_seek_time.Add(timer.ElapsedNanos()); hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count); - ASSERT_EQ(total_keys, FLAGS_items_per_prefix - FLAGS_items_per_prefix/2); + ASSERT_EQ(total_keys, + FLAGS_items_per_prefix - FLAGS_items_per_prefix / 2); } std::cout << "Seek key comparison: \n" - << hist_seek_comparison.ToString() - << "Seek time: \n" + << hist_seek_comparison.ToString() << "Seek time: \n" << hist_seek_time.ToString(); // test non-existing keys @@ -658,8 +653,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { HistogramImpl hist_no_seek_comparison; for (auto prefix = FLAGS_total_prefixes; - prefix < FLAGS_total_prefixes + 10000; - prefix++) { + prefix < FLAGS_total_prefixes + 10000; prefix++) { TestKey test_key(prefix, 0); std::string s; Slice key = TestKeyToSlice(s, test_key); @@ -668,7 +662,8 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); hist_no_seek_time.Add(timer.ElapsedNanos()); - hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count); + hist_no_seek_comparison.Add( + get_perf_context()->user_key_comparison_count); ASSERT_TRUE(!iter->Valid()); ASSERT_OK(iter->status()); } diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index b45d5b4d4b4..c03efa11ffe 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -502,7 +502,7 @@ class TruncatedRangeDelMergingIter : public InternalIterator { size_t ts_sz_; }; -} // namespace +} // anonymous namespace std::unique_ptr CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound, diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 651999bd807..9dca707e5ce 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -54,12 +54,17 @@ DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run"); DEFINE_int32(add_tombstones_per_run, 1, "number of AddTombstones calls per run"); +DEFINE_bool(use_compaction_range_del_aggregator, false, + "Whether to use CompactionRangeDelAggregator. 
Default is to use " + "ReadRangeDelAggregator."); + namespace { struct Stats { uint64_t time_add_tombstones = 0; uint64_t time_first_should_delete = 0; uint64_t time_rest_should_delete = 0; + uint64_t time_fragment_tombstones = 0; }; std::ostream& operator<<(std::ostream& os, const Stats& s) { @@ -67,6 +72,10 @@ std::ostream& operator<<(std::ostream& os, const Stats& s) { fmt_holder.copyfmt(os); os << std::left; + os << std::setw(25) << "Fragment Tombstones: " + << s.time_fragment_tombstones / + (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) + << " us\n"; os << std::setw(25) << "AddTombstones: " << s.time_add_tombstones / (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) @@ -186,10 +195,17 @@ int main(int argc, char** argv) { FLAGS_num_range_tombstones); } auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal; - + std::vector snapshots{0}; for (int i = 0; i < FLAGS_num_runs; i++) { - ROCKSDB_NAMESPACE::ReadRangeDelAggregator range_del_agg( - &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */); + std::unique_ptr range_del_agg = + nullptr; + if (FLAGS_use_compaction_range_del_aggregator) { + range_del_agg.reset(new ROCKSDB_NAMESPACE::CompactionRangeDelAggregator( + &icmp, snapshots)); + } else { + range_del_agg.reset(new ROCKSDB_NAMESPACE::ReadRangeDelAggregator( + &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */)); + } std::vector< std::unique_ptr > @@ -207,12 +223,16 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::PersistentRangeTombstone( ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j); } - + auto iter = + ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones); + ROCKSDB_NAMESPACE::StopWatchNano stop_watch_fragment_tombstones( + clock, true /* auto_start */); fragmented_range_tombstone_lists.emplace_back( new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList( - ROCKSDB_NAMESPACE::MakeRangeDelIterator( - persistent_range_tombstones), - icmp)); + std::move(iter), icmp, FLAGS_use_compaction_range_del_aggregator, + snapshots)); + stats.time_fragment_tombstones += + stop_watch_fragment_tombstones.ElapsedNanos(); std::unique_ptr fragmented_range_del_iter( new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator( @@ -221,7 +241,7 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones( clock, true /* auto_start */); - range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); + range_del_agg->AddTombstones(std::move(fragmented_range_del_iter)); stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); } @@ -238,7 +258,7 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete( clock, true /* auto_start */); - range_del_agg.ShouldDelete(parsed_key, mode); + range_del_agg->ShouldDelete(parsed_key, mode); uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); if (j == 0) { diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 3541fafd1a1..7fe35276a67 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -192,7 +192,7 @@ void VerifyFragmentedRangeDels( EXPECT_FALSE(iter->Valid()); } -} // namespace +} // anonymous namespace TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) { auto range_del_iter = MakeRangeDelIter({}); diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index 925b4ed331c..7e7cedeca48 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -156,7 +156,6 @@ void 
FragmentedRangeTombstoneList::FragmentTombstones( if (seq <= next_snapshot) { // This seqnum is visible by a lower snapshot. tombstone_seqs_.push_back(seq); - seq_set_.insert(seq); auto upper_bound_it = std::lower_bound(snapshots.begin(), snapshots.end(), seq); if (upper_bound_it == snapshots.begin()) { @@ -173,7 +172,6 @@ void FragmentedRangeTombstoneList::FragmentTombstones( // The fragmentation is being done for reads, so preserve all seqnums. tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(), seqnums_to_flush.end()); - seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end()); if (ts_sz) { tombstone_timestamps_.insert(tombstone_timestamps_.end(), timestamps_to_flush.begin(), @@ -258,15 +256,20 @@ void FragmentedRangeTombstoneList::FragmentTombstones( } bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower, - SequenceNumber upper) const { + SequenceNumber upper) { + std::call_once(seq_set_init_once_flag_, [this]() { + for (auto s : tombstone_seqs_) { + seq_set_.insert(s); + } + }); auto seq_it = seq_set_.lower_bound(lower); return seq_it != seq_set_.end() && *seq_it <= upper; } FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( - const FragmentedRangeTombstoneList* tombstones, - const InternalKeyComparator& icmp, SequenceNumber _upper_bound, - const Slice* ts_upper_bound, SequenceNumber _lower_bound) + FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp, + SequenceNumber _upper_bound, const Slice* ts_upper_bound, + SequenceNumber _lower_bound) : tombstone_start_cmp_(icmp.user_comparator()), tombstone_end_cmp_(icmp.user_comparator()), icmp_(&icmp), @@ -280,7 +283,7 @@ FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( } FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( - const std::shared_ptr& tombstones, + const std::shared_ptr& tombstones, const InternalKeyComparator& icmp, SequenceNumber _upper_bound, const Slice* ts_upper_bound, SequenceNumber _lower_bound) : tombstone_start_cmp_(icmp.user_comparator()), diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index f4b0eab42bb..df07fa8949b 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -84,7 +84,9 @@ struct FragmentedRangeTombstoneList { // Returns true if the stored tombstones contain with one with a sequence // number in [lower, upper]. - bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; + // This method is not const because it lazily initializes a set of + // sequence numbers (`seq_set_`) on first use. + bool ContainsRange(SequenceNumber lower, SequenceNumber upper); uint64_t num_unfragmented_tombstones() const { return num_unfragmented_tombstones_; } @@ -113,6 +115,7 @@ struct FragmentedRangeTombstoneList { std::vector tombstones_; std::vector tombstone_seqs_; std::vector tombstone_timestamps_; + std::once_flag seq_set_init_once_flag_; std::set seq_set_; std::list pinned_slices_; PinnedIteratorsManager pinned_iters_mgr_; @@ -131,12 +134,13 @@ struct FragmentedRangeTombstoneList { // tombstone collapsing is always O(n log n).
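The ContainsRange() change above replaces an eagerly maintained seq_set_ with one built on first use, which is why the method loses its const qualifier. Below is a minimal, self-contained sketch of that lazy-initialization pattern; the class and member names are illustrative stand-ins, not RocksDB's types. Note also that FragmentTombstones() consumes the snapshots list via std::lower_bound, so callers must pass snapshots in ascending order — which is why the fragmenter test below now passes {9, 20} instead of {20, 9}.

    #include <cstdint>
    #include <mutex>
    #include <set>
    #include <vector>

    // Illustrative stand-in for the fragmenter's lazy seq_set_: the sequence
    // numbers stay in a cheap vector, and the ordered set is materialized
    // exactly once, on the first ContainsRange() call, even under concurrent
    // readers (std::call_once serializes the initializers).
    class LazySeqSet {
     public:
      explicit LazySeqSet(std::vector<uint64_t> seqs) : seqs_(std::move(seqs)) {}

      bool ContainsRange(uint64_t lower, uint64_t upper) {
        std::call_once(init_flag_, [this]() {
          seq_set_.insert(seqs_.begin(), seqs_.end());
        });
        // Smallest element >= lower; a hit iff it also stays <= upper.
        auto it = seq_set_.lower_bound(lower);
        return it != seq_set_.end() && *it <= upper;
      }

     private:
      std::vector<uint64_t> seqs_;
      std::once_flag init_flag_;
      std::set<uint64_t> seq_set_;
    };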
class FragmentedRangeTombstoneIterator : public InternalIterator { public: + FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones, + const InternalKeyComparator& icmp, + SequenceNumber upper_bound, + const Slice* ts_upper_bound = nullptr, + SequenceNumber lower_bound = 0); FragmentedRangeTombstoneIterator( - const FragmentedRangeTombstoneList* tombstones, - const InternalKeyComparator& icmp, SequenceNumber upper_bound, - const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0); - FragmentedRangeTombstoneIterator( - const std::shared_ptr& tombstones, + const std::shared_ptr& tombstones, const InternalKeyComparator& icmp, SequenceNumber upper_bound, const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0); FragmentedRangeTombstoneIterator( @@ -311,9 +315,9 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { const RangeTombstoneStackEndComparator tombstone_end_cmp_; const InternalKeyComparator* icmp_; const Comparator* ucmp_; - std::shared_ptr tombstones_ref_; + std::shared_ptr tombstones_ref_; std::shared_ptr tombstones_cache_ref_; - const FragmentedRangeTombstoneList* tombstones_; + FragmentedRangeTombstoneList* tombstones_; SequenceNumber upper_bound_; SequenceNumber lower_bound_; // Only consider timestamps <= ts_upper_bound_. diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index 46b3c99b595..eee2ca2ca6d 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -354,7 +354,7 @@ TEST_F(RangeTombstoneFragmenterTest, FragmentedRangeTombstoneList fragment_list( std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, - {20, 9} /* upper_bounds */); + {9, 20} /* snapshots */); FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, kMaxSequenceNumber /* upper_bound */); VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, diff --git a/db/repair.cc b/db/repair.cc index 34da5ba05f6..ddec43e9b60 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -59,6 +59,7 @@ // Store per-table metadata (smallest, largest, largest-seq#, ...) // in the table's meta section to speed up ScanTable. +#include "db/version_builder.h" #ifndef ROCKSDB_LITE #include @@ -281,7 +282,7 @@ class Repairer { std::vector to_search_paths; for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) { - to_search_paths.push_back(db_options_.db_paths[path_id].path); + to_search_paths.push_back(db_options_.db_paths[path_id].path); } // search wal_dir if user uses a customize wal_dir @@ -332,7 +333,8 @@ class Repairer { void ConvertLogFilesToTables() { const auto& wal_dir = immutable_db_options_.GetWalDir(); for (size_t i = 0; i < logs_.size(); i++) { - // we should use LogFileName(wal_dir, logs_[i]) here. user might uses wal_dir option. + // We should use LogFileName(wal_dir, logs_[i]) here, since the user might + // use the wal_dir option.
std::string logname = LogFileName(wal_dir, logs_[i]); Status status = ConvertLogToTable(wal_dir, logs_[i]); if (!status.ok()) { @@ -393,8 +395,8 @@ class Repairer { int counter = 0; while (reader.ReadRecord(&record, &scratch)) { if (record.size() < WriteBatchInternal::kHeader) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); continue; } Status record_status = WriteBatchInternal::SetContents(&batch, record); @@ -639,38 +641,80 @@ class Repairer { for (const auto& cf_id_and_tables : cf_id_to_tables) { auto* cfd = vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first); - VersionEdit edit; - edit.SetComparatorName(cfd->user_comparator()->Name()); - edit.SetLogNumber(0); - edit.SetNextFile(next_file_number_); - edit.SetColumnFamily(cfd->GetID()); - // TODO(opt): separate out into multiple levels + // Recover files' epoch number using dummy VersionStorageInfo + VersionBuilder dummy_version_builder( + cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), cfd->current()->storage_info(), + cfd->current()->version_set(), + cfd->GetFileMetadataCacheReservationManager()); + VersionStorageInfo dummy_vstorage( + &cfd->internal_comparator(), cfd->user_comparator(), + cfd->NumberLevels(), cfd->ioptions()->compaction_style, + nullptr /* src_vstorage */, cfd->ioptions()->force_consistency_checks, + EpochNumberRequirement::kMightMissing); + Status s; + VersionEdit dummy_edit; for (const auto* table : cf_id_and_tables.second) { - edit.AddFile( + // TODO(opt): separate out into multiple levels + dummy_edit.AddFile( 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), table->meta.fd.GetFileSize(), table->meta.smallest, table->meta.largest, table->meta.fd.smallest_seqno, table->meta.fd.largest_seqno, table->meta.marked_for_compaction, table->meta.temperature, table->meta.oldest_blob_file_number, table->meta.oldest_ancester_time, table->meta.file_creation_time, - table->meta.file_checksum, table->meta.file_checksum_func_name, - table->meta.unique_id); + table->meta.epoch_number, table->meta.file_checksum, + table->meta.file_checksum_func_name, table->meta.unique_id, + table->meta.compensated_range_deletion_size); } - assert(next_file_number_ > 0); - vset_.MarkFileNumberUsed(next_file_number_ - 1); - mutex_.Lock(); - std::unique_ptr db_dir; - Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), - &db_dir, nullptr); - if (status.ok()) { - status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */); + s = dummy_version_builder.Apply(&dummy_edit); + if (s.ok()) { + s = dummy_version_builder.SaveTo(&dummy_vstorage); } - mutex_.Unlock(); - if (!status.ok()) { - return status; + if (s.ok()) { + dummy_vstorage.RecoverEpochNumbers(cfd); + } + if (s.ok()) { + // Record changes from this repair in VersionEdit, including files with + // recovered epoch numbers + VersionEdit edit; + edit.SetComparatorName(cfd->user_comparator()->Name()); + edit.SetLogNumber(0); + edit.SetNextFile(next_file_number_); + edit.SetColumnFamily(cfd->GetID()); + for (int level = 0; level < dummy_vstorage.num_levels(); ++level) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + edit.AddFile(level, *file_meta); + } + } + + // Release resources occupied by the dummy VersionStorageInfo + for (int level = 0; level < dummy_vstorage.num_levels(); ++level) { + for 
(FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + file_meta->refs--; + if (file_meta->refs <= 0) { + delete file_meta; + } + } + } + + // Persist record of changes + assert(next_file_number_ > 0); + vset_.MarkFileNumberUsed(next_file_number_ - 1); + mutex_.Lock(); + std::unique_ptr db_dir; + s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, + nullptr); + if (s.ok()) { + s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_, db_dir.get(), + false /* new_descriptor_log */); + } + mutex_.Unlock(); + } + if (!s.ok()) { + return s; } } return Status::OK(); @@ -715,8 +759,7 @@ Status GetDefaultCFOptions( } // anonymous namespace Status RepairDB(const std::string& dbname, const DBOptions& db_options, - const std::vector& column_families - ) { + const std::vector& column_families) { ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); if (!status.ok()) { @@ -756,8 +799,7 @@ Status RepairDB(const std::string& dbname, const Options& options) { DBOptions db_options(opts); ColumnFamilyOptions cf_options(opts); - Repairer repairer(dbname, db_options, - {}, cf_options /* default_cf_opts */, + Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */, cf_options /* unknown_cf_opts */, true /* create_unknown_cfs */); Status status = repairer.Run(); diff --git a/db/repair_test.cc b/db/repair_test.cc index b93f1f951a4..f80f2b722f7 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -62,8 +62,62 @@ class RepairTest : public DBTestBase { ASSERT_GT(verify_passed, 0); SyncPoint::GetInstance()->DisableProcessing(); } + + std::vector GetLevelFileMetadatas(int level, int cf = 0) { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = + versions->GetColumnFamilySet()->GetColumnFamily(cf); + assert(cfd); + Version* const current = cfd->current(); + assert(current); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + return storage_info->LevelFiles(level); + } }; +TEST_F(RepairTest, SortRepairedDBL0ByEpochNumber) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + + ASSERT_OK(Put("k1", "oldest")); + ASSERT_OK(Put("k1", "older")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("k1", "new")); + + std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + ASSERT_EQ(level0_files[0]->epoch_number, 2); + std::vector level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + ASSERT_EQ(level1_files[0]->epoch_number, 1); + + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + ReopenWithSstIdVerify(); + + EXPECT_EQ(Get("k1"), "new"); + + level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 3); + EXPECT_EQ(level0_files[0]->epoch_number, 3); + EXPECT_EQ(level0_files[1]->epoch_number, 2); + EXPECT_EQ(level0_files[2]->epoch_number, 1); + level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 0); +} + TEST_F(RepairTest, LostManifest) { // Add a couple SST files, delete the manifest, and verify RepairDB() saves // the day. 
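The new SortRepairedDBL0ByEpochNumber test above pins down the invariant the repair path must restore: after RepairDB(), L0 files come back ordered newest-first by epoch number (3, 2, 1). A sketch of that ordering with an illustrative stand-in for FileMetaData — not the actual recovery code, which derives the numbers through the dummy VersionBuilder shown earlier:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Illustrative stand-in; the real FileMetaData carries many more fields.
    struct FileMetaLite {
      uint64_t epoch_number;
    };

    // L0 ordered newest-first: a higher epoch number means more recent data,
    // so it must shadow older files during reads.
    void SortL0NewestFirst(std::vector<FileMetaLite*>& level0) {
      std::sort(level0.begin(), level0.end(),
                [](const FileMetaLite* a, const FileMetaLite* b) {
                  return a->epoch_number > b->epoch_number;
                });
    }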
@@ -279,7 +333,7 @@ TEST_F(RepairTest, SeparateWalDir) { ASSERT_EQ(total_ssts_size, 0); } std::string manifest_path = - DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); Close(); ASSERT_OK(env_->FileExists(manifest_path)); @@ -301,7 +355,7 @@ TEST_F(RepairTest, SeparateWalDir) { ASSERT_EQ(Get("key"), "val"); ASSERT_EQ(Get("foo"), "bar"); - } while(ChangeWalOptions()); + } while (ChangeWalOptions()); } TEST_F(RepairTest, RepairMultipleColumnFamilies) { @@ -387,8 +441,7 @@ TEST_F(RepairTest, RepairColumnFamilyOptions) { ASSERT_EQ(fname_to_props.size(), 2U); for (const auto& fname_and_props : fname_to_props) { std::string comparator_name(rev_opts.comparator->Name()); - ASSERT_EQ(comparator_name, - fname_and_props.second->comparator_name); + ASSERT_EQ(comparator_name, fname_and_props.second->comparator_name); } Close(); diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc index 12394a3689b..c19dd1c91dc 100644 --- a/db/seqno_time_test.cc +++ b/db/seqno_time_test.cc @@ -80,7 +80,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); int sst_num = 0; @@ -88,7 +88,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { for (; sst_num < kNumTrigger; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -110,7 +110,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { for (; sst_num < kNumTrigger * 2; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -124,7 +124,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { for (; sst_num < kNumTrigger * 3; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); } @@ -143,7 +143,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { AssertKeyTemperature(20, Temperature::kCold); for (int i = 0; i < 30; i++) { - dbfull()->TEST_WaitForPeridicTaskRun([&] { + dbfull()->TEST_WaitForPeriodicTaskRun([&] { mock_clock_->MockSleepForSeconds(static_cast(20 * kKeyPerSec)); }); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -161,7 +161,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { // the compaction will not get the new seqno->time sampling to decide the last // a few data's time. 
for (int i = 0; i < 5; i++) { - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(1000)); }); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); } @@ -192,7 +192,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { // pass some time first, otherwise the first a few keys write time are going // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); int sst_num = 0; @@ -200,7 +200,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { for (; sst_num < 4; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); } ASSERT_OK(Flush()); @@ -222,7 +222,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { for (; sst_num < 14; sst_num++) { for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); } ASSERT_OK(Flush()); @@ -243,7 +243,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { // Wait some time, with each wait, the cold data is increasing and hot data is // decreasing for (int i = 0; i < 30; i++) { - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(200)); }); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); uint64_t pre_hot = hot_data_size; @@ -262,7 +262,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { // hot data might not be empty, because if we don't write new data, there's // no seqno->time sampling available to the compaction for (int i = 0; i < 5; i++) { - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(1000)); }); ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); } @@ -324,7 +324,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { // Write a key every 10 seconds for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); } ASSERT_OK(Flush()); @@ -357,7 +357,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { // Write a key every 1 seconds for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i + 190), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(1)); }); } seq_end = dbfull()->GetLatestSequenceNumber(); @@ -393,7 +393,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { // Write a key every 200 seconds for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i + 380), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(200)); }); } seq_end = dbfull()->GetLatestSequenceNumber(); @@ -435,7 +435,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { // Write a key every 100 seconds for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i + 570), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { 
mock_clock_->MockSleepForSeconds(static_cast(100)); }); } seq_end = dbfull()->GetLatestSequenceNumber(); @@ -516,7 +516,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { // Write some data and increase the current time for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } ASSERT_OK(Flush()); @@ -536,7 +536,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { // Write some data to the default CF (without preclude_last_level feature) for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } ASSERT_OK(Flush()); @@ -544,7 +544,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { // Write some data to the CF one for (int i = 0; i < 20; i++) { ASSERT_OK(Put(1, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); } ASSERT_OK(Flush(1)); @@ -568,7 +568,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { // Add more data to CF "two" to fill the in memory mapping for (int i = 0; i < 2000; i++) { ASSERT_OK(Put(2, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping(); @@ -592,7 +592,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { // enabled have flushed, the in-memory seqno->time mapping should be cleared for (int i = 0; i < 10; i++) { ASSERT_OK(Put(0, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping(); @@ -603,7 +603,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { for (int j = 0; j < 3; j++) { for (int i = 0; i < 200; i++) { ASSERT_OK(Put(2, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } ASSERT_OK(Flush(2)); @@ -623,7 +623,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { for (int j = 0; j < 2; j++) { for (int i = 0; i < 200; i++) { ASSERT_OK(Put(0, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } ASSERT_OK(Flush(0)); @@ -638,7 +638,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { // Write some data to CF "two", but don't flush to accumulate for (int i = 0; i < 1000; i++) { ASSERT_OK(Put(2, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } ASSERT_GE( @@ -682,7 +682,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) { WriteOptions wo; for (int i = 0; i < 200; i++) { ASSERT_OK(dbi->Put(wo, Key(i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping(); @@ -720,7 +720,7 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { for (; sst_num < kNumTrigger - 1; sst_num++) { for (int i = 0; i < kNumKeys; i++) { 
ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); } ASSERT_OK(Flush()); @@ -742,7 +742,7 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { // Trigger a compaction for (int i = 0; i < kNumKeys; i++) { ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); } sst_num++; diff --git a/db/snapshot_impl.cc b/db/snapshot_impl.cc index b9228c797c3..98b47546346 100644 --- a/db/snapshot_impl.cc +++ b/db/snapshot_impl.cc @@ -3,14 +3,13 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "rocksdb/snapshot.h" - #include "rocksdb/db.h" +#include "rocksdb/snapshot.h" namespace ROCKSDB_NAMESPACE { -ManagedSnapshot::ManagedSnapshot(DB* db) : db_(db), - snapshot_(db->GetSnapshot()) {} +ManagedSnapshot::ManagedSnapshot(DB* db) + : db_(db), snapshot_(db->GetSnapshot()) {} ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot) : db_(db), snapshot_(_snapshot) {} @@ -21,6 +20,6 @@ ManagedSnapshot::~ManagedSnapshot() { } } -const Snapshot* ManagedSnapshot::snapshot() { return snapshot_;} +const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; } } // namespace ROCKSDB_NAMESPACE diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index 59f491615d6..23e5e98cd2e 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -41,7 +41,7 @@ class SnapshotImpl : public Snapshot { SnapshotImpl* prev_; SnapshotImpl* next_; - SnapshotList* list_; // just for sanity checks + SnapshotList* list_; // just for sanity checks int64_t unix_time_; @@ -56,7 +56,7 @@ class SnapshotList { SnapshotList() { list_.prev_ = &list_; list_.next_ = &list_; - list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging // Set all the variables to make UBSAN happy. 
list_.list_ = nullptr; list_.unix_time_ = 0; @@ -72,8 +72,14 @@ class SnapshotList { assert(list_.next_ != &list_ || 0 == count_); return list_.next_ == &list_; } - SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } - SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } + SnapshotImpl* oldest() const { + assert(!empty()); + return list_.next_; + } + SnapshotImpl* newest() const { + assert(!empty()); + return list_.prev_; + } SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time, bool is_write_conflict_boundary, diff --git a/db/table_cache.cc b/db/table_cache.cc index c44578f8be6..a5fa5fbe3c2 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -31,16 +31,6 @@ #include "util/coding.h" #include "util/stop_watch.h" -namespace ROCKSDB_NAMESPACE { -namespace { -template -static void DeleteEntry(const Slice& /*key*/, void* value) { - T* typed_value = reinterpret_cast(value); - delete typed_value; -} -} // namespace -} // namespace ROCKSDB_NAMESPACE - // Generate the regular and coroutine versions of some methods by // including table_cache_sync_and_async.h twice // Macros in the header will expand differently based on whether @@ -58,12 +48,6 @@ namespace ROCKSDB_NAMESPACE { namespace { -static void UnrefEntry(void* arg1, void* arg2) { - Cache* cache = reinterpret_cast(arg1); - Cache::Handle* h = reinterpret_cast(arg2); - cache->Release(h); -} - static Slice GetSliceForFileNumber(const uint64_t* file_number) { return Slice(reinterpret_cast(file_number), sizeof(*file_number)); @@ -79,7 +63,7 @@ void AppendVarint64(IterKey* key, uint64_t v) { #endif // ROCKSDB_LITE -} // namespace +} // anonymous namespace const int kLoadConcurency = 128; @@ -103,16 +87,7 @@ TableCache::TableCache(const ImmutableOptions& ioptions, } } -TableCache::~TableCache() { -} - -TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) { - return reinterpret_cast(cache_->Value(handle)); -} - -void TableCache::ReleaseHandle(Cache::Handle* handle) { - cache_->Release(handle); -} +TableCache::~TableCache() {} Status TableCache::GetTableReader( const ReadOptions& ro, const FileOptions& file_options, @@ -128,6 +103,8 @@ Status TableCache::GetTableReader( FileOptions fopts = file_options; fopts.temperature = file_temperature; Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile", + const_cast(&s)); if (s.ok()) { s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); } @@ -177,17 +154,10 @@ Status TableCache::GetTableReader( return s; } -void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) { - ReleaseHandle(handle); - uint64_t number = fd.GetNumber(); - Slice key = GetSliceForFileNumber(&number); - cache_->Erase(key); -} - Status TableCache::FindTable( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, Cache::Handle** handle, + const FileMetaData& file_meta, TypedHandle** handle, const std::shared_ptr& prefix_extractor, const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, @@ -195,7 +165,7 @@ Status TableCache::FindTable( PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = file_meta.fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); - *handle = cache_->Lookup(key); + *handle = cache_.Lookup(key); 
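FindTable's lookup sequence — the probe above, then a per-key loader mutex and a second probe in the lines that follow — is the classic check–lock–recheck idiom: the fast path stays lock-light, and at most one thread pays for opening the file. A hedged sketch using only standard-library types, with a single mutex standing in for RocksDB's striped loader_mutex_:

    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    // Check/lock/recheck loader sketch. map_mu_ guards the table itself;
    // load_mu_ (a stand-in for the striped loader_mutex_) ensures a missing
    // entry is built by exactly one thread.
    template <typename V>
    class OnceLoader {
     public:
      std::shared_ptr<V> Get(const std::string& key,
                             std::shared_ptr<V> (*load)(const std::string&)) {
        if (auto v = Lookup(key)) return v;  // fast path: no loader mutex
        std::lock_guard<std::mutex> load_guard(load_mu_);
        if (auto v = Lookup(key)) return v;  // re-check: someone else loaded
        auto v = load(key);                  // expensive open happens once
        std::lock_guard<std::mutex> map_guard(map_mu_);
        cache_[key] = v;
        return v;
      }

     private:
      std::shared_ptr<V> Lookup(const std::string& key) {
        std::lock_guard<std::mutex> map_guard(map_mu_);
        auto it = cache_.find(key);
        return it == cache_.end() ? nullptr : it->second;
      }
      std::mutex load_mu_;
      std::mutex map_mu_;
      std::unordered_map<std::string, std::shared_ptr<V>> cache_;
    };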
TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0", const_cast(&no_io)); @@ -205,7 +175,7 @@ Status TableCache::FindTable( } MutexLock load_lock(loader_mutex_.get(key)); // We check the cache again under loading mutex - *handle = cache_->Lookup(key); + *handle = cache_.Lookup(key); if (*handle != nullptr) { return Status::OK(); } @@ -223,8 +193,7 @@ Status TableCache::FindTable( // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { - s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry, - handle); + s = cache_.Insert(key, table_reader.get(), 1, handle); if (s.ok()) { // Release ownership of table reader. table_reader.release(); @@ -250,7 +219,7 @@ InternalIterator* TableCache::NewIterator( Status s; TableReader* table_reader = nullptr; - Cache::Handle* handle = nullptr; + TypedHandle* handle = nullptr; if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } @@ -265,7 +234,7 @@ InternalIterator* TableCache::NewIterator( level, true /* prefetch_index_and_filter_in_cache */, max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); + table_reader = cache_.Value(handle); } } InternalIterator* result = nullptr; @@ -279,7 +248,7 @@ InternalIterator* TableCache::NewIterator( file_options.compaction_readahead_size, allow_unprepared_value); } if (handle != nullptr) { - result->RegisterCleanup(&UnrefEntry, cache_, handle); + cache_.RegisterReleaseAsCleanup(handle, *result); handle = nullptr; // prevent from releasing below } @@ -329,7 +298,7 @@ InternalIterator* TableCache::NewIterator( } if (handle != nullptr) { - ReleaseHandle(handle); + cache_.Release(handle); } if (!s.ok()) { assert(result == nullptr); @@ -347,12 +316,12 @@ Status TableCache::GetRangeTombstoneIterator( const FileDescriptor& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; - Cache::Handle* handle = nullptr; + TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, &handle); if (s.ok()) { - t = GetTableReaderFromHandle(handle); + t = cache_.Value(handle); } } if (s.ok()) { @@ -361,9 +330,9 @@ Status TableCache::GetRangeTombstoneIterator( } if (handle) { if (*out_iter) { - (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle); + cache_.RegisterReleaseAsCleanup(handle, **out_iter); } else { - ReleaseHandle(handle); + cache_.Release(handle); } } return s; @@ -410,16 +379,10 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, bool found = false; row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); - if (auto row_handle = - ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) { + RowCacheInterface row_cache{ioptions_.row_cache.get()}; + if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) { // Cleanable routine to release the cache entry Cleanable value_pinner; - auto release_cache_entry_func = [](void* cache_to_clean, - void* cache_handle) { - ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle); - }; - auto found_row_cache_entry = - static_cast(ioptions_.row_cache->Value(row_handle)); // If it comes here value is located on the cache. // found_row_cache_entry points to the value on cache, // and value_pinner has cleanup procedure for the cached entry. 
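The row-cache hit path above hands the cache handle's release off as a cleanup on value_pinner, so the cached entry stays pinned exactly as long as the consumer needs the value. A simplified sketch of that delegation; Pinner here is a stand-in for RocksDB's Cleanable, not its actual interface:

    #include <functional>
    #include <utility>
    #include <vector>

    // Runs registered cleanups (e.g. releasing a cache handle) on
    // destruction, so the pinned value outlives any single call frame.
    class Pinner {
     public:
      ~Pinner() {
        for (auto& fn : cleanups_) fn();
      }
      void RegisterCleanup(std::function<void()> fn) {
        cleanups_.push_back(std::move(fn));
      }

     private:
      std::vector<std::function<void()>> cleanups_;
    };

    // Usage sketch: keep the cached row alive while `pinner` lives, then
    // release the handle back to the cache.
    //   Pinner pinner;
    //   pinner.RegisterCleanup([cache, handle] { cache->Release(handle); });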
@@ -428,9 +391,8 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, // cleanup routine under value_pinner will be delegated to // get_context.pinnable_slice_. Cache entry is released when // get_context.pinnable_slice_ is reset. - value_pinner.RegisterCleanup(release_cache_entry_func, - ioptions_.row_cache.get(), row_handle); - replayGetContextLog(*found_row_cache_entry, user_key, get_context, + row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner); + replayGetContextLog(*row_cache.Value(row_handle), user_key, get_context, &value_pinner); RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; @@ -469,7 +431,7 @@ Status TableCache::Get( #endif // ROCKSDB_LITE Status s; TableReader* t = fd.table_reader; - Cache::Handle* handle = nullptr; + TypedHandle* handle = nullptr; if (!done) { assert(s.ok()); if (t == nullptr) { @@ -480,7 +442,7 @@ Status TableCache::Get( level, true /* prefetch_index_and_filter_in_cache */, max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { - t = GetTableReaderFromHandle(handle); + t = cache_.Value(handle); } } SequenceNumber* max_covering_tombstone_seq = @@ -516,18 +478,17 @@ Status TableCache::Get( #ifndef ROCKSDB_LITE // Put the replay log in row cache only if something was found. if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) { + RowCacheInterface row_cache{ioptions_.row_cache.get()}; size_t charge = row_cache_entry->capacity() + sizeof(std::string); - void* row_ptr = new std::string(std::move(*row_cache_entry)); + auto row_ptr = new std::string(std::move(*row_cache_entry)); // If row cache is full, it's OK to continue. - ioptions_.row_cache - ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, - &DeleteEntry) + row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge) .PermitUncheckedError(); } #endif // ROCKSDB_LITE if (handle != nullptr) { - ReleaseHandle(handle); + cache_.Release(handle); } return s; } @@ -560,7 +521,7 @@ Status TableCache::MultiGetFilter( const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, Cache::Handle** table_handle) { + MultiGetContext::Range* mget_range, TypedHandle** table_handle) { auto& fd = file_meta.fd; #ifndef ROCKSDB_LITE IterKey row_cache_key; @@ -576,7 +537,7 @@ Status TableCache::MultiGetFilter( #endif // ROCKSDB_LITE Status s; TableReader* t = fd.table_reader; - Cache::Handle* handle = nullptr; + TypedHandle* handle = nullptr; MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), mget_range->end()); if (t == nullptr) { @@ -587,7 +548,7 @@ Status TableCache::MultiGetFilter( level, true /* prefetch_index_and_filter_in_cache */, /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); if (s.ok()) { - t = GetTableReaderFromHandle(handle); + t = cache_.Value(handle); } *table_handle = handle; } @@ -601,7 +562,7 @@ Status TableCache::MultiGetFilter( UpdateRangeTombstoneSeqnums(options, t, tombstone_range); } if (mget_range->empty() && handle) { - ReleaseHandle(handle); + cache_.Release(handle); *table_handle = nullptr; } @@ -622,16 +583,16 @@ Status TableCache::GetTableProperties( return Status::OK(); } - Cache::Handle* table_handle = nullptr; + TypedHandle* table_handle = nullptr; Status s = FindTable(ReadOptions(), file_options, internal_comparator, file_meta, &table_handle, prefix_extractor, no_io); if (!s.ok()) { return s; } assert(table_handle); - auto table = GetTableReaderFromHandle(table_handle); + auto table = 
cache_.Value(table_handle); *properties = table->GetTableProperties(); - ReleaseHandle(table_handle); + cache_.Release(table_handle); return s; } @@ -640,18 +601,18 @@ Status TableCache::ApproximateKeyAnchors( const FileMetaData& file_meta, std::vector& anchors) { Status s; TableReader* t = file_meta.fd.table_reader; - Cache::Handle* handle = nullptr; + TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle); if (s.ok()) { - t = GetTableReaderFromHandle(handle); + t = cache_.Value(handle); } } if (s.ok() && t != nullptr) { s = t->ApproximateKeyAnchors(ro, anchors); } if (handle != nullptr) { - ReleaseHandle(handle); + cache_.Release(handle); } return s; } @@ -667,29 +628,19 @@ size_t TableCache::GetMemoryUsageByTableReader( return table_reader->ApproximateMemoryUsage(); } - Cache::Handle* table_handle = nullptr; + TypedHandle* table_handle = nullptr; Status s = FindTable(ReadOptions(), file_options, internal_comparator, file_meta, &table_handle, prefix_extractor, true); if (!s.ok()) { return 0; } assert(table_handle); - auto table = GetTableReaderFromHandle(table_handle); + auto table = cache_.Value(table_handle); auto ret = table->ApproximateMemoryUsage(); - ReleaseHandle(table_handle); + cache_.Release(table_handle); return ret; } -bool TableCache::HasEntry(Cache* cache, uint64_t file_number) { - Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number)); - if (handle) { - cache->Release(handle); - return true; - } else { - return false; - } -} - void TableCache::Evict(Cache* cache, uint64_t file_number) { cache->Erase(GetSliceForFileNumber(&file_number)); } @@ -700,7 +651,7 @@ uint64_t TableCache::ApproximateOffsetOf( const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; - Cache::Handle* table_handle = nullptr; + TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = @@ -708,7 +659,7 @@ uint64_t TableCache::ApproximateOffsetOf( &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { - table_reader = GetTableReaderFromHandle(table_handle); + table_reader = cache_.Value(table_handle); } } @@ -716,7 +667,7 @@ uint64_t TableCache::ApproximateOffsetOf( result = table_reader->ApproximateOffsetOf(key, caller); } if (table_handle != nullptr) { - ReleaseHandle(table_handle); + cache_.Release(table_handle); } return result; @@ -728,7 +679,7 @@ uint64_t TableCache::ApproximateSize( const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; - Cache::Handle* table_handle = nullptr; + TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = @@ -736,7 +687,7 @@ uint64_t TableCache::ApproximateSize( &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { - table_reader = GetTableReaderFromHandle(table_handle); + table_reader = cache_.Value(table_handle); } } @@ -744,7 +695,7 @@ uint64_t TableCache::ApproximateSize( result = table_reader->ApproximateSize(start, end, caller); } if (table_handle != nullptr) { - ReleaseHandle(table_handle); + cache_.Release(table_handle); } return result; diff --git a/db/table_cache.h b/db/table_cache.h index 2e50f2c7790..66282bf41f0 100644 --- a/db/table_cache.h +++ 
b/db/table_cache.h @@ -14,6 +14,7 @@ #include #include +#include "cache/typed_cache.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" #include "options/cf_options.h" @@ -56,6 +57,16 @@ class TableCache { const std::string& db_session_id); ~TableCache(); + // Cache interface for table cache + using CacheInterface = + BasicTypedCacheInterface; + using TypedHandle = CacheInterface::TypedHandle; + + // Cache interface for row cache + using RowCacheInterface = + BasicTypedCacheInterface; + using RowHandle = RowCacheInterface::TypedHandle; + // Return an iterator for the specified file number (the corresponding // file length must be exactly "file_size" bytes). If "table_reader_ptr" // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object @@ -124,7 +135,7 @@ class TableCache { const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, Cache::Handle** table_handle); + MultiGetContext::Range* mget_range, TypedHandle** table_handle); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -142,25 +153,18 @@ class TableCache { const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, bool skip_range_deletions = false, int level = -1, - Cache::Handle* table_handle = nullptr); + TypedHandle* table_handle = nullptr); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); - // Query whether specified file number is currently in cache - static bool HasEntry(Cache* cache, uint64_t file_number); - - // Clean table handle and erase it from the table cache - // Used in DB close, or the file is not live anymore. - void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle); - // Find table reader // @param skip_filters Disables loading/accessing the filter block // @param level == -1 means not specified Status FindTable( const ReadOptions& ro, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, Cache::Handle**, + const FileMetaData& file_meta, TypedHandle**, const std::shared_ptr& prefix_extractor = nullptr, const bool no_io = false, bool record_read_stats = true, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, @@ -168,9 +172,6 @@ class TableCache { size_t max_file_size_for_l0_meta_pin = 0, Temperature file_temperature = Temperature::kUnknown); - // Get TableReader from a cache handle. - TableReader* GetTableReaderFromHandle(Cache::Handle* handle); - // Get the table properties of a given table. // @no_io: indicates if we should load table to the cache if it is not present // in table cache yet. @@ -212,10 +213,7 @@ class TableCache { const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor = nullptr); - // Release the handle from a cache - void ReleaseHandle(Cache::Handle* handle); - - Cache* get_cache() const { return cache_; } + CacheInterface& get_cache() { return cache_; } // Capacity of the backing Cache that indicates infinite TableCache capacity. // For example when max_open_files is -1 we set the backing Cache to this. @@ -224,7 +222,7 @@ class TableCache { // The tables opened with this TableCache will be immortal, i.e., their // lifetime is as long as that of the DB. 
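The CacheInterface/TypedHandle aliases above are the pivot of this refactor: a typed facade over the untyped Cache centralizes the value casts and the deleter, which is what lets the diff delete DeleteEntry, UnrefEntry, GetTableReaderFromHandle, and ReleaseHandle. A hand-rolled sketch of the idea under simplified assumptions (no reference counting), not RocksDB's actual typed_cache.h:

    #include <string>
    #include <unordered_map>

    // Minimal untyped cache standing in for rocksdb::Cache: values are
    // void* plus a deleter, as in the pre-refactor code.
    struct RawCache {
      struct Handle {
        void* value;
        void (*deleter)(void*);
      };
      std::unordered_map<std::string, Handle> map;
      Handle* Lookup(const std::string& k) {
        auto it = map.find(k);
        return it == map.end() ? nullptr : &it->second;
      }
      Handle* Insert(const std::string& k, void* v, void (*d)(void*)) {
        return &(map[k] = Handle{v, d});
      }
      void Release(Handle*) {}  // refcounting elided in this sketch
    };

    // Typed facade: the casts and the deleter live in exactly one place,
    // mirroring what the typed cache interface does for TableReader.
    template <typename T>
    class TypedCache {
     public:
      using TypedHandle = RawCache::Handle;
      explicit TypedCache(RawCache* c) : cache_(c) {}
      TypedHandle* Lookup(const std::string& k) { return cache_->Lookup(k); }
      TypedHandle* Insert(const std::string& k, T* value) {
        return cache_->Insert(k, value,
                              [](void* p) { delete static_cast<T*>(p); });
      }
      T* Value(TypedHandle* h) { return static_cast<T*>(h->value); }
      void Release(TypedHandle* h) { cache_->Release(h); }

     private:
      RawCache* cache_;
    };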
void SetTablesAreImmortal() { - if (cache_->GetCapacity() >= kInfiniteCapacity) { + if (cache_.get()->GetCapacity() >= kInfiniteCapacity) { immortal_tables_ = true; } } @@ -263,7 +261,7 @@ class TableCache { const ImmutableOptions& ioptions_; const FileOptions& file_options_; - Cache* const cache_; + CacheInterface cache_; std::string row_cache_id_; bool immortal_tables_; BlockCacheTracer* const block_cache_tracer_; diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index e72abdd45d2..9043ec8363c 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -19,15 +19,14 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions, - int level, Cache::Handle* table_handle) { + int level, TypedHandle* handle) { auto& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; - Cache::Handle* handle = table_handle; MultiGetRange table_range(*mget_range, mget_range->begin(), mget_range->end()); if (handle != nullptr && t == nullptr) { - t = GetTableReaderFromHandle(handle); + t = cache_.Value(handle); } #ifndef ROCKSDB_LITE autovector row_cache_entries; @@ -75,7 +74,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { - t = GetTableReaderFromHandle(handle); + t = cache_.Value(handle); assert(t); } } @@ -100,6 +99,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) #ifndef ROCKSDB_LITE if (lookup_row_cache) { size_t row_idx = 0; + RowCacheInterface row_cache{ioptions_.row_cache.get()}; for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { @@ -115,11 +115,9 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) // Put the replay log in row cache only if something was found. if (s.ok() && !row_cache_entry.empty()) { size_t charge = row_cache_entry.capacity() + sizeof(std::string); - void* row_ptr = new std::string(std::move(row_cache_entry)); + auto row_ptr = new std::string(std::move(row_cache_entry)); // If row cache is full, it's OK. - ioptions_.row_cache - ->Insert(row_cache_key.GetUserKey(), row_ptr, charge, - &DeleteEntry) + row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge) .PermitUncheckedError(); } } @@ -127,7 +125,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) #endif // ROCKSDB_LITE if (handle != nullptr) { - ReleaseHandle(handle); + cache_.Release(handle); } CO_RETURN s; } diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index 591c1d04a6a..edb9a1b63a0 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -27,7 +27,7 @@ uint64_t GetUint64Property(const UserCollectedProperties& props, return GetVarint64(&raw, &val) ? 
val : 0; } -} // namespace +} // anonymous namespace Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, const Slice& value, @@ -54,13 +54,12 @@ Status UserKeyTablePropertiesCollector::Finish( return collector_->Finish(properties); } -UserCollectedProperties -UserKeyTablePropertiesCollector::GetReadableProperties() const { +UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties() + const { return collector_->GetReadableProperties(); } -uint64_t GetDeletedKeys( - const UserCollectedProperties& props) { +uint64_t GetDeletedKeys(const UserCollectedProperties& props) { bool property_present_ignored; return GetUint64Property(props, TablePropertiesNames::kDeletedKeys, &property_present_ignored); @@ -68,8 +67,8 @@ uint64_t GetDeletedKeys( uint64_t GetMergeOperands(const UserCollectedProperties& props, bool* property_present) { - return GetUint64Property( - props, TablePropertiesNames::kMergeOperands, property_present); + return GetUint64Property(props, TablePropertiesNames::kMergeOperands, + property_present); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 9035ba793b3..968115c3d7a 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -150,8 +150,10 @@ class TimestampTablePropertiesCollector : public IntTblPropCollector { } Status Finish(UserCollectedProperties* properties) override { + // Timestamps are empty if the table is empty assert(timestamp_min_.size() == timestamp_max_.size() && - timestamp_max_.size() == cmp_->timestamp_size()); + (timestamp_min_.empty() || + timestamp_max_.size() == cmp_->timestamp_size())); properties->insert({"rocksdb.timestamp_min", timestamp_min_}); properties->insert({"rocksdb.timestamp_max", timestamp_max_}); return Status::OK(); } diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 4098677b155..5f0f205da1c 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -61,30 +61,30 @@ void MakeBuilder( } // namespace // Collects keys that starts with "A" in a table.
-class RegularKeysStartWithA: public TablePropertiesCollector { +class RegularKeysStartWithA : public TablePropertiesCollector { public: const char* Name() const override { return "RegularKeysStartWithA"; } Status Finish(UserCollectedProperties* properties) override { - std::string encoded; - std::string encoded_num_puts; - std::string encoded_num_deletes; - std::string encoded_num_single_deletes; - std::string encoded_num_size_changes; - PutVarint32(&encoded, count_); - PutVarint32(&encoded_num_puts, num_puts_); - PutVarint32(&encoded_num_deletes, num_deletes_); - PutVarint32(&encoded_num_single_deletes, num_single_deletes_); - PutVarint32(&encoded_num_size_changes, num_size_changes_); - *properties = UserCollectedProperties{ - {"TablePropertiesTest", message_}, - {"Count", encoded}, - {"NumPuts", encoded_num_puts}, - {"NumDeletes", encoded_num_deletes}, - {"NumSingleDeletes", encoded_num_single_deletes}, - {"NumSizeChanges", encoded_num_size_changes}, - }; - return Status::OK(); + std::string encoded; + std::string encoded_num_puts; + std::string encoded_num_deletes; + std::string encoded_num_single_deletes; + std::string encoded_num_size_changes; + PutVarint32(&encoded, count_); + PutVarint32(&encoded_num_puts, num_puts_); + PutVarint32(&encoded_num_deletes, num_deletes_); + PutVarint32(&encoded_num_single_deletes, num_single_deletes_); + PutVarint32(&encoded_num_size_changes, num_size_changes_); + *properties = UserCollectedProperties{ + {"TablePropertiesTest", message_}, + {"Count", encoded}, + {"NumPuts", encoded_num_puts}, + {"NumDeletes", encoded_num_deletes}, + {"NumSingleDeletes", encoded_num_single_deletes}, + {"NumSizeChanges", encoded_num_size_changes}, + }; + return Status::OK(); } Status AddUserKey(const Slice& user_key, const Slice& /*value*/, @@ -338,7 +338,7 @@ void TestCustomizedTablePropertiesCollector( TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) { // Test properties collectors with internal keys or regular keys // for block based table - for (bool encode_as_internal : { true, false }) { + for (bool encode_as_internal : {true, false}) { Options options; BlockBasedTableOptions table_options; table_options.flush_block_policy_factory = @@ -404,7 +404,7 @@ void TestInternalKeyPropertiesCollector( // HACK: Set options.info_log to avoid writing log in // SanitizeOptions(). options.info_log = std::make_shared(); - options = SanitizeOptions("db", // just a place holder + options = SanitizeOptions("db", // just a place holder options); ImmutableOptions ioptions(options); GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 044adc2c5ac..3878b428aa6 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -41,7 +41,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( current_status_.PermitUncheckedError(); // Clear on start reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); - SeekToStartSequence(); // Seek till starting sequence + SeekToStartSequence(); // Seek till starting sequence } Status TransactionLogIteratorImpl::OpenLogFile( @@ -62,8 +62,7 @@ Status TransactionLogIteratorImpl::OpenLogFile( // If cannot open file in DB directory. // Try the archive dir, as it could have moved in the meanwhile. 
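The hunk above (continued just below) implements an open-with-fallback: a WAL that disappears from the live directory may have been archived concurrently, so the iterator retries under the archive path before giving up. A sketch of the pattern with a hypothetical on-disk layout — RocksDB builds the real names via LogFileName() and ArchivedLogFileName():

    #include <cstdint>
    #include <fstream>
    #include <string>

    // Hypothetical layout: <dir>/<num>.log, archived under <dir>/archive/.
    std::ifstream OpenWalWithArchiveFallback(const std::string& dir,
                                             uint64_t log_number) {
      const std::string name = std::to_string(log_number) + ".log";
      std::ifstream f(dir + "/" + name, std::ios::binary);
      if (!f.is_open()) {
        // The file may have moved to the archive while we were iterating.
        f.open(dir + "/archive/" + name, std::ios::binary);
      }
      return f;
    }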
fname = ArchivedLogFileName(dir_, log_file->LogNumber()); - s = fs->NewSequentialFile(fname, optimized_env_options, - &file, nullptr); + s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr); } } if (s.ok()) { @@ -74,7 +73,7 @@ Status TransactionLogIteratorImpl::OpenLogFile( return s; } -BatchResult TransactionLogIteratorImpl::GetBatch() { +BatchResult TransactionLogIteratorImpl::GetBatch() { assert(is_valid_); // cannot call in a non valid state. BatchResult result; result.sequence = current_batch_seq_; @@ -124,8 +123,8 @@ void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, } while (RestrictedRead(&record)) { if (record.size() < WriteBatchInternal::kHeader) { - reporter_.Corruption( - record.size(), Status::Corruption("very small log record")); + reporter_.Corruption(record.size(), + Status::Corruption("very small log record")); continue; } UpdateCurrentWriteBatch(record); @@ -137,11 +136,12 @@ void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, reporter_.Info(current_status_.ToString().c_str()); return; } else if (strict) { - reporter_.Info("Could seek required sequence number. Iterator will " "continue."); + reporter_.Info( + "Could seek required sequence number. Iterator will " + "continue."); } is_valid_ = true; - started_ = true; // set started_ as we could seek till starting sequence + started_ = true; // set started_ as we could seek till starting sequence return; } else { is_valid_ = false; @@ -182,15 +182,15 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { // Runs every time until we can seek to the start sequence SeekToStartSequence(); } - while(true) { + while (true) { assert(current_log_reader_); if (current_log_reader_->IsEOF()) { current_log_reader_->UnmarkEOF(); } while (RestrictedRead(&record)) { if (record.size() < WriteBatchInternal::kHeader) { - reporter_.Corruption( - record.size(), Status::Corruption("very small log record")); + reporter_.Corruption(record.size(), + Status::Corruption("very small log record")); continue; } else { // started_ should be true if called by application diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 6ec7b14e135..e8c6efc02e4 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -23,12 +23,11 @@ namespace ROCKSDB_NAMESPACE { class LogFileImpl : public LogFile { public: LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, - uint64_t sizeBytes) : - logNumber_(logNum), - type_(logType), - startSequence_(startSeq), - sizeFileBytes_(sizeBytes) { - } + uint64_t sizeBytes) + : logNumber_(logNum), + type_(logType), + startSequence_(startSeq), + sizeFileBytes_(sizeBytes) {} std::string PathName() const override { if (type_ == kArchivedLogFile) { @@ -45,7 +44,7 @@ class LogFileImpl : public LogFile { uint64_t SizeFileBytes() const override { return sizeFileBytes_; } - bool operator < (const LogFile& that) const { + bool operator<(const LogFile& that) const { return LogNumber() < that.LogNumber(); } @@ -54,7 +53,6 @@ class LogFileImpl : public LogFile { WalFileType type_; SequenceNumber startSequence_; uint64_t sizeFileBytes_; - }; class TransactionLogIteratorImpl : public TransactionLogIterator { diff --git a/db/trim_history_scheduler.h b/db/trim_history_scheduler.h index b17f6170fcd..252802a7aea 100644 --- a/db/trim_history_scheduler.h +++ b/db/trim_history_scheduler.h @@ -6,8 +6,10 @@ #pragma once #include <stdint.h> + #include <atomic> #include <mutex> + #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { diff --git 
a/db/version_builder.cc b/db/version_builder.cc index 2c65dcf7195..4f0e3a8413c 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -28,6 +28,7 @@ #include "db/dbformat.h" #include "db/internal_stats.h" #include "db/table_cache.h" +#include "db/version_edit.h" #include "db/version_set.h" #include "port/port.h" #include "table/table_reader.h" @@ -36,25 +37,22 @@ namespace ROCKSDB_NAMESPACE { class VersionBuilder::Rep { - class NewestFirstBySeqNo { + class NewestFirstByEpochNumber { + private: + inline static const NewestFirstBySeqNo seqno_cmp; + public: bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { assert(lhs); assert(rhs); - if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { - return lhs->fd.largest_seqno > rhs->fd.largest_seqno; - } - - if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { - return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; + if (lhs->epoch_number != rhs->epoch_number) { + return lhs->epoch_number > rhs->epoch_number; + } else { + return seqno_cmp(lhs, rhs); } - - // Break ties by file number - return lhs->fd.GetNumber() > rhs->fd.GetNumber(); } }; - class BySmallestKey { public: explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {} @@ -251,7 +249,8 @@ class VersionBuilder::Rep { std::unordered_map<uint64_t, int> table_file_levels_; // Current compact cursors that should be changed after the last compaction std::unordered_map<int, InternalKey> updated_compact_cursors_; - NewestFirstBySeqNo level_zero_cmp_; + NewestFirstByEpochNumber level_zero_cmp_by_epochno_; + NewestFirstBySeqNo level_zero_cmp_by_seqno_; BySmallestKey level_nonzero_cmp_; // Mutable metadata objects for all blob files affected by the series of @@ -295,7 +294,9 @@ class VersionBuilder::Rep { if (f->refs <= 0) { if (f->table_reader_handle) { assert(table_cache_ != nullptr); - table_cache_->ReleaseHandle(f->table_reader_handle); + // NOTE: have to release via the raw cache interface to avoid using a + // TypedHandle for FileMetaData::table_reader_handle + table_cache_->get_cache().get()->Release(f->table_reader_handle); f->table_reader_handle = nullptr; } @@ -382,43 +383,60 @@ class VersionBuilder::Rep { ExpectedLinkedSsts expected_linked_ssts; if (num_levels_ > 0) { + const InternalKeyComparator* const icmp = vstorage->InternalComparator(); + EpochNumberRequirement epoch_number_requirement = + vstorage->GetEpochNumberRequirement(); + assert(icmp); // Check L0 { - auto l0_checker = [this](const FileMetaData* lhs, - const FileMetaData* rhs) { + auto l0_checker = [this, epoch_number_requirement, icmp]( + const FileMetaData* lhs, + const FileMetaData* rhs) { assert(lhs); assert(rhs); - if (!level_zero_cmp_(lhs, rhs)) { - std::ostringstream oss; - oss << "L0 files are not sorted properly: files #" - << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); - - return Status::Corruption("VersionBuilder", oss.str()); - } - - if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) { - // This is an external file that we ingested - const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno; - - if (!(external_file_seqno < lhs->fd.largest_seqno || - external_file_seqno == 0)) { + if (epoch_number_requirement == + EpochNumberRequirement::kMightMissing) { + if (!level_zero_cmp_by_seqno_(lhs, rhs)) { std::ostringstream oss; - oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " - << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno - << " vs. 
file #" << rhs->fd.GetNumber() - << " with global_seqno " << external_file_seqno; - + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << " with seqnos (largest, smallest) " + << lhs->fd.largest_seqno << " , " << lhs->fd.smallest_seqno + << ", #" << rhs->fd.GetNumber() + << " with seqnos (largest, smallest) " + << rhs->fd.largest_seqno << " , " << rhs->fd.smallest_seqno; return Status::Corruption("VersionBuilder", oss.str()); } - } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) { - std::ostringstream oss; - oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " - << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno - << " vs. file #" << rhs->fd.GetNumber() << " with seqno " - << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno; + } else if (epoch_number_requirement == + EpochNumberRequirement::kMustPresent) { + if (lhs->epoch_number == rhs->epoch_number) { + bool range_overlapped = + icmp->Compare(lhs->smallest, rhs->largest) <= 0 && + icmp->Compare(lhs->largest, rhs->smallest) >= 0; + + if (range_overlapped) { + std::ostringstream oss; + oss << "L0 files of same epoch number but overlapping range #" + << lhs->fd.GetNumber() + << " , smallest key: " << lhs->smallest.DebugString(true) + << " , largest key: " << lhs->largest.DebugString(true) + << " , epoch number: " << lhs->epoch_number << " vs. file #" + << rhs->fd.GetNumber() + << " , smallest key: " << rhs->smallest.DebugString(true) + << " , largest key: " << rhs->largest.DebugString(true) + << " , epoch number: " << rhs->epoch_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + } - return Status::Corruption("VersionBuilder", oss.str()); + if (!level_zero_cmp_by_epochno_(lhs, rhs)) { + std::ostringstream oss; + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << " with epoch number " + << lhs->epoch_number << ", #" << rhs->fd.GetNumber() + << " with epoch number " << rhs->epoch_number; + return Status::Corruption("VersionBuilder", oss.str()); + } } return Status::OK(); @@ -433,8 +451,6 @@ class VersionBuilder::Rep { } // Check L1 and up - const InternalKeyComparator* const icmp = vstorage->InternalComparator(); - assert(icmp); for (int level = 1; level < num_levels_; ++level) { auto checker = [this, level, icmp](const FileMetaData* lhs, @@ -1156,6 +1172,25 @@ class VersionBuilder::Rep { } } + bool PromoteEpochNumberRequirementIfNeeded( + VersionStorageInfo* vstorage) const { + if (vstorage->HasMissingEpochNumber()) { + return false; + } + + for (int level = 0; level < num_levels_; ++level) { + for (const auto& pair : levels_[level].added_files) { + const FileMetaData* f = pair.second; + if (f->epoch_number == kUnknownEpochNumber) { + return false; + } + } + } + + vstorage->SetEpochNumberRequirement(EpochNumberRequirement::kMustPresent); + return true; + } + void SaveSSTFilesTo(VersionStorageInfo* vstorage) const { assert(vstorage); @@ -1163,7 +1198,21 @@ class VersionBuilder::Rep { return; } - SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_); + EpochNumberRequirement epoch_number_requirement = + vstorage->GetEpochNumberRequirement(); + + if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { + bool promoted = PromoteEpochNumberRequirementIfNeeded(vstorage); + if (promoted) { + epoch_number_requirement = vstorage->GetEpochNumberRequirement(); + } + } + + if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { + SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_); + } else { + 
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_); + } for (int level = 1; level < num_levels_; ++level) { SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); @@ -1211,7 +1260,8 @@ class VersionBuilder::Rep { size_t max_file_size_for_l0_meta_pin) { assert(table_cache_ != nullptr); - size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); + size_t table_cache_capacity = + table_cache_->get_cache().get()->GetCapacity(); bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity); size_t max_load = std::numeric_limits<size_t>::max(); @@ -1233,7 +1283,7 @@ class VersionBuilder::Rep { load_limit = table_cache_capacity / 4; } - size_t table_cache_usage = table_cache_->get_cache()->GetUsage(); + size_t table_cache_usage = table_cache_->get_cache().get()->GetUsage(); if (table_cache_usage >= load_limit) { // TODO (yanqin) find a suitable status code. return Status::OK(); @@ -1272,18 +1322,18 @@ class VersionBuilder::Rep { auto* file_meta = files_meta[file_idx].first; int level = files_meta[file_idx].second; + TableCache::TypedHandle* handle = nullptr; statuses[file_idx] = table_cache_->FindTable( ReadOptions(), file_options_, - *(base_vstorage_->InternalComparator()), *file_meta, - &file_meta->table_reader_handle, prefix_extractor, false /*no_io */, - true /* record_read_stats */, + *(base_vstorage_->InternalComparator()), *file_meta, &handle, + prefix_extractor, false /*no_io */, true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, file_meta->temperature); - if (file_meta->table_reader_handle != nullptr) { + if (handle != nullptr) { + file_meta->table_reader_handle = handle; // Load table_reader - file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( - file_meta->table_reader_handle); + file_meta->fd.table_reader = table_cache_->get_cache().Value(handle); } } }); diff --git a/db/version_builder.h b/db/version_builder.h index 1c022832aa2..682d6052429 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -11,7 +11,9 @@ #include <memory> +#include "db/version_edit.h" #include "rocksdb/file_system.h" +#include "rocksdb/metadata.h" #include "rocksdb/slice_transform.h" namespace ROCKSDB_NAMESPACE { @@ -69,4 +71,22 @@ class BaseReferencedVersionBuilder { Version* version_; }; +class NewestFirstBySeqNo { + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + + if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { + return lhs->fd.largest_seqno > rhs->fd.largest_seqno; + } + + if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { + return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; + } + + // Break ties by file number + return lhs->fd.GetNumber() > rhs->fd.GetNumber(); + } +}; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index a751b697f52..611dee774b0 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -64,15 +64,16 @@ class VersionBuilderTest : public testing::Test { uint64_t num_entries = 0, uint64_t num_deletions = 0, bool sampled = false, SequenceNumber smallest_seqno = 0, SequenceNumber largest_seqno = 0, - uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber, + uint64_t epoch_number = kUnknownEpochNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, path_id, file_size, 
GetInternalKey(smallest, smallest_seq), GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -98,7 +99,8 @@ class VersionBuilderTest : public testing::Test { vstorage_.AddBlobFile(std::move(meta)); } - void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) { + void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number, + uint64_t epoch_number) { constexpr int level = 0; constexpr char smallest[] = "bar"; constexpr char largest[] = "foo"; @@ -112,11 +114,11 @@ class VersionBuilderTest : public testing::Test { Add(level, table_file_number, smallest, largest, file_size, path_id, smallest_seq, largest_seq, num_entries, num_deletions, sampled, - smallest_seq, largest_seq, blob_file_number); + smallest_seq, largest_seq, blob_file_number, epoch_number); } void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number, - uint64_t blob_file_number) { + uint64_t blob_file_number, uint64_t epoch_number) { assert(edit); constexpr int level = 0; @@ -128,12 +130,13 @@ class VersionBuilderTest : public testing::Test { constexpr SequenceNumber largest_seqno = 300; constexpr bool marked_for_compaction = false; - edit->AddFile( - level, table_file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + edit->AddFile(level, table_file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + epoch_number, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); } void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { @@ -157,7 +160,13 @@ void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) { } TEST_F(VersionBuilderTest, ApplyAndSaveTo) { - Add(0, 1U, "150", "200", 100U); + Add(0, 1U, "150", "200", 100U, /*path_id*/ 0, + /*smallest_seq*/ 100, /*largest_seq*/ 100, + /*num_entries*/ 0, /*num_deletions*/ 0, + /*sampled*/ false, /*smallest_seqno*/ 0, + /*largest_seqno*/ 0, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); Add(1, 66U, "150", "200", 100U); Add(1, 88U, "201", "300", 100U); @@ -177,8 +186,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -204,8 +213,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { 
ioptions_.level_compaction_dynamic_level_bytes = true; - Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U); - Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U); + Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 2); + Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); Add(4, 6U, "150", "179", 100U); Add(4, 7U, "180", "220", 100U); @@ -220,8 +233,9 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { version_edit.AddFile( 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -250,8 +264,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { ioptions_.level_compaction_dynamic_level_bytes = true; - Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U); - Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U); + Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 2); + Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); Add(4, 6U, "150", "179", 100U); Add(4, 7U, "180", "220", 100U); @@ -266,8 +284,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { version_edit.AddFile( 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -302,28 +320,28 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - 
kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; @@ -361,43 +379,43 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit.AddFile( 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; version_edit.AddFile( 
2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile( 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); ASSERT_OK(version_builder.Apply(&version_edit2)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -502,13 +520,13 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { constexpr bool marked_for_compaction = false; - addition.AddFile(level, file_number, path_id, file_size, - GetInternalKey(smallest, smallest_seq), - GetInternalKey(largest, largest_seq), smallest_seqno, - largest_seqno, marked_for_compaction, Temperature::kUnknown, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + addition.AddFile( + level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); ASSERT_OK(builder.Apply(&addition)); @@ -556,8 +574,8 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { new_level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); const Status s = builder.Apply(&edit); ASSERT_TRUE(s.IsCorruption()); @@ -588,12 +606,12 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + edit.AddFile( + level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); ASSERT_OK(builder.Apply(&edit)); @@ -605,8 +623,8 @@ TEST_F(VersionBuilderTest, 
ApplyFileAdditionAlreadyApplied) { new_level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); const Status s = builder.Apply(&other_edit); ASSERT_TRUE(s.IsCorruption()); @@ -641,8 +659,8 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); ASSERT_OK(builder.Apply(&addition)); @@ -691,7 +709,8 @@ TEST_F(VersionBuilderTest, ApplyBlobFileAddition) { // Add dummy table file to ensure the blob file is referenced. constexpr uint64_t table_file_number = 1; - AddDummyFileToEdit(&edit, table_file_number, blob_file_number); + AddDummyFileToEdit(&edit, table_file_number, blob_file_number, + 1 /*epoch_number*/); ASSERT_OK(builder.Apply(&edit)); @@ -813,7 +832,7 @@ TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) { ASSERT_NE(meta, nullptr); // Add dummy table file to ensure the blob file is referenced. - AddDummyFile(table_file_number, blob_file_number); + AddDummyFile(table_file_number, blob_file_number, 1 /*epoch_number*/); UpdateVersionStorageInfo(); @@ -892,7 +911,8 @@ TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) { // Add dummy table file to ensure the blob file is referenced. constexpr uint64_t table_file_number = 1; - AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + AddDummyFileToEdit(&addition, table_file_number, blob_file_number, + 1 /*epoch_number*/); ASSERT_OK(builder.Apply(&addition)); @@ -989,7 +1009,8 @@ TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) { // Add dummy table file to ensure the blob file is referenced. 
constexpr uint64_t table_file_number = 1; - AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + AddDummyFileToEdit(&addition, table_file_number, blob_file_number, + 1 /*epoch_number*/); ASSERT_OK(builder.Apply(&addition)); @@ -1050,7 +1071,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesTo) { const uint64_t table_file_number = 2 * i; const uint64_t blob_file_number = 2 * i + 1; - AddDummyFile(table_file_number, blob_file_number); + AddDummyFile(table_file_number, blob_file_number, i /*epoch_number*/); } UpdateVersionStorageInfo(); @@ -1171,7 +1192,8 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { constexpr uint64_t garbage_blob_count = 0; constexpr uint64_t garbage_blob_bytes = 0; - AddDummyFile(base_table_file_number, base_blob_file_number); + AddDummyFile(base_table_file_number, base_blob_file_number, + 1 /*epoch_number*/); AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes, checksum_method, checksum_value, BlobFileMetaData::LinkedSsts{base_table_file_number}, @@ -1206,12 +1228,12 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { constexpr uint64_t total_blob_count = 234; constexpr uint64_t total_blob_bytes = 1 << 22; - edit.AddFile(level, table_file_number, path_id, file_size, - GetInternalKey(smallest), GetInternalKey(largest), - smallest_seqno, largest_seqno, marked_for_compaction, - Temperature::kUnknown, blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - checksum_value, checksum_method, kNullUniqueId64x2); + edit.AddFile( + level, table_file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/, + checksum_value, checksum_method, kNullUniqueId64x2, 0); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); @@ -1297,8 +1319,9 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* largest_seqno */ 200, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1306,8 +1329,9 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* largest_seqno */ 200, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1527,8 +1551,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2100, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, 
kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); // Add an SST that does not reference any blob files. edit.AddFile( @@ -1537,8 +1561,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200, /* largest_seqno */ 2200, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); // Delete a file that references a blob file. edit.DeleteFile(/* level */ 1, /* file_number */ 6); @@ -1559,8 +1583,9 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 300, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1571,8 +1596,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 1300, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. 
@@ -1584,8 +1609,9 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2300, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); VersionEdit edit2; @@ -1634,7 +1660,13 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { } TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { - Add(0, 1U, "150", "200", 100U); + Add(0, 1U, "150", "200", 100, /*path_id*/ 0, + /*smallest_seq*/ 100, /*largest_seq*/ 100, + /*num_entries*/ 0, /*num_deletions*/ 0, + /*sampled*/ false, /*smallest_seqno*/ 0, + /*largest_seqno*/ 0, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); UpdateVersionStorageInfo(); @@ -1656,16 +1688,109 @@ TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { UpdateVersionStorageInfo(&new_vstorage); VersionBuilder version_builder2(env_options, &ioptions_, table_cache, - &new_vstorage, version_set); + &new_vstorage, version_set); VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, - true /* force_consistency_checks */); + kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */); ASSERT_NOK(version_builder2.Apply(&version_edit)); UnrefFilesInVersion(&new_vstorage); UnrefFilesInVersion(&new_vstorage2); } +TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { + Status s; + // To verify that files with the same epoch number but overlapping ranges + // are caught as corrupted + VersionEdit version_edit_1; + version_edit_1.AddFile( + /* level */ 0, /* file_number */ 1U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("a", 1), + /* largest */ GetInternalKey("c", 3), /* smallest_seqno */ 1, + /* largest_seqno */ 3, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); + version_edit_1.AddFile( + /* level */ 0, /* file_number */ 2U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), + /* largest */ GetInternalKey("d", 4), /* smallest_seqno */ 2, + /* largest_seqno */ 4, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); + + VersionBuilder version_builder_1(EnvOptions(), &ioptions_, + nullptr /* table_cache */, &vstorage_, + nullptr /* file_metadata_cache_res_mgr */); + VersionStorageInfo new_vstorage_1( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, + nullptr /* src_vstorage */, true /* force_consistency_checks */); + + ASSERT_OK(version_builder_1.Apply(&version_edit_1)); + s = version_builder_1.SaveTo(&new_vstorage_1); + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(std::strstr( + s.getState(), "L0 files of same epoch number but overlapping range")); + UnrefFilesInVersion(&new_vstorage_1); + + // To verify L0 files not sorted by epoch_number are caught as corrupted + VersionEdit version_edit_2; + version_edit_2.AddFile( + /* level */ 0, /* 
file_number */ 1U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("a", 1), + /* largest */ GetInternalKey("a", 1), /* smallest_seqno */ 1, + /* largest_seqno */ 1, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); + version_edit_2.AddFile( + /* level */ 0, /* file_number */ 2U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), + /* largest */ GetInternalKey("b", 2), /* smallest_seqno */ 2, + /* largest_seqno */ 2, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0); + + VersionBuilder version_builder_2(EnvOptions(), &ioptions_, + nullptr /* table_cache */, &vstorage_, + nullptr /* file_metadata_cache_res_mgr */); + VersionStorageInfo new_vstorage_2( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, + nullptr /* src_vstorage */, true /* force_consistency_checks */); + + ASSERT_OK(version_builder_2.Apply(&version_edit_2)); + s = version_builder_2.SaveTo(&new_vstorage_2); + ASSERT_TRUE(s.ok()); + + const std::vector<FileMetaData*>& l0_files = new_vstorage_2.LevelFiles(0); + ASSERT_EQ(l0_files.size(), 2); + // Manually corrupt L0 files' epoch_number + l0_files[0]->epoch_number = 1; + l0_files[1]->epoch_number = 2; + + // To surface the corruption error by applying a dummy version edit + VersionEdit dummy_version_edit; + VersionBuilder dummy_version_builder( + EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, + nullptr /* file_metadata_cache_res_mgr */); + ASSERT_OK(dummy_version_builder.Apply(&dummy_version_edit)); + s = dummy_version_builder.SaveTo(&new_vstorage_2); + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(std::strstr(s.getState(), "L0 files are not sorted properly")); + + UnrefFilesInVersion(&new_vstorage_2); +} + TEST_F(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kTotalSamples = 20; const uint32_t kNumLevels = 5; diff --git a/db/version_edit.cc b/db/version_edit.cc index 8e13aedbdc3..e751353315a 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -20,9 +20,7 @@ namespace ROCKSDB_NAMESPACE { -namespace { - -} // anonymous namespace +namespace {} // anonymous namespace uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) { assert(number <= kFileNumberMask); @@ -149,7 +147,8 @@ bool VersionEdit::EncodeTo(std::string* dst) const { bool min_log_num_written = false; for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; - if (!f.smallest.Valid() || !f.largest.Valid()) { + if (!f.smallest.Valid() || !f.largest.Valid() || + f.epoch_number == kUnknownEpochNumber) { return false; } PutVarint32(dst, kNewFile4); @@ -198,6 +197,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { &varint_file_creation_time); PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + PutVarint32(dst, NewFileCustomTag::kEpochNumber); + std::string varint_epoch_number; + PutVarint64(&varint_epoch_number, f.epoch_number); + PutLengthPrefixedSlice(dst, Slice(varint_epoch_number)); + PutVarint32(dst, NewFileCustomTag::kFileChecksum); PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); @@ -239,6 +243,13 @@ bool 
VersionEdit::EncodeTo(std::string* dst) const { std::string unique_id_str = EncodeUniqueIdBytes(&unique_id); PutLengthPrefixedSlice(dst, Slice(unique_id_str)); } + if (f.compensated_range_deletion_size) { + PutVarint32(dst, kCompensatedRangeDeletionSize); + std::string compensated_range_deletion_size; + PutVarint64(&compensated_range_deletion_size, + f.compensated_range_deletion_size); + PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); + } TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); @@ -366,6 +377,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid file creation time"; } break; + case kEpochNumber: + if (!GetVarint64(&field, &f.epoch_number)) { + return "invalid epoch number"; + } + break; case kFileChecksum: f.file_checksum = field.ToString(); break; @@ -407,6 +423,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid unique id"; } break; + case kCompensatedRangeDeletionSize: + if (!GetVarint64(&field, &f.compensated_range_deletion_size)) { + return "Invalid compensated range deletion size"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -530,8 +551,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; case kCompactCursor: - if (GetLevel(&input, &level, &msg) && - GetInternalKey(&input, &key)) { + if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { // Here we re-use the output format of compact pointer in LevelDB // to persist compact_cursors_ compact_cursors_.push_back(std::make_pair(level, key)); @@ -848,6 +868,8 @@ std::string VersionEdit::DebugString(bool hex_key) const { AppendNumberTo(&r, f.oldest_ancester_time); r.append(" file_creation_time:"); AppendNumberTo(&r, f.file_creation_time); + r.append(" epoch_number:"); + AppendNumberTo(&r, f.epoch_number); r.append(" file_checksum:"); r.append(Slice(f.file_checksum).ToString(true)); r.append(" file_checksum_func_name: "); @@ -973,6 +995,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "LargestIKey" << f.largest.DebugString(hex_key); jw << "OldestAncesterTime" << f.oldest_ancester_time; jw << "FileCreationTime" << f.file_creation_time; + jw << "EpochNumber" << f.epoch_number; jw << "FileChecksum" << Slice(f.file_checksum).ToString(true); jw << "FileChecksumFuncName" << f.file_checksum_func_name; if (f.temperature != Temperature::kUnknown) { diff --git a/db/version_edit.h b/db/version_edit.h index 4bc927a58bc..24938de0bbd 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -92,6 +92,8 @@ enum NewFileCustomTag : uint32_t { kMinTimestamp = 10, kMaxTimestamp = 11, kUniqueId = 12, + kEpochNumber = 13, + kCompensatedRangeDeletionSize = 14, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -106,6 +108,10 @@ class VersionSet; constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; constexpr uint64_t kUnknownOldestAncesterTime = 0; constexpr uint64_t kUnknownFileCreationTime = 0; +constexpr uint64_t kUnknownEpochNumber = 0; +// If `Options::allow_ingest_behind` is true, this epoch number +// will be dedicated to files ingested behind. 
+constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1; extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); @@ -118,7 +124,7 @@ struct FileDescriptor { // Table reader in table_reader_handle TableReader* table_reader; uint64_t packed_number_and_path_id; - uint64_t file_size; // File size in bytes + uint64_t file_size; // File size in bytes SequenceNumber smallest_seqno; // The smallest seqno in this file SequenceNumber largest_seqno; // The largest seqno in this file @@ -150,8 +156,8 @@ struct FileDescriptor { return packed_number_and_path_id & kFileNumberMask; } uint32_t GetPathId() const { - return static_cast<uint32_t>( - packed_number_and_path_id / (kFileNumberMask + 1)); + return static_cast<uint32_t>(packed_number_and_path_id / + (kFileNumberMask + 1)); } uint64_t GetFileSize() const { return file_size; } }; @@ -170,8 +176,8 @@ struct FileSampledStats { struct FileMetaData { FileDescriptor fd; - InternalKey smallest; // Smallest internal key served by table - InternalKey largest; // Largest internal key served by table + InternalKey smallest; // Smallest internal key served by table + InternalKey largest; // Largest internal key served by table // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle = nullptr; @@ -181,15 +187,22 @@ struct FileMetaData { // Stats for compensating deletion entries during compaction // File size compensated by deletion entry. - // This is updated in Version::UpdateAccumulatedStats() first time when the - // file is created or loaded. After it is updated (!= 0), it is immutable. + // This is used to compute a file's compaction priority, and is updated in + // Version::ComputeCompensatedSizes() the first time the file is created or + // loaded. After it is updated (!= 0), it is immutable. uint64_t compensated_file_size = 0; // These values can mutate, but they can only be read or written from // single-threaded LogAndApply thread uint64_t num_entries = 0; // the number of entries. - uint64_t num_deletions = 0; // the number of deletion entries. + // The number of deletion entries, including range deletions. + uint64_t num_deletions = 0; uint64_t raw_key_size = 0; // total uncompressed key size. uint64_t raw_value_size = 0; // total uncompressed value size. + uint64_t num_range_deletions = 0; + // This is computed during Flush/Compaction, and is added to + // `compensated_file_size`. Currently, this estimates the size of keys in the + // next level covered by range tombstones in this file. + uint64_t compensated_range_deletion_size = 0; int refs = 0; // Reference count @@ -214,6 +227,12 @@ struct FileMetaData { // Unix time when the SST file is created. uint64_t file_creation_time = kUnknownFileCreationTime; + // The order in which a file was flushed or ingested/imported. + // A compaction output file is assigned the minimum `epoch_number` + // among its input files'. + // For L0, a larger `epoch_number` indicates a newer L0 file. 
+ uint64_t epoch_number = kUnknownEpochNumber; + // File checksum std::string file_checksum = kUnknownFileChecksum; @@ -231,17 +250,20 @@ struct FileMetaData { const SequenceNumber& largest_seq, bool marked_for_compact, Temperature _temperature, uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - const std::string& _file_checksum, + uint64_t _epoch_number, const std::string& _file_checksum, const std::string& _file_checksum_func_name, - UniqueId64x2 _unique_id) + UniqueId64x2 _unique_id, + const uint64_t _compensated_range_deletion_size) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), + compensated_range_deletion_size(_compensated_range_deletion_size), marked_for_compaction(marked_for_compact), temperature(_temperature), oldest_blob_file_number(oldest_blob_file), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), + epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), unique_id(std::move(_unique_id)) { @@ -264,6 +286,7 @@ struct FileMetaData { if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { largest = end; } + assert(icmp.Compare(smallest, largest) <= 0); fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); fd.largest_seqno = std::max(fd.largest_seqno, seqno); } @@ -316,15 +339,11 @@ struct FileMetaData { struct FdWithKeyRange { FileDescriptor fd; FileMetaData* file_metadata; // Point to all metadata - Slice smallest_key; // slice that contain smallest key - Slice largest_key; // slice that contain largest key + Slice smallest_key; // slice that contain smallest key + Slice largest_key; // slice that contain largest key FdWithKeyRange() - : fd(), - file_metadata(nullptr), - smallest_key(), - largest_key() { - } + : fd(), file_metadata(nullptr), smallest_key(), largest_key() {} FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key, FileMetaData* _file_metadata) @@ -456,17 +475,19 @@ class VersionEdit { const SequenceNumber& largest_seqno, bool marked_for_compaction, Temperature temperature, uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, uint64_t file_creation_time, - const std::string& file_checksum, + uint64_t epoch_number, const std::string& file_checksum, const std::string& file_checksum_func_name, - const UniqueId64x2& unique_id) { + const UniqueId64x2& unique_id, + const uint64_t compensated_range_deletion_size) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, FileMetaData(file, file_path_id, file_size, smallest, largest, smallest_seqno, largest_seqno, marked_for_compaction, temperature, oldest_blob_file_number, oldest_ancester_time, - file_creation_time, file_checksum, file_checksum_func_name, - unique_id)); + file_creation_time, epoch_number, file_checksum, + file_checksum_func_name, unique_id, + compensated_range_deletion_size)); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index e237d5e8911..1e0a934eb85 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -14,6 +14,7 @@ #include "db/blob/blob_file_reader.h" #include "db/blob/blob_source.h" +#include "db/version_edit.h" #include "logging/logging.h" #include "monitoring/persistent_stats_history.h" @@ -154,7 +155,7 @@ VersionEditHandler::VersionEditHandler( bool read_only, std::vector<ColumnFamilyDescriptor> column_families, VersionSet* version_set, 
bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer, - bool skip_load_table_files) + bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement) : VersionEditHandlerBase(), read_only_(read_only), column_families_(std::move(column_families)), @@ -163,7 +164,8 @@ VersionEditHandler::VersionEditHandler( no_error_if_files_missing_(no_error_if_files_missing), io_tracer_(io_tracer), skip_load_table_files_(skip_load_table_files), - initialized_(false) { + initialized_(false), + epoch_number_requirement_(epoch_number_requirement) { assert(version_set_ != nullptr); } @@ -431,6 +433,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, } } } + if (s->ok()) { for (auto* cfd : *(version_set_->column_family_set_)) { if (cfd->IsDropped()) { @@ -536,7 +539,8 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, auto* builder = builder_iter->second->version_builder(); auto* v = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, - version_set_->current_version_number_++); + version_set_->current_version_number_++, + epoch_number_requirement_); s = builder->SaveTo(v->storage_info()); if (s.ok()) { // Install new version @@ -667,10 +671,12 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( bool read_only, std::vector<ColumnFamilyDescriptor> column_families, - VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer) + VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer, + EpochNumberRequirement epoch_number_requirement) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, - /*no_error_if_files_missing=*/true, io_tracer) {} + /*no_error_if_files_missing=*/true, io_tracer, + epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -829,7 +835,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, - version_set_->current_version_number_++); + version_set_->current_version_number_++, + epoch_number_requirement_); s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index fd2379b0734..fc3fe7c6b88 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -110,10 +110,13 @@ class VersionEditHandler : public VersionEditHandlerBase { const std::vector<ColumnFamilyDescriptor>& column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr<IOTracer>& io_tracer) + const std::shared_ptr<IOTracer>& io_tracer, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent) : VersionEditHandler(read_only, column_families, version_set, track_missing_files, no_error_if_files_missing, - io_tracer, /*skip_load_table_files=*/false) {} + io_tracer, /*skip_load_table_files=*/false, + epoch_number_requirement) {} ~VersionEditHandler() override {} @@ -134,7 +137,9 @@ class VersionEditHandler : public VersionEditHandlerBase { bool read_only, std::vector<ColumnFamilyDescriptor> column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr<IOTracer>& io_tracer, bool 
skip_load_table_files, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; @@ -189,6 +194,7 @@ class VersionEditHandler : public VersionEditHandlerBase { bool skip_load_table_files_; bool initialized_; std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_; + EpochNumberRequirement epoch_number_requirement_; private: Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, @@ -205,7 +211,9 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { public: VersionEditHandlerPointInTime( bool read_only, std::vector<ColumnFamilyDescriptor> column_families, - VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer); + VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); ~VersionEditHandlerPointInTime() override; protected: @@ -229,9 +237,12 @@ class ManifestTailer : public VersionEditHandlerPointInTime { public: explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families, VersionSet* version_set, - const std::shared_ptr<IOTracer>& io_tracer) + const std::shared_ptr<IOTracer>& io_tracer, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent) : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, - version_set, io_tracer), + version_set, io_tracer, + epoch_number_requirement), mode_(Mode::kRecovery) {} void PrepareToReadNewManifest() { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index c7f271d83ab..1fa6c005497 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -43,8 +43,9 @@ TEST_F(VersionEditTest, EncodeDecode) { InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, - kInvalidBlobFileNumber, 888, 678, "234", "crc32c", - kNullUniqueId64x2); + kInvalidBlobFileNumber, 888, 678, + kBig + 300 + i /* epoch_number */, "234", "crc32c", + kNullUniqueId64x2, 0); edit.DeleteFile(4, kBig + 700 + i); } @@ -63,25 +64,25 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 301 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, - 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 666, 888, 302 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, 
kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 303 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); edit.DeleteFile(4, 700); @@ -121,12 +122,13 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, - 686, 868, "234", "crc32c", kNullUniqueId64x2); + 686, 868, 301 /* epoch_number */, "234", "crc32c", + kNullUniqueId64x2, 0); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -174,8 +176,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -205,8 +207,8 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 1 /*epoch_number*/, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index af4a036a172..7023839c1b9 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -109,23 +109,19 @@ Status SerializeReplicationLogManifestWrite( // Find File in LevelFilesBrief data structure // Within an index range defined by left and right int FindFileInRange(const InternalKeyComparator& icmp, - const LevelFilesBrief& file_level, - const Slice& key, - uint32_t left, - uint32_t right) { + const LevelFilesBrief& file_level, const Slice& key, + uint32_t left, uint32_t right) { auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; }; - const auto &b = file_level.files; - return static_cast(std::lower_bound(b + left, - b + right, key, cmp) - b); + const auto& b = file_level.files; + return static_cast(std::lower_bound(b + left, b + right, key, cmp) - b); } Status OverlapWithIterator(const Comparator* ucmp, - const Slice& smallest_user_key, - const Slice& largest_user_key, - InternalIterator* iter, - bool* overlap) { + const Slice& smallest_user_key, + const Slice& largest_user_key, + InternalIterator* iter, bool* overlap) { InternalKey range_start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); iter->Seek(range_start.Encode()); @@ -203,9 +199,9 @@ class FilePicker { // Do key range filtering of files or/and fractional cascading if: // (1) not all the files are in level 0, or // (2) there are more than 3 current level files - // If there are only 3 or less current level files in the 
system, we skip - // the key range filtering. In this case, more likely, the system is - // highly tuned to minimize number of tables queried by each query, + // If there are only 3 or less current level files in the system, we + // skip the key range filtering. In this case, more likely, the system + // is highly tuned to minimize number of tables queried by each query, // so it is unlikely that key range filtering is more efficient than // querying the files. if (num_levels_ > 1 || curr_file_level_->num_files > 3) { @@ -227,11 +223,9 @@ class FilePicker { // Setup file search bound for the next level based on the // comparison results if (curr_level_ > 0) { - file_indexer_->GetNextLevelIndex(curr_level_, - curr_index_in_curr_level_, - cmp_smallest, cmp_largest, - &search_left_bound_, - &search_right_bound_); + file_indexer_->GetNextLevelIndex( + curr_level_, curr_index_in_curr_level_, cmp_smallest, + cmp_largest, &search_left_bound_, &search_right_bound_); } // Key falls out of current file's range if (cmp_smallest < 0 || cmp_largest > 0) { @@ -862,22 +856,21 @@ Version::~Version() { } int FindFile(const InternalKeyComparator& icmp, - const LevelFilesBrief& file_level, - const Slice& key) { + const LevelFilesBrief& file_level, const Slice& key) { return FindFileInRange(icmp, file_level, key, 0, static_cast(file_level.num_files)); } void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, - const std::vector& files, - Arena* arena) { + const std::vector& files, + Arena* arena) { assert(file_level); assert(arena); size_t num = files.size(); file_level->num_files = num; char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange)); - file_level->files = new (mem)FdWithKeyRange[num]; + file_level->files = new (mem) FdWithKeyRange[num]; for (size_t i = 0; i < num; i++) { Slice smallest_key = files[i]->smallest.Encode(); @@ -898,28 +891,27 @@ void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, } } -static bool AfterFile(const Comparator* ucmp, - const Slice* user_key, const FdWithKeyRange* f) { +static bool AfterFile(const Comparator* ucmp, const Slice* user_key, + const FdWithKeyRange* f) { // nullptr user_key occurs before all keys and is therefore never after *f return (user_key != nullptr && ucmp->CompareWithoutTimestamp(*user_key, ExtractUserKey(f->largest_key)) > 0); } -static bool BeforeFile(const Comparator* ucmp, - const Slice* user_key, const FdWithKeyRange* f) { +static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, + const FdWithKeyRange* f) { // nullptr user_key occurs after all keys and is therefore never before *f return (user_key != nullptr && ucmp->CompareWithoutTimestamp(*user_key, ExtractUserKey(f->smallest_key)) < 0); } -bool SomeFileOverlapsRange( - const InternalKeyComparator& icmp, - bool disjoint_sorted_files, - const LevelFilesBrief& file_level, - const Slice* smallest_user_key, - const Slice* largest_user_key) { +bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, + bool disjoint_sorted_files, + const LevelFilesBrief& file_level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { const Comparator* ucmp = icmp.user_comparator(); if (!disjoint_sorted_files) { // Need to check against all files @@ -1041,9 +1033,7 @@ class LevelIterator final : public InternalIterator { return file_iter_.iter() ? 
file_iter_.status() : Status::OK(); } - bool PrepareValue() override { - return file_iter_.PrepareValue(); - } + bool PrepareValue() override { return file_iter_.PrepareValue(); } inline bool MayBeOutOfLowerBound() override { assert(Valid()); @@ -1577,9 +1567,8 @@ Status Version::GetTableProperties(std::shared_ptr* tp, if (fname != nullptr) { file_name = *fname; } else { - file_name = - TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(), - file_meta->fd.GetPathId()); + file_name = TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(), + file_meta->fd.GetPathId()); } s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file, nullptr); @@ -1706,8 +1695,8 @@ Status Version::GetPropertiesOfTablesInRange( false); for (const auto& file_meta : files) { auto fname = - TableFileName(cfd_->ioptions()->cf_paths, - file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); + TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(), + file_meta->fd.GetPathId()); if (props->count(fname) == 0) { // 1. If the table is already present in table cache, load table // properties from there. @@ -1798,14 +1787,13 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file->stats.num_reads_sampled.load(std::memory_order_relaxed), file->being_compacted, file->temperature, file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), - file->TryGetFileCreationTime(), file->file_checksum, - file->file_checksum_func_name); + file->TryGetFileCreationTime(), file->epoch_number, + file->file_checksum, file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); } - cf_meta->levels.emplace_back( - level, level_size, std::move(files)); + cf_meta->levels.emplace_back(level, level_size, std::move(files)); cf_meta->size += level_size; } for (const auto& meta : vstorage->GetBlobFiles()) { @@ -1896,10 +1884,8 @@ uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { if (current_num_samples_ < file_count) { // casting to avoid overflowing - return - static_cast( - (est * static_cast(file_count) / current_num_samples_) - ); + return static_cast( + (est * static_cast(file_count) / current_num_samples_)); } else { return est; } @@ -2036,8 +2022,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, /*allow_unprepared_value=*/false)); - status = OverlapWithIterator( - ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); + status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, + iter.get(), overlap); if (!status.ok() || *overlap) { break; } @@ -2049,10 +2035,10 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - TableReaderCaller::kUserIterator, IsFilterSkipped(level, read_options), - level, &range_del_agg)); - status = OverlapWithIterator( - ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); + TableReaderCaller::kUserIterator, IsFilterSkipped(level, read_options), level, + &range_del_agg)); + status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, + iter.get(), overlap); } if (status.ok() && *overlap == false && @@ -2066,7 +2052,8 @@ VersionStorageInfo::VersionStorageInfo( const InternalKeyComparator* 
internal_comparator, const Comparator* user_comparator, int levels, CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage, - bool _force_consistency_checks) + bool _force_consistency_checks, + EpochNumberRequirement epoch_number_requirement) : internal_comparator_(internal_comparator), user_comparator_(user_comparator), // cfd is nullptr if Version is dummy @@ -2094,7 +2081,8 @@ VersionStorageInfo::VersionStorageInfo( current_num_samples_(0), estimated_compaction_needed_bytes_(0), finalized_(false), - force_consistency_checks_(_force_consistency_checks) { + force_consistency_checks_(_force_consistency_checks), + epoch_number_requirement_(epoch_number_requirement) { if (ref_vstorage != nullptr) { accumulated_file_size_ = ref_vstorage->accumulated_file_size_; accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; @@ -2115,7 +2103,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, const FileOptions& file_opt, const MutableCFOptions mutable_cf_options, const std::shared_ptr& io_tracer, - uint64_t version_number) + uint64_t version_number, + EpochNumberRequirement epoch_number_requirement) : env_(vset->env_), clock_(vset->clock_), cfd_(column_family_data), @@ -2134,7 +2123,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, (cfd_ == nullptr || cfd_->current() == nullptr) ? nullptr : cfd_->current()->storage_info(), - cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks), + cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks, + epoch_number_requirement), vset_(vset), next_(this), prev_(this), @@ -2396,10 +2386,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, "Encounter unexpected blob index. Please open DB with " "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); return; - case GetContext::kUnexpectedWideColumnEntity: - *status = - Status::NotSupported("Encountered unexpected wide-column entity"); - return; } f = fp.GetNextFile(); } @@ -2412,25 +2398,36 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, return; } if (!merge_operator_) { - *status = Status::InvalidArgument( + *status = Status::InvalidArgument( "merge_operator is not properly initialized."); return; } // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; - std::string* str_value = value != nullptr ? value->GetSelf() : nullptr; - *status = MergeHelper::TimedFullMerge( - merge_operator_, user_key, nullptr, merge_context->GetOperands(), - str_value, info_log_, db_statistics_, clock_, - nullptr /* result_operand */, true); - if (LIKELY(value != nullptr)) { - value->PinSelf(); + if (value || columns) { + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. 
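+ // The final merge below combines a nullptr base value with the collected
+ // operands; the result is then routed either into `value` (plain Get) or
+ // into `columns` (GetEntity), where a plain merge result is exposed under
+ // the default wide column.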
+ *status = MergeHelper::TimedFullMerge( + merge_operator_, user_key, nullptr, merge_context->GetOperands(), + &result, info_log_, db_statistics_, clock_, + /* result_operand */ nullptr, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (status->ok()) { + if (LIKELY(value != nullptr)) { + *(value->GetSelf()) = std::move(result); + value->PinSelf(); + } else { + assert(columns != nullptr); + columns->SetPlainValue(result); + } + } } } else { if (key_exists != nullptr) { *key_exists = false; } - *status = Status::NotFound(); // Use an empty error message for speed + *status = Status::NotFound(); // Use an empty error message for speed } } @@ -2531,7 +2528,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, std::vector> mget_tasks; while (f != nullptr) { MultiGetRange file_range = fp.CurrentFileRange(); - Cache::Handle* table_handle = nullptr; + TableCache::TypedHandle* table_handle = nullptr; bool skip_filters = IsFilterSkipped(static_cast(fp.GetHitFileLevel()), read_options, fp.IsHitFileLastInLevel()); @@ -2565,16 +2562,19 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, } f = fp.GetNextFileInLevel(); } - if (s.ok() && mget_tasks.size() > 0) { + if (mget_tasks.size() > 0) { RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size()); // Collect all results so far std::vector statuses = folly::coro::blockingWait( folly::coro::collectAllRange(std::move(mget_tasks)) .scheduleOn(&range->context()->executor())); - for (Status stat : statuses) { - if (!stat.ok()) { - s = stat; + if (s.ok()) { + for (Status stat : statuses) { + if (!stat.ok()) { + s = std::move(stat); + break; + } } } @@ -2657,10 +2657,13 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, // do a final merge of nullptr and operands; std::string* str_value = iter->value != nullptr ? iter->value->GetSelf() : nullptr; + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(), str_value, info_log_, db_statistics_, clock_, - nullptr /* result_operand */, true); + /* result_operand */ nullptr, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); range->AddValueSize(iter->value->size()); @@ -2710,10 +2713,9 @@ Status Version::ProcessBatch( } while (f) { MultiGetRange file_range = fp.CurrentFileRange(); - Cache::Handle* table_handle = nullptr; - bool skip_filters = - IsFilterSkipped(static_cast(fp.GetHitFileLevel()), read_options, - fp.IsHitFileLastInLevel()); + TableCache::TypedHandle* table_handle = nullptr; + bool skip_filters = IsFilterSkipped(static_cast(fp.GetHitFileLevel()), + fp.IsHitFileLastInLevel()); bool skip_range_deletions = false; if (!skip_filters) { Status status = table_cache_->MultiGetFilter( @@ -2821,6 +2823,9 @@ Status Version::MultiGetAsync( unsigned int num_tasks_queued = 0; to_process.pop_front(); if (batch->IsSearchEnded() || batch->GetRange().empty()) { + // If to_process is empty, i.e., no more batches to look at, then we need + // to schedule the enqueued coroutines and wait for them. Otherwise, we + // skip this batch and move to the next one in to_process.
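A note on the drain step referenced in the comment above: whether the batch queue empties normally or an error forces an early exit, the already-enqueued coroutine lookups must still be awaited so their statuses are not lost. A minimal sketch of that drain-and-fold pattern, mirroring the code in this patch (illustrative only; mget_tasks, s, and executor stand in for the surrounding locals and the executor obtained from the MultiGet context):

  std::vector<Status> statuses = folly::coro::blockingWait(
      folly::coro::collectAllRange(std::move(mget_tasks))
          .scheduleOn(executor));
  mget_tasks.clear();
  if (s.ok()) {
    // Fold the per-file statuses into the overall status, keeping the
    // first failure encountered.
    for (Status& stat : statuses) {
      if (!stat.ok()) {
        s = std::move(stat);
        break;
      }
    }
  }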
if (!to_process.empty()) { continue; } @@ -2829,9 +2834,6 @@ Status Version::MultiGetAsync( // to_process s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting, to_process, num_tasks_queued, mget_stats); - if (!s.ok()) { - break; - } // If ProcessBatch didn't enqueue any coroutine tasks, it means all // keys were filtered out. So put the batch back in to_process to // lookup in the next level @@ -2842,8 +2844,10 @@ Status Version::MultiGetAsync( waiting.emplace_back(idx); } } - if (to_process.empty()) { - if (s.ok() && mget_tasks.size() > 0) { + // If ProcessBatch() returned an error, then schedule the enqueued + // coroutines and wait for them, then abort the MultiGet. + if (to_process.empty() || !s.ok()) { + if (mget_tasks.size() > 0) { assert(waiting.size()); RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size()); // Collect all results so far @@ -2851,10 +2855,12 @@ Status Version::MultiGetAsync( folly::coro::collectAllRange(std::move(mget_tasks)) .scheduleOn(&range->context()->executor())); mget_tasks.clear(); - for (Status stat : statuses) { - if (!stat.ok()) { - s = stat; - break; + if (s.ok()) { + for (Status stat : statuses) { + if (!stat.ok()) { + s = std::move(stat); + break; + } } } @@ -2877,6 +2883,9 @@ Status Version::MultiGetAsync( assert(!s.ok() || waiting.size() == 0); } } + if (!s.ok()) { + break; + } } uint64_t num_levels = 0; @@ -2920,8 +2929,8 @@ bool Version::IsFilterSkipped(int level, const ReadOptions& read_options, void VersionStorageInfo::GenerateLevelFilesBrief() { level_files_brief_.resize(num_non_empty_levels_); for (int level = 0; level < num_non_empty_levels_; level++) { - DoGenerateLevelFilesBrief( - &level_files_brief_[level], files_[level], &arena_); + DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level], + &arena_); } } @@ -2955,8 +2964,7 @@ void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options, } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { - if (file_meta->init_stats_from_file || - file_meta->compensated_file_size > 0) { + if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) { return false; } std::shared_ptr tp; @@ -2974,7 +2982,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { file_meta->num_deletions = tp->num_deletions; file_meta->raw_value_size = tp->raw_value_size; file_meta->raw_key_size = tp->raw_key_size; - + file_meta->num_range_deletions = tp->num_range_deletions; return true; } @@ -3076,11 +3084,15 @@ void VersionStorageInfo::ComputeCompensatedSizes() { // size of deletion entries in a stable workload, the deletion // compensation logic might introduce unwanted effet which changes the // shape of LSM tree. 
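The change below adjusts the deletion-compensation heuristic so that only point deletions, i.e. total deletions minus range deletions, feed the half-the-entries trigger, while range deletions contribute a separately estimated byte count. A simplified sketch of the resulting arithmetic (names taken from ComputeCompensatedSizes(); average_value_size and kDeletionWeightOnCompaction come from the surrounding function):

  const uint64_t point_deletions =
      file_meta->num_deletions - file_meta->num_range_deletions;
  if (point_deletions * 2 >= file_meta->num_entries) {
    // Over-weight files dominated by point tombstones so compaction
    // prefers them.
    file_meta->compensated_file_size +=
        (point_deletions * 2 - file_meta->num_entries) * average_value_size *
        kDeletionWeightOnCompaction;
  }
  // Range deletions carry their own precomputed size estimate, added
  // unconditionally.
  file_meta->compensated_file_size +=
      file_meta->compensated_range_deletion_size;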
- if (file_meta->num_deletions * 2 >= file_meta->num_entries) { + if ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 >= + file_meta->num_entries) { file_meta->compensated_file_size += - (file_meta->num_deletions * 2 - file_meta->num_entries) * + ((file_meta->num_deletions - file_meta->num_range_deletions) * 2 - + file_meta->num_entries) * average_value_size * kDeletionWeightOnCompaction; } + file_meta->compensated_file_size += + file_meta->compensated_range_deletion_size; } } } @@ -3623,9 +3635,9 @@ struct Fsize { // In normal mode: descending size bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > - second.file->compensated_file_size); + second.file->compensated_file_size); } -} // anonymous namespace +} // anonymous namespace void VersionStorageInfo::AddFile(int level, FileMetaData* f) { auto& level_files = files_[level]; @@ -3677,13 +3689,9 @@ void VersionStorageInfo::SetFinalized() { assert(MaxBytesForLevel(level) >= max_bytes_prev_level); max_bytes_prev_level = MaxBytesForLevel(level); } - int num_empty_non_l0_level = 0; for (int level = 0; level < num_levels(); level++) { assert(LevelFiles(level).size() == 0 || LevelFiles(level).size() == LevelFilesBrief(level).num_files); - if (level > 0 && NumLevelBytes(level) > 0) { - num_empty_non_l0_level++; - } if (LevelFiles(level).size() > 0) { assert(level < num_non_empty_levels()); } @@ -3825,7 +3833,7 @@ void SortFileByRoundRobin(const InternalKeyComparator& icmp, } } } -} // namespace +} // anonymous namespace void VersionStorageInfo::UpdateFilesByCompactionPri( const ImmutableOptions& ioptions, const MutableCFOptions& options) { @@ -3998,9 +4006,7 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() { } } -void Version::Ref() { - ++refs_; -} +void Version::Ref() { ++refs_; } bool Version::Unref() { assert(refs_ >= 1); @@ -4132,9 +4138,8 @@ void VersionStorageInfo::GetCleanInputsWithinInterval( return; } - GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, - hint_index, file_index, - true /* within_interval */); + GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index, + file_index, true /* within_interval */); } // Store in "*inputs" all files in "level" that overlap [begin,end] @@ -4297,8 +4302,7 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", f->fd.GetNumber(), f->fd.smallest_seqno, sztxt, static_cast(f->being_compacted)); - if (ret < 0 || ret >= sz) - break; + if (ret < 0 || ret >= sz) break; len += ret; } // overwrite the last space (only if files_[level].size() is non-zero) @@ -4309,6 +4313,74 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, return scratch->buffer; } +bool VersionStorageInfo::HasMissingEpochNumber() const { + for (int level = 0; level < num_levels_; ++level) { + for (const FileMetaData* f : files_[level]) { + if (f->epoch_number == kUnknownEpochNumber) { + return true; + } + } + } + return false; +} + +uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const { + uint64_t max_epoch_number = kUnknownEpochNumber; + for (int level = 0; level < num_levels_; ++level) { + for (const FileMetaData* f : files_[level]) { + max_epoch_number = std::max(max_epoch_number, f->epoch_number); + } + } + return max_epoch_number; +} + +void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) { + cfd->ResetNextEpochNumber(); + + bool 
reserve_epoch_num_for_file_ingested_behind = + cfd->ioptions()->allow_ingest_behind; + if (reserve_epoch_num_for_file_ingested_behind) { + uint64_t reserved_epoch_number = cfd->NewEpochNumber(); + assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind); + ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(), + "[%s]CF has reserved epoch number %" PRIu64 + " for files ingested " + "behind since `Options::allow_ingest_behind` is true", + cfd->GetName().c_str(), reserved_epoch_number); + } + + if (HasMissingEpochNumber()) { + assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing); + assert(num_levels_ >= 1); + + for (int level = num_levels_ - 1; level >= 1; --level) { + auto& files_at_level = files_[level]; + if (files_at_level.empty()) { + continue; + } + uint64_t next_epoch_number = cfd->NewEpochNumber(); + for (FileMetaData* f : files_at_level) { + f->epoch_number = next_epoch_number; + } + } + + for (auto file_meta_iter = files_[0].rbegin(); + file_meta_iter != files_[0].rend(); file_meta_iter++) { + FileMetaData* f = *file_meta_iter; + f->epoch_number = cfd->NewEpochNumber(); + } + + ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(), + "[%s]CF's epoch numbers are inferred based on seqno", + cfd->GetName().c_str()); + epoch_number_requirement_ = EpochNumberRequirement::kMustPresent; + } else { + assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent); + cfd->SetNextEpochNumber( + std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber())); + } +} + uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; @@ -4476,13 +4548,13 @@ uint64_t VersionStorageInfo::EstimateLiveDataSize() const { // no potential overlap, we can safely insert the rest of this level // (if the level is not 0) into the map without checking again because // the elements in the level are sorted and non-overlapping. - auto lb = (found_end && l != 0) ? - ranges.end() : ranges.lower_bound(&file->smallest); + auto lb = (found_end && l != 0) ? ranges.end() + : ranges.lower_bound(&file->smallest); found_end = (lb == ranges.end()); if (found_end || internal_comparator_->Compare( - file->largest, (*lb).second->smallest) < 0) { - ranges.emplace_hint(lb, &file->largest, file); - size += file->fd.file_size; + file->largest, (*lb).second->smallest) < 0) { + ranges.emplace_hint(lb, &file->largest, file); + size += file->fd.file_size; } } } @@ -5049,10 +5121,15 @@ Status VersionSet::ProcessManifestWrites( manifest_file_size_ > db_options_->max_manifest_file_size || new_manifest_force) { TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest"); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:BeforeNewManifest", nullptr); new_descriptor_log = true; } else { pending_manifest_file_number_ = manifest_file_number_; } + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:PostDecidingCreateNewManifestOrNot", + &new_descriptor_log); // Local cached copy of state variable(s). 
WriteCurrentStateToManifest() // reads its content after releasing db mutex to avoid race with @@ -5614,7 +5691,8 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id, bool no_error_if_files_missing) { - // Read "CURRENT" file, which contains a pointer to the current manifest file + // Read "CURRENT" file, which contains a pointer to the current manifest + // file std::string manifest_path; Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, &manifest_file_number_); @@ -5648,7 +5726,8 @@ Status VersionSet::Recover( true /* checksum */, 0 /* log_number */); VersionEditHandler handler( read_only, column_families, const_cast(this), - /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_); + /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_, + EpochNumberRequirement::kMightMissing); handler.Iterate(reader, &log_read_status); s = handler.status(); if (s.ok()) { @@ -5657,6 +5736,9 @@ Status VersionSet::Recover( assert(current_manifest_file_size != 0); handler.GetDbId(db_id); } + if (s.ok()) { + RecoverEpochNumbers(); + } } if (s.ok()) { @@ -5763,7 +5845,7 @@ std::string ManifestPicker::GetNextManifest(uint64_t* number, } return ret; } -} // namespace +} // anonymous namespace Status VersionSet::TryRecover( const std::vector& column_families, bool read_only, @@ -5816,7 +5898,8 @@ Status VersionSet::TryRecoverFromOneManifest( log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, /*checksum=*/true, /*log_num=*/0); VersionEditHandlerPointInTime handler_pit( - read_only, column_families, const_cast(this), io_tracer_); + read_only, column_families, const_cast(this), io_tracer_, + EpochNumberRequirement::kMightMissing); handler_pit.Iterate(reader, &s); @@ -5825,7 +5908,21 @@ Status VersionSet::TryRecoverFromOneManifest( assert(nullptr != has_missing_table_file); *has_missing_table_file = handler_pit.HasMissingFiles(); - return handler_pit.status(); + s = handler_pit.status(); + if (s.ok()) { + RecoverEpochNumbers(); + } + return s; +} + +void VersionSet::RecoverEpochNumbers() { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + cfd->RecoverEpochNumbers(); + } } Status VersionSet::ListColumnFamilies(std::vector* column_families, @@ -5965,7 +6062,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } } - delete[] vstorage -> files_; + delete[] vstorage->files_; vstorage->files_ = new_files_list; vstorage->num_levels_ = new_levels; vstorage->ResizeCompactCursors(new_levels); @@ -5974,9 +6071,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, VersionEdit ve; InstrumentedMutex dummy_mutex; InstrumentedMutexLock l(&dummy_mutex); - return versions.LogAndApply( - versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &ve, &dummy_mutex, nullptr, true); + return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &ve, &dummy_mutex, nullptr, + true); } // Get the checksum information including the checksum and checksum function @@ -6062,9 +6159,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, std::unique_ptr file; const std::shared_ptr& fs = options.env->GetFileSystem(); s = fs->NewSequentialFile( - dscname, - fs->OptimizeForManifestRead(file_options_), &file, - nullptr); + dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr); if 
(!s.ok()) { return s; } @@ -6147,6 +6242,22 @@ Status VersionSet::WriteCurrentStateToManifest( } } + // New manifest should roll over the WAL deletion record from previous + // manifest. Otherwise, when an addition record of a deleted WAL gets added to + // this new manifest later (which can happen in, e.g., SyncWAL()), this new + // manifest creates an illusion that such WAL hasn't been deleted. + VersionEdit wal_deletions; + wal_deletions.DeleteWalsBefore(min_log_number_to_keep()); + std::string wal_deletions_record; + if (!wal_deletions.EncodeTo(&wal_deletions_record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_deletions.DebugString(true)); + } + io_s = log->AddRecord(wal_deletions_record); + if (!io_s.ok()) { + return io_s; + } + for (auto cfd : *column_family_set_) { assert(cfd); @@ -6167,8 +6278,8 @@ Status VersionSet::WriteCurrentStateToManifest( cfd->internal_comparator().user_comparator()->Name()); std::string record; if (!edit.EncodeTo(&record)) { - return Status::Corruption( - "Unable to Encode VersionEdit:" + edit.DebugString(true)); + return Status::Corruption("Unable to Encode VersionEdit:" + + edit.DebugString(true)); } io_s = log->AddRecord(record); if (!io_s.ok()) { @@ -6198,8 +6309,9 @@ Status VersionSet::WriteCurrentStateToManifest( f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, - f->file_checksum_func_name, f->unique_id); + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id, + f->compensated_range_deletion_size); } } @@ -6226,9 +6338,10 @@ edit.SetLogNumber(log_number); if (cfd->GetID() == 0) { - // min_log_number_to_keep is for the whole db, not for specific column family. - // So it does not need to be set for every column family, just need to be set once. - // Since default CF can never be dropped, we set the min_log to the default CF here. + // min_log_number_to_keep is for the whole db, not for specific column + // family. So it does not need to be set for every column family, just + // need to be set once. Since default CF can never be dropped, we set + // the min_log to the default CF here. uint64_t min_log = min_log_number_to_keep(); if (min_log != 0) { edit.SetMinLogNumberToKeep(min_log); @@ -6244,8 +6357,8 @@ std::string record; if (!edit.EncodeTo(&record)) { - return Status::Corruption( - "Unable to Encode VersionEdit:" + edit.DebugString(true)); + return Status::Corruption("Unable to Encode VersionEdit:" + + edit.DebugString(true)); } io_s = log->AddRecord(record); if (!io_s.ok()) { @@ -6276,8 +6389,9 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, const int num_non_empty_levels = vstorage->num_non_empty_levels(); end_level = (end_level == -1) ? num_non_empty_levels : std::min(end_level, num_non_empty_levels); - - assert(start_level <= end_level); + if (end_level <= start_level) { + return 0; + } // Outline of the optimization that uses options.files_size_error_margin. // When approximating the files total size that is used to store a keys range, @@ -6568,7 +6682,7 @@ InternalIterator* VersionSet::MakeInputIterator( const size_t space = (c->level() == 0 ?
c->input_levels(0)->num_files + c->num_input_levels() - 1 : c->num_input_levels()); - InternalIterator** list = new InternalIterator* [space]; + InternalIterator** list = new InternalIterator*[space]; size_t num = 0; for (size_t which = 0; which < c->num_input_levels(); which++) { if (c->input_levels(which)->num_files != 0) { @@ -6677,8 +6791,8 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.largestkey = file->largest.user_key().ToString(); filemetadata.smallest_seqno = file->fd.smallest_seqno; filemetadata.largest_seqno = file->fd.largest_seqno; - filemetadata.num_reads_sampled = file->stats.num_reads_sampled.load( - std::memory_order_relaxed); + filemetadata.num_reads_sampled = + file->stats.num_reads_sampled.load(std::memory_order_relaxed); filemetadata.being_compacted = file->being_compacted; filemetadata.num_entries = file->num_entries; filemetadata.num_deletions = file->num_deletions; @@ -6688,6 +6802,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.temperature = file->temperature; filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); filemetadata.file_creation_time = file->TryGetFileCreationTime(); + filemetadata.epoch_number = file->epoch_number; metadata->push_back(filemetadata); } } @@ -6843,16 +6958,16 @@ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, InternalStats* internal_stats = cfd->internal_stats(); + TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - ReadOptions(), file_opts, *icmp, meta_copy, - &(meta_copy.table_reader_handle), pe, + ReadOptions(), file_opts, *icmp, meta_copy, &handle, pe, /*no_io=*/false, /*record_read_stats=*/true, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, meta_copy.temperature); - if (meta_copy.table_reader_handle) { - table_cache->ReleaseHandle(meta_copy.table_reader_handle); + if (handle) { + table_cache->get_cache().Release(handle); } } return status; @@ -6890,12 +7005,17 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); assert(reader); - manifest_tailer_.reset(new ManifestTailer( - column_families, const_cast(this), io_tracer_)); + manifest_tailer_.reset( + new ManifestTailer(column_families, const_cast(this), + io_tracer_, EpochNumberRequirement::kMightMissing)); manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); - return manifest_tailer_->status(); + s = manifest_tailer_->status(); + if (s.ok()) { + RecoverEpochNumbers(); + } + return s; } Status ReactiveVersionSet::ReadAndApply( diff --git a/db/version_set.h b/db/version_set.h index eed894d6877..b3bc17991a9 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -116,6 +116,10 @@ extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); +enum EpochNumberRequirement { + kMightMissing, + kMustPresent, +}; // Information of the storage associated with each Version, including number of // levels of LSM tree, files information at each level, files marked for @@ -126,7 +130,9 @@ class VersionStorageInfo { const Comparator* user_comparator, int num_levels, CompactionStyle compaction_style, VersionStorageInfo* src_vstorage, - bool _force_consistency_checks); + bool _force_consistency_checks, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); // No copying allowed 
VersionStorageInfo(const VersionStorageInfo&) = delete; void operator=(const VersionStorageInfo&) = delete; @@ -230,9 +236,7 @@ class VersionStorageInfo { double blob_garbage_collection_age_cutoff, double blob_garbage_collection_force_threshold); - bool level0_non_overlapping() const { - return level0_non_overlapping_; - } + bool level0_non_overlapping() const { return level0_non_overlapping_; } // Updates the oldest snapshot and related internal state, like the bottommost // files marked for compaction. @@ -321,6 +325,17 @@ class VersionStorageInfo { return files_[level]; } + bool HasMissingEpochNumber() const; + uint64_t GetMaxEpochNumberOfFiles() const; + EpochNumberRequirement GetEpochNumberRequirement() const { + return epoch_number_requirement_; + } + void SetEpochNumberRequirement( + EpochNumberRequirement epoch_number_requirement) { + epoch_number_requirement_ = epoch_number_requirement; + } + void RecoverEpochNumbers(ColumnFamilyData* cfd); + class FileLocation { public: FileLocation() = default; @@ -442,6 +457,11 @@ class VersionStorageInfo { return files_marked_for_compaction_; } + void TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) { + f->marked_for_compaction = true; + files_marked_for_compaction_.emplace_back(level, f); + } + // REQUIRES: ComputeCompactionScore has been called // REQUIRES: DB mutex held during access const autovector>& ExpiredTtlFiles() const { @@ -725,6 +745,8 @@ class VersionStorageInfo { // is compiled in release mode bool force_consistency_checks_; + EpochNumberRequirement epoch_number_requirement_; + friend class Version; friend class VersionSet; }; @@ -814,8 +836,8 @@ class Version { Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&, const Slice& smallest_user_key, - const Slice& largest_user_key, - int level, bool* overlap); + const Slice& largest_user_key, int level, + bool* overlap); // Lookup the value for key or get all merge operands for key. // If do_merge = true (default) then lookup value for key. @@ -1001,7 +1023,7 @@ class Version { int hit_file_level, bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f, std::unordered_map& blob_ctxs, - Cache::Handle* table_handle, uint64_t& num_filter_read, + TableCache::TypedHandle* table_handle, uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_sst_read); #ifdef USE_COROUTINES @@ -1033,10 +1055,10 @@ class Version { const MergeOperator* merge_operator_; VersionStorageInfo storage_info_; - VersionSet* vset_; // VersionSet to which this Version belongs - Version* next_; // Next version in linked list - Version* prev_; // Previous version in linked list - int refs_; // Number of live refs to this version + VersionSet* vset_; // VersionSet to which this Version belongs + Version* next_; // Next version in linked list + Version* prev_; // Previous version in linked list + int refs_; // Number of live refs to this version const FileOptions file_options_; const MutableCFOptions mutable_cf_options_; // Cached value to avoid recomputing it on every read. 
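For context on how kMightMissing recovery behaves: when files come back from an older manifest without epoch numbers, VersionStorageInfo::RecoverEpochNumbers() in this patch infers them instead of failing. A simplified sketch of the inference rule (files_, num_levels_, and cfd mirror the members and helpers used above; not the verbatim implementation):

  // Each non-empty level from the bottom up gets one fresh epoch number,
  // so older data receives smaller epoch numbers than newer data.
  for (int level = num_levels_ - 1; level >= 1; --level) {
    if (files_[level].empty()) {
      continue;
    }
    const uint64_t epoch = cfd->NewEpochNumber();
    for (FileMetaData* f : files_[level]) {
      f->epoch_number = epoch;
    }
  }
  // L0 is ordered newest-first, so walk it in reverse to number files
  // from oldest to newest.
  for (auto it = files_[0].rbegin(); it != files_[0].rend(); ++it) {
    (*it)->epoch_number = cfd->NewEpochNumber();
  }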
@@ -1050,7 +1072,9 @@ class Version { Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, MutableCFOptions mutable_cf_options, const std::shared_ptr& io_tracer, - uint64_t version_number = 0); + uint64_t version_number = 0, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); ~Version(); @@ -1191,6 +1215,10 @@ class VersionSet { const std::vector& column_families, bool read_only, std::string* db_id, bool* has_missing_table_file); + // Recover the next epoch number of each CF and the epoch numbers + // of its files (if missing) + void RecoverEpochNumbers(); + // Reads a manifest file and returns a list of column families in // column_families. static Status ListColumnFamilies(std::vector* column_families, @@ -1408,12 +1436,12 @@ class VersionSet { FileMetaData** metadata, ColumnFamilyData** cfd); // This function doesn't support leveldb SST filenames - void GetLiveFilesMetaData(std::vector *metadata); + void GetLiveFilesMetaData(std::vector* metadata); void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) { assert(table_cache_); - table_cache_->Erase(GetSlice(&blob_file_number)); + table_cache_->Erase(GetSliceForKey(&blob_file_number)); obsolete_blob_files_.emplace_back(blob_file_number, std::move(path)); } diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h index 86f0c207718..51f58cdad0c 100644 --- a/db/version_set_sync_and_async.h +++ b/db/version_set_sync_and_async.h @@ -16,7 +16,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) (const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level, bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f, std::unordered_map& blob_ctxs, - Cache::Handle* table_handle, uint64_t& num_filter_read, + TableCache::TypedHandle* table_handle, uint64_t& num_filter_read, uint64_t& num_index_read, uint64_t& num_sst_read) { bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; @@ -141,11 +141,6 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); file_range.MarkKeyDone(iter); continue; - case GetContext::kUnexpectedWideColumnEntity: - *status = - Status::NotSupported("Encountered unexpected wide-column entity"); - file_range.MarkKeyDone(iter); - continue; } } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index d38e3ad731f..9234a4d880c 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -14,6 +14,7 @@ #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/log_writer.h" +#include "db/version_edit.h" #include "rocksdb/advanced_options.h" #include "rocksdb/convenience.h" #include "rocksdb/file_system.h" @@ -32,7 +33,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { LevelFilesBrief file_level_; Arena arena_; - GenerateLevelFilesBriefTest() { } + GenerateLevelFilesBriefTest() {} ~GenerateLevelFilesBriefTest() override { for (size_t i = 0; i < files_.size(); i++) { @@ -49,8 +50,8 @@ class GenerateLevelFilesBriefTest : public testing::Test { InternalKey(largest, largest_seq, kTypeValue), smallest_seq, largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName,
kNullUniqueId64x2, 0); files_.push_back(f); } @@ -142,24 +143,27 @@ class VersionStorageInfoTestBase : public testing::Test { void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 0, - uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber, + uint64_t compensated_range_deletion_size = 0) { constexpr SequenceNumber dummy_seq = 0; Add(level, file_number, GetInternalKey(smallest, dummy_seq), - GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number); + GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number, + compensated_range_deletion_size); } void Add(int level, uint32_t file_number, const InternalKey& smallest, const InternalKey& largest, uint64_t file_size = 0, - uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber, + uint64_t compensated_range_deletion_size = 0) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0, /* largest_seq */ 0, /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); - f->compensated_file_size = file_size; + kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, compensated_range_deletion_size); vstorage_.AddFile(level, f); } @@ -481,7 +485,8 @@ TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { // Two files that overlap at the range deletion tombstone sentinel. - Add(1, 1U, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1); + Add(1, 1U, {"a", 0, kTypeValue}, + {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1); Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1); // Two files that overlap at the same user key. 
Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1); @@ -492,24 +497,26 @@ TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { UpdateVersionStorageInfo(); - ASSERT_EQ("1,2", GetOverlappingFiles( - 1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue})); - ASSERT_EQ("1", GetOverlappingFiles( - 1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion})); - ASSERT_EQ("2", GetOverlappingFiles( - 1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue})); - ASSERT_EQ("3,4", GetOverlappingFiles( - 1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue})); - ASSERT_EQ("3", GetOverlappingFiles( - 1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion})); - ASSERT_EQ("3,4", GetOverlappingFiles( - 1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue})); - ASSERT_EQ("3,4", GetOverlappingFiles( - 1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue})); - ASSERT_EQ("5", GetOverlappingFiles( - 1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue})); - ASSERT_EQ("6", GetOverlappingFiles( - 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); + ASSERT_EQ("1,2", + GetOverlappingFiles(1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue})); + ASSERT_EQ("1", + GetOverlappingFiles(1, {"a", 0, kTypeValue}, + {"b", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("2", GetOverlappingFiles(1, {"b", kMaxSequenceNumber, kTypeValue}, + {"c", 0, kTypeValue})); + ASSERT_EQ("3,4", + GetOverlappingFiles(1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue})); + ASSERT_EQ("3", + GetOverlappingFiles(1, {"d", 0, kTypeValue}, + {"e", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("3,4", GetOverlappingFiles(1, {"e", kMaxSequenceNumber, kTypeValue}, + {"f", 0, kTypeValue})); + ASSERT_EQ("3,4", + GetOverlappingFiles(1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue})); + ASSERT_EQ("5", + GetOverlappingFiles(1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue})); + ASSERT_EQ("6", + GetOverlappingFiles(1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); } TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) { @@ -925,13 +932,13 @@ class FindLevelFileTest : public testing::Test { bool disjoint_sorted_files_; Arena arena_; - FindLevelFileTest() : disjoint_sorted_files_(true) { } + FindLevelFileTest() : disjoint_sorted_files_(true) {} ~FindLevelFileTest() override {} void LevelFileInit(size_t num = 0) { char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange)); - file_level_.files = new (mem)FdWithKeyRange[num]; + file_level_.files = new (mem) FdWithKeyRange[num]; file_level_.num_files = 0; } @@ -944,19 +951,18 @@ class FindLevelFileTest : public testing::Test { Slice smallest_slice = smallest_key.Encode(); Slice largest_slice = largest_key.Encode(); - char* mem = arena_.AllocateAligned( - smallest_slice.size() + largest_slice.size()); + char* mem = + arena_.AllocateAligned(smallest_slice.size() + largest_slice.size()); memcpy(mem, smallest_slice.data(), smallest_slice.size()); memcpy(mem + smallest_slice.size(), largest_slice.data(), - largest_slice.size()); + largest_slice.size()); // add to file_level_ size_t num = file_level_.num_files; auto& file = file_level_.files[num]; file.fd = FileDescriptor(num + 1, 0, 0); file.smallest_key = Slice(mem, smallest_slice.size()); - file.largest_key = Slice(mem + smallest_slice.size(), - largest_slice.size()); + file.largest_key = Slice(mem + smallest_slice.size(), largest_slice.size()); file_level_.num_files++; } @@ -980,10 +986,10 @@ TEST_F(FindLevelFileTest, LevelEmpty) { LevelFileInit(0); ASSERT_EQ(0, Find("foo")); - ASSERT_TRUE(! 
Overlaps("a", "z")); - ASSERT_TRUE(! Overlaps(nullptr, "z")); - ASSERT_TRUE(! Overlaps("a", nullptr)); - ASSERT_TRUE(! Overlaps(nullptr, nullptr)); + ASSERT_TRUE(!Overlaps("a", "z")); + ASSERT_TRUE(!Overlaps(nullptr, "z")); + ASSERT_TRUE(!Overlaps("a", nullptr)); + ASSERT_TRUE(!Overlaps(nullptr, nullptr)); } TEST_F(FindLevelFileTest, LevelSingle) { @@ -997,8 +1003,8 @@ TEST_F(FindLevelFileTest, LevelSingle) { ASSERT_EQ(1, Find("q1")); ASSERT_EQ(1, Find("z")); - ASSERT_TRUE(! Overlaps("a", "b")); - ASSERT_TRUE(! Overlaps("z1", "z2")); + ASSERT_TRUE(!Overlaps("a", "b")); + ASSERT_TRUE(!Overlaps("z1", "z2")); ASSERT_TRUE(Overlaps("a", "p")); ASSERT_TRUE(Overlaps("a", "q")); ASSERT_TRUE(Overlaps("a", "z")); @@ -1010,8 +1016,8 @@ TEST_F(FindLevelFileTest, LevelSingle) { ASSERT_TRUE(Overlaps("q", "q")); ASSERT_TRUE(Overlaps("q", "q1")); - ASSERT_TRUE(! Overlaps(nullptr, "j")); - ASSERT_TRUE(! Overlaps("r", nullptr)); + ASSERT_TRUE(!Overlaps(nullptr, "j")); + ASSERT_TRUE(!Overlaps("r", nullptr)); ASSERT_TRUE(Overlaps(nullptr, "p")); ASSERT_TRUE(Overlaps(nullptr, "p1")); ASSERT_TRUE(Overlaps("q", nullptr)); @@ -1043,10 +1049,10 @@ TEST_F(FindLevelFileTest, LevelMultiple) { ASSERT_EQ(3, Find("450")); ASSERT_EQ(4, Find("451")); - ASSERT_TRUE(! Overlaps("100", "149")); - ASSERT_TRUE(! Overlaps("251", "299")); - ASSERT_TRUE(! Overlaps("451", "500")); - ASSERT_TRUE(! Overlaps("351", "399")); + ASSERT_TRUE(!Overlaps("100", "149")); + ASSERT_TRUE(!Overlaps("251", "299")); + ASSERT_TRUE(!Overlaps("451", "500")); + ASSERT_TRUE(!Overlaps("351", "399")); ASSERT_TRUE(Overlaps("100", "150")); ASSERT_TRUE(Overlaps("100", "200")); @@ -1065,8 +1071,8 @@ TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) { Add("200", "250"); Add("300", "350"); Add("400", "450"); - ASSERT_TRUE(! Overlaps(nullptr, "149")); - ASSERT_TRUE(! Overlaps("451", nullptr)); + ASSERT_TRUE(!Overlaps(nullptr, "149")); + ASSERT_TRUE(!Overlaps("451", nullptr)); ASSERT_TRUE(Overlaps(nullptr, nullptr)); ASSERT_TRUE(Overlaps(nullptr, "150")); ASSERT_TRUE(Overlaps(nullptr, "199")); @@ -1084,8 +1090,8 @@ TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) { LevelFileInit(1); Add("200", "200", 5000, 3000); - ASSERT_TRUE(! Overlaps("199", "199")); - ASSERT_TRUE(! Overlaps("201", "300")); + ASSERT_TRUE(!Overlaps("199", "199")); + ASSERT_TRUE(!Overlaps("201", "300")); ASSERT_TRUE(Overlaps("200", "200")); ASSERT_TRUE(Overlaps("190", "200")); ASSERT_TRUE(Overlaps("200", "210")); @@ -1097,8 +1103,8 @@ TEST_F(FindLevelFileTest, LevelOverlappingFiles) { Add("150", "600"); Add("400", "500"); disjoint_sorted_files_ = false; - ASSERT_TRUE(! Overlaps("100", "149")); - ASSERT_TRUE(! Overlaps("601", "700")); + ASSERT_TRUE(!Overlaps("100", "149")); + ASSERT_TRUE(!Overlaps("601", "700")); ASSERT_TRUE(Overlaps("100", "150")); ASSERT_TRUE(Overlaps("100", "200")); ASSERT_TRUE(Overlaps("100", "300")); @@ -2132,6 +2138,17 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { } } +TEST_F(VersionStorageInfoTest, AddRangeDeletionCompensatedFileSize) { + // Tests that compensated range deletion size is added to compensated file + // size. 
+ Add(4, 100U, "1", "2", 100U, kInvalidBlobFileNumber, 1000U); + + UpdateVersionStorageInfo(); + + auto meta = vstorage_.GetFileMetaDataByNumber(100U); + ASSERT_EQ(meta->compensated_file_size, 100U + 1000U); +} + class VersionSetWithTimestampTest : public VersionSetTest { public: static const std::string kNewCfName; @@ -3189,15 +3206,19 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, std::string column_family; std::string key; // the only key int level = 0; + uint64_t epoch_number; SstInfo(uint64_t file_num, const std::string& cf_name, - const std::string& _key) - : SstInfo(file_num, cf_name, _key, 0) {} + const std::string& _key, + uint64_t _epoch_number = kUnknownEpochNumber) + : SstInfo(file_num, cf_name, _key, 0, _epoch_number) {} SstInfo(uint64_t file_num, const std::string& cf_name, - const std::string& _key, int lvl) + const std::string& _key, int lvl, + uint64_t _epoch_number = kUnknownEpochNumber) : file_number(file_num), column_family(cf_name), key(_key), - level(lvl) {} + level(lvl), + epoch_number(_epoch_number) {} }; // Create dummy sst, return their metadata. Note that only file name and size @@ -3233,8 +3254,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ASSERT_NE(0, file_size); file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, Temperature::kUnknown, 0, 0, - 0, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + 0, info.epoch_number, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, + 0); } } @@ -3271,11 +3293,11 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { std::vector existing_files = { - SstInfo(100, kDefaultColumnFamilyName, "a"), - SstInfo(102, kDefaultColumnFamilyName, "b"), - SstInfo(103, kDefaultColumnFamilyName, "c"), - SstInfo(107, kDefaultColumnFamilyName, "d"), - SstInfo(110, kDefaultColumnFamilyName, "e")}; + SstInfo(100, kDefaultColumnFamilyName, "a", 100 /* epoch_number */), + SstInfo(102, kDefaultColumnFamilyName, "b", 102 /* epoch_number */), + SstInfo(103, kDefaultColumnFamilyName, "c", 103 /* epoch_number */), + SstInfo(107, kDefaultColumnFamilyName, "d", 107 /* epoch_number */), + SstInfo(110, kDefaultColumnFamilyName, "e", 110 /* epoch_number */)}; std::vector file_metas; CreateDummyTableFiles(existing_files, &file_metas); @@ -3286,10 +3308,12 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { std::string largest_ukey = "b"; InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue); InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue); + FileMetaData meta = FileMetaData( file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + file_num /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3319,11 +3343,16 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { std::vector existing_files = { - SstInfo(100, kDefaultColumnFamilyName, "a"), - SstInfo(102, kDefaultColumnFamilyName, "b"), - SstInfo(103, kDefaultColumnFamilyName, "c"), - SstInfo(107, kDefaultColumnFamilyName, "d"), - SstInfo(110, kDefaultColumnFamilyName, "e")}; + SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* 
epoch_number */), + SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */, + 102 /* epoch_number */), + SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */, + 103 /* epoch_number */), + SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */, + 107 /* epoch_number */), + SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */, + 110 /* epoch_number */)}; std::vector file_metas; CreateDummyTableFiles(existing_files, &file_metas); @@ -3344,7 +3373,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { FileMetaData meta = FileMetaData( file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + file_num /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3379,11 +3409,16 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { TEST_F(VersionSetTestMissingFiles, NoFileMissing) { std::vector existing_files = { - SstInfo(100, kDefaultColumnFamilyName, "a"), - SstInfo(102, kDefaultColumnFamilyName, "b"), - SstInfo(103, kDefaultColumnFamilyName, "c"), - SstInfo(107, kDefaultColumnFamilyName, "d"), - SstInfo(110, kDefaultColumnFamilyName, "e")}; + SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* epoch_number */), + SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */, + 102 /* epoch_number */), + SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */, + 103 /* epoch_number */), + SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */, + 107 /* epoch_number */), + SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */, + 110 /* epoch_number */)}; std::vector file_metas; CreateDummyTableFiles(existing_files, &file_metas); @@ -3433,7 +3468,8 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { db_options_.allow_2pc = true; NewDB(); - SstInfo sst(100, kDefaultColumnFamilyName, "a"); + SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* epoch_number */); std::vector file_metas; CreateDummyTableFiles({sst}, &file_metas); diff --git a/db/wal_manager.cc b/db/wal_manager.cc index ed76905d4ed..a6060235f21 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -355,7 +355,8 @@ Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, // Binary Search. avoid opening all files. while (end >= start) { int64_t mid = start + (end - start) / 2; // Avoid overflow. - SequenceNumber current_seq_num = all_logs.at(static_cast(mid))->StartSequence(); + SequenceNumber current_seq_num = + all_logs.at(static_cast(mid))->StartSequence(); if (current_seq_num == target) { end = mid; break; @@ -366,7 +367,8 @@ Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, } } // end could be -ve. 
-  size_t start_index = static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+  size_t start_index =
+      static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
   // The last wal file is always included
   all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
   return Status::OK();
@@ -468,9 +470,8 @@ Status WalManager::ReadFirstLine(const std::string& fname,
   };

   std::unique_ptr<FSSequentialFile> file;
-  Status status = fs_->NewSequentialFile(fname,
-                                         fs_->OptimizeForLogRead(file_options_),
-                                         &file, nullptr);
+  Status status = fs_->NewSequentialFile(
+      fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
   std::unique_ptr<SequentialFileReader> file_reader(
       new SequentialFileReader(std::move(file), fname, io_tracer_));

diff --git a/db/wal_manager.h b/db/wal_manager.h
index 771c4849544..8cc0679357a 100644
--- a/db/wal_manager.h
+++ b/db/wal_manager.h
@@ -11,11 +11,11 @@
 #include <atomic>
 #include <deque>
 #include <limits>
+#include <memory>
 #include <set>
+#include <string>
 #include <utility>
 #include <vector>
-#include <memory>
-#include <string>

 #include "db/version_set.h"
 #include "file/file_util.h"
diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc
index e3c96c90cdd..4ad4e9749a1 100644
--- a/db/wal_manager_test.cc
+++ b/db/wal_manager_test.cc
@@ -69,7 +69,7 @@ class WalManagerTest : public testing::Test {
   // NOT thread safe
   void Put(const std::string& key, const std::string& value) {
     assert(current_log_writer_.get() != nullptr);
-    uint64_t seq =  versions_->LastSequence() + 1;
+    uint64_t seq = versions_->LastSequence() + 1;
     WriteBatch batch;
     ASSERT_OK(batch.Put(key, value));
     WriteBatchInternal::SetSequence(&batch, seq);
@@ -88,7 +88,8 @@ class WalManagerTest : public testing::Test {
     std::unique_ptr<WritableFileWriter> file_writer;
     ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer,
                                          nullptr));
-    current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false));
+    current_log_writer_.reset(
+        new log::Writer(std::move(file_writer), 0, false));
   }

   void CreateArchiveLogs(int num_logs, int entries_per_log) {
@@ -215,7 +216,7 @@ int CountRecords(TransactionLogIterator* iter) {
   EXPECT_OK(iter->status());
   return count;
 }
-}  // namespace
+}  // anonymous namespace

 TEST_F(WalManagerTest, WALArchivalSizeLimit) {
   db_options_.WAL_ttl_seconds = 0;
diff --git a/db/wide/db_wide_basic_test.cc b/db/wide/db_wide_basic_test.cc
index bddc5717fd7..1ffe314fef7 100644
--- a/db/wide/db_wide_basic_test.cc
+++ b/db/wide/db_wide_basic_test.cc
@@ -209,28 +209,234 @@ TEST_F(DBWideBasicTest, PutEntityColumnFamily) {
   ASSERT_OK(db_->Write(WriteOptions(), &batch));
 }

-TEST_F(DBWideBasicTest, PutEntityMergeNotSupported) {
+TEST_F(DBWideBasicTest, MergePlainKeyValue) {
   Options options = GetDefaultOptions();
+  options.create_if_missing = true;
   options.merge_operator = MergeOperators::CreateStringAppendOperator();
   Reopen(options);

+  // Put + Merge
   constexpr char first_key[] = "first";
+  constexpr char first_base_value[] = "hello";
+  constexpr char first_merge_op[] = "world";
+
+  // Delete + Merge
   constexpr char second_key[] = "second";
+  constexpr char second_merge_op[] = "foo";
+
+  // Merge without any preceding KV
+  constexpr char third_key[] = "third";
+  constexpr char third_merge_op[] = "bar";
+
+  auto write_base = [&]() {
+    // Write "base" KVs: a Put for the 1st key and a Delete for the 2nd one;
+    // note there is no "base" KV for the 3rd
+    ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+                       first_base_value));
+    ASSERT_OK(
+        db_->Delete(WriteOptions(), db_->DefaultColumnFamily(), second_key));
+  };
+
+  auto write_merge = [&]() {
+    // Write Merge operands
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+                         first_merge_op));
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+                         second_merge_op));
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+                         third_merge_op));
+  };
+
+  const std::string expected_first_column(std::string(first_base_value) + "," +
+                                          first_merge_op);
+  const WideColumns expected_first_columns{
+      {kDefaultWideColumnName, expected_first_column}};
+  const WideColumns expected_second_columns{
+      {kDefaultWideColumnName, second_merge_op}};
+  const WideColumns expected_third_columns{
+      {kDefaultWideColumnName, third_merge_op}};

-  // Note: Merge is currently not supported for wide-column entities
   auto verify = [&]() {
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               first_key, &result));
+      ASSERT_EQ(result.columns(), expected_first_columns);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               second_key, &result));
+      ASSERT_EQ(result.columns(), expected_second_columns);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               third_key, &result));
+
+      ASSERT_EQ(result.columns(), expected_third_columns);
+    }
+
+    {
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+      iter->SeekToFirst();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_first_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_second_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), third_key);
+      ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_third_columns);
+
+      iter->Next();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+
+      iter->SeekToLast();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), third_key);
+      ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_third_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_second_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+      ASSERT_EQ(iter->columns(), expected_first_columns);
+
+      iter->Prev();
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+    }
+  };
+
+  {
+    // Base KVs (if any) and Merge operands both in memtable (note: we take a
+    // snapshot in between to make sure they do not get reconciled during the
+    // subsequent flush)
+    write_base();
+    ManagedSnapshot snapshot(db_);
+    write_merge();
+    verify();
+
+    // Base KVs (if any) and Merge operands both in storage
+    ASSERT_OK(Flush());
+    verify();
+  }
+
+  // Base KVs (if any) in storage, Merge operands in memtable
+  DestroyAndReopen(options);
+  write_base();
+  ASSERT_OK(Flush());
+  write_merge();
+  verify();
+}
+
+TEST_F(DBWideBasicTest, MergeEntity) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+
+  const std::string delim("|");
+  options.merge_operator = MergeOperators::CreateStringAppendOperator(delim);
+
+  Reopen(options);
+
+  // Test Merge with two entities: one that has the default column and one that
+  // doesn't
+  constexpr char first_key[] = "first";
+  WideColumns first_columns{{kDefaultWideColumnName, "a"},
+                            {"attr_name1", "foo"},
+                            {"attr_name2", "bar"}};
+  constexpr char first_merge_operand[] = "bla1";
+
+  constexpr char second_key[] = "second";
+  WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+  constexpr char second_merge_operand[] = "bla2";
+
+  auto write_base = [&]() {
+    // Use the DB::PutEntity API
+    ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+                             first_key, first_columns));
+
+    // Use WriteBatch
+    WriteBatch batch;
+    ASSERT_OK(batch.PutEntity(db_->DefaultColumnFamily(), second_key,
+                              second_columns));
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  };
+
+  auto write_merge = [&]() {
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+                         first_merge_operand));
+    ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+                         second_merge_operand));
+  };
+
+  const std::string first_expected_default(first_columns[0].value().ToString() +
+                                           delim + first_merge_operand);
+  const std::string second_expected_default(delim + second_merge_operand);
+
+  auto verify_basic = [&]() {
+    WideColumns first_expected_columns{
+        {kDefaultWideColumnName, first_expected_default},
+        first_columns[1],
+        first_columns[2]};
+
+    WideColumns second_expected_columns{
+        {kDefaultWideColumnName, second_expected_default},
+        second_columns[0],
+        second_columns[1]};

     {
       PinnableSlice result;
-      ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
-                           &result)
-                      .IsNotSupported());
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+                         &result));
+      ASSERT_EQ(result, first_expected_default);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               first_key, &result));
+      ASSERT_EQ(result.columns(), first_expected_columns);
     }

     {
       PinnableSlice result;
-      ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(),
-                           second_key, &result)
-                      .IsNotSupported());
+      ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+                         &result));
+      ASSERT_EQ(result, second_expected_default);
+    }
+
+    {
+      PinnableWideColumns result;
+      ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                               second_key, &result));
+      ASSERT_EQ(result.columns(), second_expected_columns);
     }

     {
@@ -243,68 +449,148 @@ TEST_F(DBWideBasicTest, PutEntityMergeNotSupported) {
       db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
                     &keys[0], &values[0], &statuses[0]);

-      ASSERT_TRUE(values[0].empty());
-      ASSERT_TRUE(statuses[0].IsNotSupported());
+      ASSERT_EQ(values[0], first_expected_default);
+      ASSERT_OK(statuses[0]);

-      ASSERT_TRUE(values[1].empty());
-      ASSERT_TRUE(statuses[1].IsNotSupported());
+      ASSERT_EQ(values[1], second_expected_default);
+      ASSERT_OK(statuses[1]);
     }

     {
       std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));

       iter->SeekToFirst();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), first_expected_default);
+      ASSERT_EQ(iter->columns(), first_expected_columns);
+
+      iter->Next();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), second_expected_default);
+      ASSERT_EQ(iter->columns(), second_expected_columns);
+
+      iter->Next();
       ASSERT_FALSE(iter->Valid());
-      ASSERT_TRUE(iter->status().IsNotSupported());
+      ASSERT_OK(iter->status());

       iter->SeekToLast();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), second_key);
+      ASSERT_EQ(iter->value(), second_expected_default);
+      ASSERT_EQ(iter->columns(), second_expected_columns);
+
+      iter->Prev();
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(iter->key(), first_key);
+      ASSERT_EQ(iter->value(), first_expected_default);
+      ASSERT_EQ(iter->columns(), first_expected_columns);
+
+      iter->Prev();
       ASSERT_FALSE(iter->Valid());
-      ASSERT_TRUE(iter->status().IsNotSupported());
+      ASSERT_OK(iter->status());
     }
   };

-  // Use the DB::PutEntity API
-  WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+  auto verify_merge_ops_pre_compaction = [&]() {
+    constexpr size_t num_merge_operands = 2;

-  ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
-                           first_key, first_columns));
+    GetMergeOperandsOptions get_merge_opts;
+    get_merge_opts.expected_max_number_of_operands = num_merge_operands;

-  // Use WriteBatch
-  WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;

-  WriteBatch batch;
-  ASSERT_OK(
-      batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns));
-  ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      first_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));

-  ASSERT_OK(Flush());
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_EQ(merge_operands[0], first_columns[0].value());
+      ASSERT_EQ(merge_operands[1], first_merge_operand);
+    }

-  // Add a couple of merge operands
-  constexpr char merge_operand[] = "bla";
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;

-  ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
-                       merge_operand));
-  ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
-                       merge_operand));
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      second_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));

-  // Try reading when PutEntity is in storage, Merge is in memtable
-  verify();
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_TRUE(merge_operands[0].empty());
+      ASSERT_EQ(merge_operands[1], second_merge_operand);
+    }
+  };

-  // Try reading when PutEntity and Merge are both in storage
-  ASSERT_OK(Flush());
+  auto verify_merge_ops_post_compaction = [&]() {
+    constexpr size_t num_merge_operands = 1;

-  verify();
+    GetMergeOperandsOptions get_merge_opts;
+    get_merge_opts.expected_max_number_of_operands = num_merge_operands;

-  // Try reading when PutEntity and Merge are both in memtable
-  ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
-                           first_key, first_columns));
-  ASSERT_OK(db_->Write(WriteOptions(), &batch));
-  ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
-                       merge_operand));
-  ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
-                       merge_operand));
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;

-  verify();
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      first_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));
+
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_EQ(merge_operands[0], first_expected_default);
+    }
+
+    {
+      std::array<PinnableSlice, num_merge_operands> merge_operands;
+      int number_of_operands = 0;
+
+      ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+                                      second_key, &merge_operands[0],
+                                      &get_merge_opts, &number_of_operands));
+
+      ASSERT_EQ(number_of_operands, num_merge_operands);
+      ASSERT_EQ(merge_operands[0], second_expected_default);
+    }
+  };
+
+  {
+    // Base KVs and Merge operands both in memtable (note: we take a snapshot
+    // in between to make sure they do not get reconciled during the subsequent
+    // flush)
+    write_base();
+    ManagedSnapshot snapshot(db_);
+    write_merge();
+    verify_basic();
+    verify_merge_ops_pre_compaction();
+
+    // Base KVs and Merge operands both in storage
+    ASSERT_OK(Flush());
+    verify_basic();
+    verify_merge_ops_pre_compaction();
+  }
+
+  // Base KVs in storage, Merge operands in memtable
+  DestroyAndReopen(options);
+  write_base();
+  ASSERT_OK(Flush());
+  write_merge();
+  verify_basic();
+  verify_merge_ops_pre_compaction();
+
+  // Flush and compact
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+                              /* end */ nullptr));
+  verify_basic();
+  verify_merge_ops_post_compaction();
 }

 TEST_F(DBWideBasicTest, PutEntityTimestampError) {
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 65a96a9d1db..4d310d9ea55 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -158,7 +158,7 @@ struct BatchContentClassifier : public WriteBatch::Handler {
   }
 };

-}  // anon namespace
+}  // anonymous namespace

 struct SavePoints {
   std::stack<SavePoint, autovector<SavePoint>> stack;
@@ -231,18 +231,16 @@ WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
   return *this;
 }

-WriteBatch::~WriteBatch() { }
+WriteBatch::~WriteBatch() {}

-WriteBatch::Handler::~Handler() { }
+WriteBatch::Handler::~Handler() {}

 void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
   // If the user has not specified something to do with blobs, then we ignore
   // them.
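   // (Overriding handlers may consume the blob instead, e.g. as a marker for
   // replication or debugging; LogData payloads are not applied to the DB
   // state itself.)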
 }

-bool WriteBatch::Handler::Continue() {
-  return true;
-}
+bool WriteBatch::Handler::Continue() { return true; }

 void WriteBatch::Clear() {
   rep_.clear();
@@ -779,7 +777,7 @@ Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
   }
   return Status::OK();
 }
-}  // namespace
+}  // anonymous namespace

 Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
                                const Slice& key, const Slice& value) {
@@ -1481,8 +1479,27 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
     return WriteBatchInternal::Merge(this, cf_id, key, value);
   }

-  return Status::InvalidArgument(
-      "Cannot call this method on column family enabling timestamp");
+  needs_in_place_update_ts_ = true;
+  has_key_with_ts_ = true;
+  std::string dummy_ts(ts_sz, '\0');
+  std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+
+  return WriteBatchInternal::Merge(
+      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                         const Slice& ts, const Slice& value) {
+  const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+  if (!s.ok()) {
+    return s;
+  }
+  has_key_with_ts_ = true;
+  assert(column_family);
+  uint32_t cf_id = column_family->GetID();
+  std::array<Slice, 2> key_with_ts{{key, ts}};
+  return WriteBatchInternal::Merge(
+      this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
 }

 Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
@@ -1727,7 +1744,6 @@ Status WriteBatch::VerifyChecksum() const {

 namespace {

 class MemTableInserter : public WriteBatch::Handler {
-
   SequenceNumber sequence_;
   ColumnFamilyMemTables* const cf_mems_;
   FlushScheduler* const flush_scheduler_;
@@ -1738,7 +1754,7 @@ class MemTableInserter : public WriteBatch::Handler {
   uint64_t log_number_ref_;
   DBImpl* db_;
   const bool concurrent_memtable_writes_;
-  bool       post_info_created_;
+  bool post_info_created_;
   const WriteBatch::ProtectionInfo* prot_info_;
   size_t prot_info_idx_;
@@ -1764,8 +1780,8 @@ class MemTableInserter : public WriteBatch::Handler {
   // Whether this batch was unprepared or not
   bool unprepared_batch_;
   using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
-  DupDetector   duplicate_detector_;
-  bool          dup_dectector_on_;
+  DupDetector duplicate_detector_;
+  bool dup_dectector_on_;

   bool hint_per_batch_;
   bool hint_created_;
@@ -1785,7 +1801,7 @@ class MemTableInserter : public WriteBatch::Handler {

   MemPostInfoMap& GetPostMap() {
     assert(concurrent_memtable_writes_);
-    if(!post_info_created_) {
+    if (!post_info_created_) {
       new (&mem_post_info_map_) MemPostInfoMap();
       post_info_created_ = true;
     }
@@ -1799,8 +1815,8 @@ class MemTableInserter : public WriteBatch::Handler {
       new (&duplicate_detector_) DuplicateDetector(db_);
       dup_dectector_on_ = true;
     }
-    return reinterpret_cast<DuplicateDetector*>
-        (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_);
+    return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+        ->IsDuplicateKeySeq(column_family_id, key, sequence_);
   }

   const ProtectionInfoKVOC64* NextProtectionInfo() {
@@ -1876,12 +1892,11 @@ class MemTableInserter : public WriteBatch::Handler {

   ~MemTableInserter() override {
     if (dup_dectector_on_) {
-      reinterpret_cast<DuplicateDetector*>
-          (&duplicate_detector_)->~DuplicateDetector();
+      reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+          ->~DuplicateDetector();
     }
     if (post_info_created_) {
-      reinterpret_cast<MemPostInfoMap*>
-          (&mem_post_info_map_)->~MemPostInfoMap();
+      reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)->~MemPostInfoMap();
     }
     if (hint_created_) {
       for (auto iter : GetHintMap()) {
@@ -1923,7 +1938,7 @@ class MemTableInserter : public WriteBatch::Handler {
     assert(concurrent_memtable_writes_);
     // If post info was not created there is nothing
     // to process and no need to create on demand
-    if(post_info_created_) {
+    if (post_info_created_) {
       for (auto& pair : GetPostMap()) {
         pair.first->BatchPostProcess(pair.second);
       }
@@ -2036,6 +2051,7 @@ class MemTableInserter : public WriteBatch::Handler {
         if (cf_handle == nullptr) {
           cf_handle = db_->DefaultColumnFamily();
         }
+        // TODO (yanqin): fix when user-defined timestamp is enabled.
         get_status = db_->Get(ropts, cf_handle, key, &prev_value);
       }
       // Intentionally overwrites the `NotFound` in `ret_status`.
@@ -2483,10 +2499,14 @@ class MemTableInserter : public WriteBatch::Handler {
       assert(merge_operator);

       std::string new_value;
+      // `op_failure_scope` (an output parameter) is not provided (set to
+      // nullptr) since a failure must be propagated regardless of its value.
       Status merge_status = MergeHelper::TimedFullMerge(
           merge_operator, key, &get_value_slice, {value}, &new_value,
           moptions->info_log, moptions->statistics,
-          SystemClock::Default().get());
+          SystemClock::Default().get(), /* result_operand */ nullptr,
+          /* update_num_ops_stats */ false,
+          /* op_failure_scope */ nullptr);

       if (!merge_status.ok()) {
         // Failed to merge!
@@ -2844,7 +2864,7 @@ class MemTableInserter : public WriteBatch::Handler {
   }
 };

-}  // namespace
+}  // anonymous namespace

 // This function can only be called in these conditions:
 // 1) During Recovery()
@@ -3015,7 +3035,7 @@ class ProtectionInfoUpdater : public WriteBatch::Handler {
   WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
 };

-}  // namespace
+}  // anonymous namespace

 Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
   return SetContents(b, contents.ToString());
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
index 2b86b947cf1..524f4f283f6 100644
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@@ -77,7 +77,6 @@ struct WriteBatch::ProtectionInfo {
 // WriteBatch that we don't want in the public WriteBatch interface.
 class WriteBatchInternal {
  public:
-
   // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
   static constexpr size_t kHeader = 12;

@@ -149,17 +148,13 @@ class WriteBatchInternal {
   // This offset is only valid if the batch is not empty.
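   // (The first record starts immediately after the 12-byte header, i.e. at
   // offset kHeader above, which is why the offset is meaningless for an
   // empty batch.)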
static size_t GetFirstOffset(WriteBatch* batch); - static Slice Contents(const WriteBatch* batch) { - return Slice(batch->rep_); - } + static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); } static std::string StealContents(WriteBatch* batch) { return std::move(batch->rep_); } - static size_t ByteSize(const WriteBatch* batch) { - return batch->rep_.size(); - } + static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); } static Status SetContents(WriteBatch* batch, const Slice& contents); static Status SetContents(WriteBatch* batch, std::string contents); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 9e436cf50bd..d233853e21a 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -174,9 +174,10 @@ TEST_F(WriteBatchTest, Corruption) { Slice contents = WriteBatchInternal::Contents(&batch); ASSERT_OK(WriteBatchInternal::SetContents( &batch, Slice(contents.data(), contents.size() - 1))); - ASSERT_EQ("Put(foo, bar)@200" - "Corruption: bad WriteBatch Delete", - PrintContents(&batch)); + ASSERT_EQ( + "Put(foo, bar)@200" + "Corruption: bad WriteBatch Delete", + PrintContents(&batch)); } TEST_F(WriteBatchTest, Append) { @@ -184,28 +185,28 @@ TEST_F(WriteBatchTest, Append) { WriteBatchInternal::SetSequence(&b1, 200); WriteBatchInternal::SetSequence(&b2, 300); ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); - ASSERT_EQ("", - PrintContents(&b1)); + ASSERT_EQ("", PrintContents(&b1)); ASSERT_EQ(0u, b1.Count()); ASSERT_OK(b2.Put("a", "va")); ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); - ASSERT_EQ("Put(a, va)@200", - PrintContents(&b1)); + ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); ASSERT_EQ(1u, b1.Count()); b2.Clear(); ASSERT_OK(b2.Put("b", "vb")); ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); - ASSERT_EQ("Put(a, va)@200" - "Put(b, vb)@201", - PrintContents(&b1)); + ASSERT_EQ( + "Put(a, va)@200" + "Put(b, vb)@201", + PrintContents(&b1)); ASSERT_EQ(2u, b1.Count()); ASSERT_OK(b2.Delete("foo")); ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); - ASSERT_EQ("Put(a, va)@200" - "Put(b, vb)@202" - "Put(b, vb)@201" - "Delete(foo)@203", - PrintContents(&b1)); + ASSERT_EQ( + "Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203", + PrintContents(&b1)); ASSERT_EQ(4u, b1.Count()); b2.Clear(); ASSERT_OK(b2.Put("c", "cc")); @@ -247,89 +248,88 @@ TEST_F(WriteBatchTest, SingleDeletion) { } namespace { - struct TestHandler : public WriteBatch::Handler { - std::string seen; - Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { - if (column_family_id == 0) { - seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; - } else { - seen += "PutCF(" + std::to_string(column_family_id) + ", " + - key.ToString() + ", " + value.ToString() + ")"; - } - return Status::OK(); - } - Status DeleteCF(uint32_t column_family_id, const Slice& key) override { - if (column_family_id == 0) { - seen += "Delete(" + key.ToString() + ")"; - } else { - seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + - key.ToString() + ")"; - } - return Status::OK(); - } - Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { - if (column_family_id == 0) { - seen += "SingleDelete(" + key.ToString() + ")"; - } else { - seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " + - key.ToString() + ")"; - } - return Status::OK(); - } - Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, - const Slice& end_key) override { - if (column_family_id == 
0) { - seen += "DeleteRange(" + begin_key.ToString() + ", " + - end_key.ToString() + ")"; - } else { - seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " + - begin_key.ToString() + ", " + end_key.ToString() + ")"; - } - return Status::OK(); - } - Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { - if (column_family_id == 0) { - seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; - } else { - seen += "MergeCF(" + std::to_string(column_family_id) + ", " + - key.ToString() + ", " + value.ToString() + ")"; - } - return Status::OK(); - } - void LogData(const Slice& blob) override { - seen += "LogData(" + blob.ToString() + ")"; - } - Status MarkBeginPrepare(bool unprepare) override { - seen += - "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")"; - return Status::OK(); - } - Status MarkEndPrepare(const Slice& xid) override { - seen += "MarkEndPrepare(" + xid.ToString() + ")"; - return Status::OK(); +struct TestHandler : public WriteBatch::Handler { + std::string seen; + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id == 0) { + seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "PutCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; } - Status MarkNoop(bool empty_batch) override { - seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")"; - return Status::OK(); + return Status::OK(); + } + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id == 0) { + seen += "Delete(" + key.ToString() + ")"; + } else { + seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ")"; } - Status MarkCommit(const Slice& xid) override { - seen += "MarkCommit(" + xid.ToString() + ")"; - return Status::OK(); + return Status::OK(); + } + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + if (column_family_id == 0) { + seen += "SingleDelete(" + key.ToString() + ")"; + } else { + seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ")"; } - Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override { - seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " + - ts.ToString(true) + ")"; - return Status::OK(); + return Status::OK(); + } + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + if (column_family_id == 0) { + seen += "DeleteRange(" + begin_key.ToString() + ", " + + end_key.ToString() + ")"; + } else { + seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " + + begin_key.ToString() + ", " + end_key.ToString() + ")"; } - Status MarkRollback(const Slice& xid) override { - seen += "MarkRollback(" + xid.ToString() + ")"; - return Status::OK(); + return Status::OK(); + } + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + if (column_family_id == 0) { + seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; + } else { + seen += "MergeCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + value.ToString() + ")"; } - }; -} + return Status::OK(); + } + void LogData(const Slice& blob) override { + seen += "LogData(" + blob.ToString() + ")"; + } + Status MarkBeginPrepare(bool unprepare) override { + seen += + "MarkBeginPrepare(" + std::string(unprepare ? 
"true" : "false") + ")"; + return Status::OK(); + } + Status MarkEndPrepare(const Slice& xid) override { + seen += "MarkEndPrepare(" + xid.ToString() + ")"; + return Status::OK(); + } + Status MarkNoop(bool empty_batch) override { + seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")"; + return Status::OK(); + } + Status MarkCommit(const Slice& xid) override { + seen += "MarkCommit(" + xid.ToString() + ")"; + return Status::OK(); + } + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override { + seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " + + ts.ToString(true) + ")"; + return Status::OK(); + } + Status MarkRollback(const Slice& xid) override { + seen += "MarkRollback(" + xid.ToString() + ")"; + return Status::OK(); + } +}; +} // anonymous namespace TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; @@ -609,24 +609,25 @@ TEST_F(WriteBatchTest, PutGatherSlices) { { // Try a write where the key is one slice but the value is two Slice key_slice("baz"); - Slice value_slices[2] = { Slice("header"), Slice("payload") }; + Slice value_slices[2] = {Slice("header"), Slice("payload")}; ASSERT_OK( batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2))); } { // One where the key is composite but the value is a single slice - Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; + Slice key_slices[3] = {Slice("key"), Slice("part2"), Slice("part3")}; Slice value_slice("value"); ASSERT_OK( batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1))); } WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ("Put(baz, headerpayload)@101" - "Put(foo, bar)@100" - "Put(keypart2part3, value)@102", - PrintContents(&batch)); + ASSERT_EQ( + "Put(baz, headerpayload)@101" + "Put(foo, bar)@100" + "Put(keypart2part3, value)@102", + PrintContents(&batch)); ASSERT_EQ(3u, batch.Count()); } @@ -646,7 +647,7 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { uint32_t id_; const Comparator* const ucmp_ = BytewiseComparator(); }; -} // namespace anonymous +} // anonymous namespace TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { WriteBatch batch; @@ -948,7 +949,7 @@ Status CheckTimestampsInWriteBatch( TimestampChecker ts_checker(cf_to_ucmps, timestamp); return wb.Iterate(&ts_checker); } -} // namespace +} // anonymous namespace TEST_F(WriteBatchTest, SanityChecks) { ColumnFamilyHandleImplDummy cf0(0, @@ -961,14 +962,14 @@ TEST_F(WriteBatchTest, SanityChecks) { ASSERT_TRUE(wb.Put(nullptr, "key", "ts", "value").IsInvalidArgument()); ASSERT_TRUE(wb.Delete(nullptr, "key", "ts").IsInvalidArgument()); ASSERT_TRUE(wb.SingleDelete(nullptr, "key", "ts").IsInvalidArgument()); - ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsNotSupported()); + ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsInvalidArgument()); ASSERT_TRUE(wb.DeleteRange(nullptr, "begin_key", "end_key", "ts") .IsInvalidArgument()); ASSERT_TRUE(wb.Put(&cf4, "key", "ts", "value").IsInvalidArgument()); ASSERT_TRUE(wb.Delete(&cf4, "key", "ts").IsInvalidArgument()); ASSERT_TRUE(wb.SingleDelete(&cf4, "key", "ts").IsInvalidArgument()); - ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsNotSupported()); + ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsInvalidArgument()); ASSERT_TRUE( wb.DeleteRange(&cf4, "begin_key", "end_key", "ts").IsInvalidArgument()); @@ -978,7 +979,7 @@ TEST_F(WriteBatchTest, SanityChecks) { ASSERT_TRUE(wb.Put(&cf0, "key", ts, "value").IsInvalidArgument()); ASSERT_TRUE(wb.Delete(&cf0, "key", ts).IsInvalidArgument()); 
ASSERT_TRUE(wb.SingleDelete(&cf0, "key", ts).IsInvalidArgument()); - ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsNotSupported()); + ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsInvalidArgument()); ASSERT_TRUE( wb.DeleteRange(&cf0, "begin_key", "end_key", ts).IsInvalidArgument()); diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 16061739ef4..e6ebaae08c7 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -5,6 +5,8 @@ #ifndef ROCKSDB_LITE +#include "db/write_callback.h" + #include #include #include @@ -12,7 +14,6 @@ #include #include "db/db_impl/db_impl.h" -#include "db/write_callback.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" @@ -37,11 +38,11 @@ class WriteCallbackTestWriteCallback1 : public WriteCallback { public: bool was_called = false; - Status Callback(DB *db) override { + Status Callback(DB* db) override { was_called = true; // Make sure db is a DBImpl - DBImpl* db_impl = dynamic_cast (db); + DBImpl* db_impl = dynamic_cast(db); if (db_impl == nullptr) { return Status::InvalidArgument(""); } @@ -397,7 +398,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { Status s = DB::Open(options, dbname, &db); ASSERT_OK(s); - db_impl = dynamic_cast (db); + db_impl = dynamic_cast(db); ASSERT_TRUE(db_impl); WriteBatch wb; diff --git a/db/write_controller.h b/db/write_controller.h index c32b70b9418..bcead165b34 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -9,6 +9,7 @@ #include #include + #include "rocksdb/rate_limiter.h" namespace ROCKSDB_NAMESPACE { diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 69c2418e956..b6321a3bc9b 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -20,7 +20,7 @@ class TimeSetClock : public SystemClockWrapper { uint64_t now_micros_ = 6666; uint64_t NowNanos() override { return now_micros_ * std::milli::den; } }; -} // namespace +} // anonymous namespace class WriteControllerTest : public testing::Test { public: WriteControllerTest() { clock_ = std::make_shared(); } diff --git a/db/write_thread.cc b/db/write_thread.cc index 40580710a4f..de1744cf048 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -360,8 +360,11 @@ void WriteThread::EndWriteStall() { // Unlink write_stall_dummy_ from the write queue. This will unblock // pending write threads to enqueue themselves assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_); - assert(write_stall_dummy_.link_older != nullptr); - write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer; + // write_stall_dummy_.link_older can be nullptr only if LockWAL() has been + // called. + if (write_stall_dummy_.link_older) { + write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer; + } newest_writer_.exchange(write_stall_dummy_.link_older); // Wake up writers @@ -397,8 +400,9 @@ void WriteThread::JoinBatchGroup(Writer* w) { * writes in parallel. 
    */
   TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
-  AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
-                    STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+  AwaitState(w,
+             STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+                 STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
              &jbg_ctx);
   TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
 }
@@ -595,10 +599,10 @@ void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
   }
 }

-static WriteThread::AdaptationContext cpmtw_ctx("CompleteParallelMemTableWriter");
+static WriteThread::AdaptationContext cpmtw_ctx(
+    "CompleteParallelMemTableWriter");
 // This method is called by both the leader and parallel followers
 bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
-
   auto* write_group = w->write_group;
   if (!w->status.ok()) {
     std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
@@ -718,8 +722,9 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
       SetState(new_leader, STATE_GROUP_LEADER);
     }

-    AwaitState(leader, STATE_MEMTABLE_WRITER_LEADER |
-                           STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+    AwaitState(leader,
+               STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER |
+                   STATE_COMPLETED,
                &eabgl_ctx);
   } else {
     Writer* head = newest_writer_.load(std::memory_order_acquire);
diff --git a/db_stress_tool/batched_ops_stress.cc b/db_stress_tool/batched_ops_stress.cc
index 1f87e752e01..3f34460762a 100644
--- a/db_stress_tool/batched_ops_stress.cc
+++ b/db_stress_tool/batched_ops_stress.cc
@@ -18,8 +18,8 @@ class BatchedOpsStressTest : public StressTest {

   bool IsStateTracked() const override { return false; }

-  // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
-  // ("9"+K, "9"+V) in DB atomically i.e in a single batch.
+  // Given a key K and value V, this puts ("0"+K, V+"0"), ("1"+K, V+"1"), ...,
+  // ("9"+K, V+"9") in DB atomically, i.e., in a single batch.
   // Also refer to BatchedOpsStressTest::TestGet
   Status TestPut(ThreadState* thread, WriteOptions& write_opts,
                  const ReadOptions& /* read_opts */,
@@ -29,12 +29,12 @@ class BatchedOpsStressTest : public StressTest {
     assert(!rand_column_families.empty());
     assert(!rand_keys.empty());

-    const std::string key_suffix = Key(rand_keys[0]);
+    const std::string key_body = Key(rand_keys[0]);

     const uint32_t value_base =
         thread->rand.Next() % thread->shared->UNKNOWN_SENTINEL;
     const size_t sz = GenerateValue(value_base, value, sizeof(value));
-    const std::string value_suffix = Slice(value, sz).ToString();
+    const std::string value_body = Slice(value, sz).ToString();

     WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
                      FLAGS_batch_protection_bytes_per_key,
@@ -44,10 +44,14 @@ class BatchedOpsStressTest : public StressTest {
     assert(cfh);

     for (int i = 9; i >= 0; --i) {
-      const std::string prefix = std::to_string(i);
+      const std::string num = std::to_string(i);

-      const std::string k = prefix + key_suffix;
-      const std::string v = prefix + value_suffix;
+      // Note: the digit in num is prepended to the key; however, it is appended
+      // to the value because we want the "value base" to be encoded uniformly
+      // at the beginning of the value for all types of stress tests (e.g.
+      // batched, non-batched, CF consistency).
+      const std::string k = num + key_body;
+      const std::string v = value_body + num;

       if (FLAGS_use_merge) {
         batch.Merge(cfh, k, v);
@@ -72,7 +76,7 @@ class BatchedOpsStressTest : public StressTest {
     return s;
   }

-  // Given a key K, this deletes ("0"+K), ("1"+K),... ("9"+K)
("9"+K) + // Given a key K, this deletes ("0"+K), ("1"+K), ..., ("9"+K) // in DB atomically i.e in a single batch. Also refer MultiGet. Status TestDelete(ThreadState* thread, WriteOptions& writeoptions, const std::vector& rand_column_families, @@ -122,9 +126,9 @@ class BatchedOpsStressTest : public StressTest { std::terminate(); } - // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K + // Given a key K, this gets values for "0"+K, "1"+K, ..., "9"+K // in the same snapshot, and verifies that all the values are of the form - // "0"+V, "1"+V,..."9"+V. + // V+"0", V+"1", ..., V+"9". // ASSUMES that BatchedOpsStressTest::TestPut was used to put (K, V) into // the DB. Status TestGet(ThreadState* thread, const ReadOptions& readoptions, @@ -156,13 +160,19 @@ class BatchedOpsStressTest : public StressTest { } else { values[i] = from_db; - char expected_prefix = (keys[i])[0]; - char actual_prefix = (values[i])[0]; - if (actual_prefix != expected_prefix) { - fprintf(stderr, "error expected prefix = %c actual = %c\n", - expected_prefix, actual_prefix); + assert(!keys[i].empty()); + assert(!values[i].empty()); + + const char expected = keys[i].front(); + const char actual = values[i].back(); + + if (expected != actual) { + fprintf(stderr, "get error expected = %c actual = %c\n", expected, + actual); } - (values[i])[0] = ' '; // blank out the differing character + + values[i].pop_back(); // get rid of the differing character + thread->stats.AddGets(1, 1); } } @@ -171,7 +181,7 @@ class BatchedOpsStressTest : public StressTest { // Now that we retrieved all values, check that they all match for (int i = 1; i < 10; i++) { if (values[i] != values[0]) { - fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n", + fprintf(stderr, "get error: inconsistent values for key %s: %s, %s\n", key.ToString(true).c_str(), StringToHex(values[0]).c_str(), StringToHex(values[i]).c_str()); // we continue after error rather than exiting so that we can @@ -188,8 +198,8 @@ class BatchedOpsStressTest : public StressTest { const std::vector& rand_keys) override { size_t num_keys = rand_keys.size(); std::vector ret_status(num_keys); - std::array keys = {{"0", "1", "2", "3", "4", - "5", "6", "7", "8", "9"}}; + std::array keys = { + {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}}; size_t num_prefixes = keys.size(); for (size_t rand_key = 0; rand_key < num_keys; ++rand_key) { std::vector key_slices; @@ -214,7 +224,7 @@ class BatchedOpsStressTest : public StressTest { for (size_t i = 0; i < num_prefixes; i++) { Status s = statuses[i]; if (!s.ok() && !s.IsNotFound()) { - fprintf(stderr, "get error: %s\n", s.ToString().c_str()); + fprintf(stderr, "multiget error: %s\n", s.ToString().c_str()); thread->stats.AddErrors(1); ret_status[rand_key] = s; // we continue after error rather than exiting so that we can @@ -223,17 +233,19 @@ class BatchedOpsStressTest : public StressTest { thread->stats.AddGets(1, 0); ret_status[rand_key] = s; } else { - char expected_prefix = (keys[i])[0]; - char actual_prefix = (values[i])[0]; - if (actual_prefix != expected_prefix) { - fprintf(stderr, "error expected prefix = %c actual = %c\n", - expected_prefix, actual_prefix); + assert(!keys[i].empty()); + assert(!values[i].empty()); + + const char expected = keys[i][0]; + const char actual = values[i][values[i].size() - 1]; + + if (expected != actual) { + fprintf(stderr, "multiget error expected = %c actual = %c\n", + expected, actual); } - std::string str; - str.assign(values[i].data(), values[i].size()); - values[i].Reset(); - str[0] 
-          values[i].PinSelf(str);
+
+          values[i].remove_suffix(1);  // get rid of the differing character
+
           thread->stats.AddGets(1, 1);
         }
       }
@@ -242,7 +254,8 @@ class BatchedOpsStressTest : public StressTest {
       // Now that we retrieved all values, check that they all match
       for (size_t i = 1; i < num_prefixes; i++) {
         if (values[i] != values[0]) {
-          fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
+          fprintf(stderr,
+                  "multiget error: inconsistent values for key %s: %s, %s\n",
                   StringToHex(key_str[i]).c_str(),
                   StringToHex(values[0].ToString()).c_str(),
                   StringToHex(values[i].ToString()).c_str());
@@ -255,11 +268,11 @@ class BatchedOpsStressTest : public StressTest {
     return ret_status;
   }

-  // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
+  // Given a key, this does prefix scans for "0"+P, "1"+P, ..., "9"+P
   // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
   // of the key. Each of these 10 scans returns a series of values;
   // each series should be the same length, and it is verified for each
-  // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
+  // index i that all the i'th values are of the form V+"0", V+"1", ..., V+"9".
   // ASSUMES that MultiPut was used to put (K, V)
   Status TestPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
                         const std::vector<int>& rand_column_families,
@@ -317,21 +330,24 @@ class BatchedOpsStressTest : public StressTest {
                     iters[i]->key().starts_with(prefix_slices[i]));
         values[i] = iters[i]->value().ToString();

-        // make sure the first character of the value is the expected digit
-        const char expected_first = prefixes[i][0];
-        const char actual_first = values[i][0];
+        // make sure the last character of the value is the expected digit
+        assert(!prefixes[i].empty());
+        assert(!values[i].empty());
+
+        const char expected = prefixes[i].front();
+        const char actual = values[i].back();

-        if (actual_first != expected_first) {
-          fprintf(stderr, "error expected first = %c actual = %c\n",
-                  expected_first, actual_first);
+        if (expected != actual) {
+          fprintf(stderr, "prefix scan error expected = %c actual = %c\n",
+                  expected, actual);
         }

-        values[i][0] = ' ';  // blank out the differing character
+        values[i].pop_back();  // get rid of the differing character

         // make sure all values are equivalent
         if (values[i] != values[0]) {
           fprintf(stderr,
-                  "error : %" ROCKSDB_PRIszt
+                  "prefix scan error : %" ROCKSDB_PRIszt
                   ", inconsistent values for prefix %s: %s, %s\n",
                   i, prefix_slices[i].ToString(/* hex */ true).c_str(),
                   StringToHex(values[0]).c_str(),
@@ -341,16 +357,11 @@ class BatchedOpsStressTest : public StressTest {
         }

         // make sure value() and columns() are consistent
-        // note: in these tests, value base is stored after a single-digit
-        // prefix
-        Slice value_base_slice = iters[i]->value();
-        value_base_slice.remove_prefix(1);
-
         const WideColumns expected_columns = GenerateExpectedWideColumns(
-            GetValueBase(value_base_slice), iters[i]->value());
+            GetValueBase(iters[i]->value()), iters[i]->value());
         if (iters[i]->columns() != expected_columns) {
           fprintf(stderr,
-                  "error : %" ROCKSDB_PRIszt
+                  "prefix scan error : %" ROCKSDB_PRIszt
                   ", value and columns inconsistent for prefix %s: %s\n",
                   i, prefix_slices[i].ToString(/* hex */ true).c_str(),
                   DebugString(iters[i]->value(), iters[i]->columns(),
@@ -377,11 +388,6 @@ class BatchedOpsStressTest : public StressTest {
     return Status::OK();
   }

-  Slice GetValueBaseSlice(Slice slice) override {
-    slice.remove_prefix(1);
-    return slice;
-  }
-
   void VerifyDb(ThreadState* /* thread */) const override {}

   void ContinuouslyVerifyDb(ThreadState* /* thread */) const override {}
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 6fdc3eb3c76..397d22299e0 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -150,6 +150,7 @@ DECLARE_string(cache_type);
 DECLARE_uint64(subcompactions);
 DECLARE_uint64(periodic_compaction_seconds);
 DECLARE_uint64(compaction_ttl);
+DECLARE_bool(fifo_allow_compaction);
 DECLARE_bool(allow_concurrent_memtable_write);
 DECLARE_double(experimental_mempurge_threshold);
 DECLARE_bool(enable_write_thread_adaptive_yield);
@@ -508,8 +509,8 @@ extern inline std::string Key(int64_t val) {
   if (offset < weight) {
     // Use the bottom 3 bits of offset as the number of trailing 'x's in the
     // key. If the next key is going to be of the next level, then skip the
-    // trailer as it would break ordering. If the key length is already at max,
-    // skip the trailer.
+    // trailer as it would break ordering. If the key length is already at
+    // max, skip the trailer.
     if (offset < weight - 1 && level < levels - 1) {
       size_t trailer_len = offset & 0x7;
       key.append(trailer_len, 'x');
diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc
index ed1240e0078..2c8dcf61086 100644
--- a/db_stress_tool/db_stress_driver.cc
+++ b/db_stress_tool/db_stress_driver.cc
@@ -56,12 +56,11 @@ void ThreadBody(void* v) {
   }
 }

-bool RunStressTest(StressTest* stress) {
+bool RunStressTest(SharedState* shared) {
   SystemClock* clock = db_stress_env->GetSystemClock().get();
+  StressTest* stress = shared->GetStressTest();

-  SharedState shared(db_stress_env, stress);
-
-  if (shared.ShouldVerifyAtBeginning() && FLAGS_preserve_unverified_changes) {
+  if (shared->ShouldVerifyAtBeginning() && FLAGS_preserve_unverified_changes) {
     Status s = InitUnverifiedSubdir(FLAGS_db);
     if (s.ok() && !FLAGS_expected_values_dir.empty()) {
       s = InitUnverifiedSubdir(FLAGS_expected_values_dir);
@@ -73,8 +72,8 @@ bool RunStressTest(StressTest* stress) {
     }
   }

-  stress->InitDb(&shared);
-  stress->FinishInitDb(&shared);
+  stress->InitDb(shared);
+  stress->FinishInitDb(shared);

   if (FLAGS_sync_fault_injection) {
     fault_fs_guard->SetFilesystemDirectWritable(false);
@@ -88,28 +87,28 @@ bool RunStressTest(StressTest* stress) {
   fprintf(stdout, "%s Initializing worker threads\n",
           clock->TimeToString(now / 1000000).c_str());

-  shared.SetThreads(n);
+  shared->SetThreads(n);

   if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
-    shared.IncBgThreads();
+    shared->IncBgThreads();
   }

   if (FLAGS_continuous_verification_interval > 0) {
-    shared.IncBgThreads();
+    shared->IncBgThreads();
   }

   std::vector<ThreadState*> threads(n);
   for (uint32_t i = 0; i < n; i++) {
-    threads[i] = new ThreadState(i, &shared);
+    threads[i] = new ThreadState(i, shared);
     db_stress_env->StartThread(ThreadBody, threads[i]);
   }

-  ThreadState bg_thread(0, &shared);
+  ThreadState bg_thread(0, shared);
   if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
     db_stress_env->StartThread(PoolSizeChangeThread, &bg_thread);
   }

-  ThreadState continuous_verification_thread(0, &shared);
+  ThreadState continuous_verification_thread(0, shared);
   if (FLAGS_continuous_verification_interval > 0) {
     db_stress_env->StartThread(DbVerificationThread,
                                &continuous_verification_thread);
@@ -120,12 +119,12 @@ bool RunStressTest(StressTest* stress) {

   // wait for others to operate -> verify -> done
   {
-    MutexLock l(shared.GetMutex());
-    while (!shared.AllInitialized()) {
-
shared.GetCondVar()->Wait(); + MutexLock l(shared->GetMutex()); + while (!shared->AllInitialized()) { + shared->GetCondVar()->Wait(); } - if (shared.ShouldVerifyAtBeginning()) { - if (shared.HasVerificationFailedYet()) { + if (shared->ShouldVerifyAtBeginning()) { + if (shared->HasVerificationFailedYet()) { fprintf(stderr, "Crash-recovery verification failed :(\n"); } else { fprintf(stdout, "Crash-recovery verification passed :)\n"); @@ -144,17 +143,17 @@ bool RunStressTest(StressTest* stress) { // This is after the verification step to avoid making all those `Get()`s // and `MultiGet()`s contend on the DB-wide trace mutex. if (!FLAGS_expected_values_dir.empty()) { - stress->TrackExpectedState(&shared); + stress->TrackExpectedState(shared); } now = clock->NowMicros(); fprintf(stdout, "%s Starting database operations\n", clock->TimeToString(now / 1000000).c_str()); - shared.SetStart(); - shared.GetCondVar()->SignalAll(); - while (!shared.AllOperated()) { - shared.GetCondVar()->Wait(); + shared->SetStart(); + shared->GetCondVar()->SignalAll(); + while (!shared->AllOperated()) { + shared->GetCondVar()->Wait(); } now = clock->NowMicros(); @@ -169,10 +168,10 @@ bool RunStressTest(StressTest* stress) { clock->TimeToString((uint64_t)now / 1000000).c_str()); } - shared.SetStartVerify(); - shared.GetCondVar()->SignalAll(); - while (!shared.AllDone()) { - shared.GetCondVar()->Wait(); + shared->SetStartVerify(); + shared->GetCondVar()->SignalAll(); + while (!shared->AllDone()) { + shared->GetCondVar()->Wait(); } } @@ -187,7 +186,7 @@ bool RunStressTest(StressTest* stress) { } now = clock->NowMicros(); if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots && - !shared.HasVerificationFailedYet()) { + !shared->HasVerificationFailedYet()) { fprintf(stdout, "%s Verification successful\n", clock->TimeToString(now / 1000000).c_str()); } @@ -195,14 +194,14 @@ bool RunStressTest(StressTest* stress) { if (FLAGS_compaction_thread_pool_adjust_interval > 0 || FLAGS_continuous_verification_interval > 0) { - MutexLock l(shared.GetMutex()); - shared.SetShouldStopBgThread(); - while (!shared.BgThreadsFinished()) { - shared.GetCondVar()->Wait(); + MutexLock l(shared->GetMutex()); + shared->SetShouldStopBgThread(); + while (!shared->BgThreadsFinished()) { + shared->GetCondVar()->Wait(); } } - if (shared.HasVerificationFailedYet()) { + if (shared->HasVerificationFailedYet()) { fprintf(stderr, "Verification failed :(\n"); return false; } diff --git a/db_stress_tool/db_stress_driver.h b/db_stress_tool/db_stress_driver.h index ff701fcb298..a173470ff7d 100644 --- a/db_stress_tool/db_stress_driver.h +++ b/db_stress_tool/db_stress_driver.h @@ -7,11 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include "db_stress_tool/db_stress_shared_state.h" #ifdef GFLAGS #pragma once #include "db_stress_tool/db_stress_test_base.h" namespace ROCKSDB_NAMESPACE { extern void ThreadBody(void* /*thread_state*/); -extern bool RunStressTest(StressTest*); +extern bool RunStressTest(SharedState*); } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index 21f6db2ab3c..af60df9bc20 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -12,13 +12,15 @@ #include "db_stress_tool/db_stress_common.h" namespace ROCKSDB_NAMESPACE { -class DbStressEnvWrapper : public EnvWrapper { +class DbStressFSWrapper : public FileSystemWrapper { public: - explicit DbStressEnvWrapper(Env* t) : EnvWrapper(t) {} - static const char* kClassName() { return "DbStressEnv"; } + explicit DbStressFSWrapper(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + static const char* kClassName() { return "DbStressFS"; } const char* Name() const override { return kClassName(); } - Status DeleteFile(const std::string& f) override { + IOStatus DeleteFile(const std::string& f, const IOOptions& opts, + IODebugContext* dbg) override { // We determine whether it is a manifest file by searching a strong, // so that there will be false positive if the directory path contains the // keyword but it is unlikely. @@ -28,11 +30,11 @@ class DbStressEnvWrapper : public EnvWrapper { f.find("checkpoint") != std::string::npos || f.find(".backup") != std::string::npos || f.find(".restore") != std::string::npos) { - return target()->DeleteFile(f); + return target()->DeleteFile(f, opts, dbg); } // Rename the file instead of deletion to keep the history, and // at the same time it is not visible to RocksDB. - return target()->RenameFile(f, f + "_renamed_"); + return target()->RenameFile(f, f + "_renamed_", opts, dbg); } // If true, all manifest files will not be delted in DeleteFile(). diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 7adc6650993..ef542db109e 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -376,6 +376,10 @@ DEFINE_uint64(periodic_compaction_seconds, 1000, DEFINE_uint64(compaction_ttl, 1000, "Files older than TTL will be compacted to the next level."); +DEFINE_bool(fifo_allow_compaction, false, + "If true, set `Options::compaction_options_fifo.allow_compaction = " + "true`. It only take effect when FIFO compaction is used."); + DEFINE_bool(allow_concurrent_memtable_write, false, "Allow multi-writers to update mem tables in parallel."); diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index c53a0742bff..5565c62211d 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -333,9 +333,7 @@ class SharedState { uint64_t GetStartTimestamp() const { return start_timestamp_; } private: - static void IgnoreReadErrorCallback(void*) { - ignore_read_error = true; - } + static void IgnoreReadErrorCallback(void*) { ignore_read_error = true; } // Pick random keys in each column family that will not experience overwrite. 
   std::unordered_set<int64_t> GenerateNoOverwriteIds() const {
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index ad24855e1ab..e939954671f 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -12,7 +12,6 @@
 #include "util/compression.h"

 #ifdef GFLAGS
-#include "cache/fast_lru_cache.h"
 #include "db_stress_tool/db_stress_common.h"
 #include "db_stress_tool/db_stress_compaction_filter.h"
 #include "db_stress_tool/db_stress_driver.h"
@@ -124,10 +123,6 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
                        FLAGS_block_size /*estimated_entry_charge*/,
                        num_shard_bits)
         .MakeSharedCache();
-  } else if (FLAGS_cache_type == "fast_lru_cache") {
-    return NewFastLRUCache(static_cast<size_t>(capacity), FLAGS_block_size,
-                           num_shard_bits, false /*strict_capacity_limit*/,
-                           kDefaultCacheMetadataChargePolicy);
   } else if (FLAGS_cache_type == "lru_cache") {
     LRUCacheOptions opts;
     opts.capacity = capacity;
@@ -522,9 +517,18 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,

         shared->Put(cf_idx, k, value_base, true /* pending */);

+        std::string ts;
+        if (FLAGS_user_timestamp_size > 0) {
+          ts = GetNowNanos();
+        }
+
         if (FLAGS_use_merge) {
           if (!FLAGS_use_txn) {
-            s = db_->Merge(write_opts, cfh, key, v);
+            if (FLAGS_user_timestamp_size > 0) {
+              s = db_->Merge(write_opts, cfh, key, ts, v);
+            } else {
+              s = db_->Merge(write_opts, cfh, key, v);
+            }
           } else {
 #ifndef ROCKSDB_LITE
             Transaction* txn;
@@ -543,7 +547,6 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
         } else {
           if (!FLAGS_use_txn) {
             if (FLAGS_user_timestamp_size > 0) {
-              const std::string ts = GetNowNanos();
               s = db_->Put(write_opts, cfh, key, ts, v);
             } else {
               s = db_->Put(write_opts, cfh, key, v);
@@ -1043,9 +1046,11 @@ void StressTest::OperateDb(ThreadState* thread) {
           TestIterateAgainstExpected(thread, read_opts, rand_column_families,
                                      rand_keys);
         } else {
-          int num_seeks = static_cast<int>(
-              std::min(static_cast<uint64_t>(thread->rand.Uniform(4)),
-                       FLAGS_ops_per_thread - i - 1));
+          int num_seeks = static_cast<int>(std::min(
+              std::max(static_cast<uint64_t>(thread->rand.Uniform(4)),
+                       static_cast<uint64_t>(1)),
+              std::max(static_cast<uint64_t>(FLAGS_ops_per_thread - i - 1),
+                       static_cast<uint64_t>(1))));
           rand_keys = GenerateNKeys(thread, num_seeks, i);
           i += num_seeks - 1;
           TestIterate(thread, read_opts, rand_column_families, rand_keys);
@@ -1487,10 +1492,8 @@ void StressTest::VerifyIterator(ThreadState* thread,
   }

   if (!*diverged && iter->Valid()) {
-    const Slice value_base_slice = GetValueBaseSlice(iter->value());
-
-    const WideColumns expected_columns = GenerateExpectedWideColumns(
-        GetValueBase(value_base_slice), iter->value());
+    const WideColumns expected_columns =
+        GenerateExpectedWideColumns(GetValueBase(iter->value()), iter->value());
     if (iter->columns() != expected_columns) {
       fprintf(stderr, "Value and columns inconsistent for iterator: %s\n",
               DebugString(iter->value(), iter->columns(), expected_columns)
@@ -2967,7 +2970,8 @@ void StressTest::MaybeUseOlderTimestampForRangeScan(ThreadState* thread,
   ts_slice = ts_str;
   read_opts.timestamp = &ts_slice;

-  if (!thread->rand.OneInOpt(3)) {
+  // TODO (yanqin): support Merge with iter_start_ts
+  if (!thread->rand.OneInOpt(3) || FLAGS_use_merge || FLAGS_use_full_merge_v1) {
     return;
   }

@@ -2988,10 +2992,6 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) {
             static_cast<size_t>(cmp->timestamp_size()));
     exit(1);
   }
-  if (FLAGS_use_merge || FLAGS_use_full_merge_v1) {
-    fprintf(stderr, "Merge does not support timestamp yet.\n");
-    exit(1);
-  }
(FLAGS_use_txn) { fprintf(stderr, "TransactionDB does not support timestamp yet.\n"); exit(1); @@ -3027,7 +3027,7 @@ bool InitializeOptionsFromFile(Options& options) { FLAGS_options_file.c_str(), s.ToString().c_str()); exit(1); } - db_options.env = new DbStressEnvWrapper(db_stress_env); + db_options.env = new CompositeEnvWrapper(db_stress_env); options = Options(db_options, cf_descriptors[0].options); return true; } @@ -3123,6 +3123,11 @@ void InitializeOptionsFromFlags( options.max_background_flushes = FLAGS_max_background_flushes; options.compaction_style = static_cast(FLAGS_compaction_style); + if (options.compaction_style == + ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO) { + options.compaction_options_fifo.allow_compaction = + FLAGS_fifo_allow_compaction; + } options.compaction_pri = static_cast(FLAGS_compaction_pri); options.num_levels = FLAGS_num_levels; diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index 4ec08dee899..81fbbe24b15 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -139,8 +139,6 @@ class StressTest { return column_families_[column_family_id]; } - virtual Slice GetValueBaseSlice(Slice slice) { return slice; } - #ifndef ROCKSDB_LITE // Generated a list of keys that close to boundaries of SST keys. // If there isn't any SST file in the DB, return empty list. diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 6c5e952db63..fd28856b731 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -20,6 +20,7 @@ // NOTE that if FLAGS_test_batches_snapshots is set, the test will have // different behavior. See comment of the flag for details. +#include "db_stress_tool/db_stress_shared_state.h" #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_driver.h" @@ -29,8 +30,8 @@ namespace ROCKSDB_NAMESPACE { namespace { static std::shared_ptr env_guard; -static std::shared_ptr env_wrapper_guard; -static std::shared_ptr +static std::shared_ptr env_wrapper_guard; +static std::shared_ptr dbsl_env_wrapper_guard; static std::shared_ptr fault_env_guard; } // namespace @@ -77,7 +78,7 @@ int db_stress_tool(int argc, char** argv) { s.ToString().c_str()); exit(1); } - dbsl_env_wrapper_guard = std::make_shared(raw_env); + dbsl_env_wrapper_guard = std::make_shared(raw_env); db_stress_listener_env = dbsl_env_wrapper_guard.get(); if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || @@ -96,17 +97,16 @@ int db_stress_tool(int argc, char** argv) { raw_env = fault_env_guard.get(); } - env_wrapper_guard = std::make_shared(raw_env); - db_stress_env = env_wrapper_guard.get(); - - if (FLAGS_write_fault_one_in) { - // In the write injection case, we need to use the FS interface and returns - // the IOStatus with different error and flags. Therefore, - // DbStressEnvWrapper cannot be used which will swallow the FS - // implementations. We should directly use the raw_env which is the - // CompositeEnvWrapper of env and fault_fs. - db_stress_env = raw_env; + env_wrapper_guard = std::make_shared( + raw_env, std::make_shared(raw_env->GetFileSystem())); + if (!env_opts) { + // If using the default Env (Posix), wrap DbStressEnvWrapper with the + // legacy EnvWrapper. This is a temporary fix for the ReadAsync interface + // not being properly supported with Posix and db_stress. The EnvWrapper + // has a default implementation of ReadAsync that redirects to Read. 
+ env_wrapper_guard = std::make_shared(env_wrapper_guard); } + db_stress_env = env_wrapper_guard.get(); FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); @@ -341,7 +341,7 @@ int db_stress_tool(int argc, char** argv) { key_gen_ctx.weights.emplace_back(key_gen_ctx.window - keys_per_level * (levels - 1)); } - + std::unique_ptr shared; std::unique_ptr stress; if (FLAGS_test_cf_consistency) { stress.reset(CreateCfConsistencyStressTest()); @@ -354,7 +354,8 @@ int db_stress_tool(int argc, char** argv) { } // Initialize the Zipfian pre-calculated array InitializeHotKeyGenerator(FLAGS_hot_key_alpha); - if (RunStressTest(stress.get())) { + shared.reset(new SharedState(db_stress_env, stress.get())); + if (RunStressTest(shared.get())) { return 0; } else { return 1; diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 031134a0cfa..01f5d67636e 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -60,11 +60,9 @@ class NonBatchedOpsStressTest : public StressTest { constexpr int num_methods = static_cast(VerificationMethod::kNumberOfMethods); - // Note: Merge/GetMergeOperands is currently not supported for wide-column - // entities const VerificationMethod method = static_cast(thread->rand.Uniform( - FLAGS_use_put_entity_one_in > 0 ? num_methods - 1 : num_methods)); + (FLAGS_user_timestamp_size > 0) ? num_methods - 1 : num_methods)); if (method == VerificationMethod::kIterator) { std::unique_ptr iter( @@ -702,6 +700,11 @@ class NonBatchedOpsStressTest : public StressTest { uint64_t count = 0; Status s; + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } + for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix); iter->Next()) { ++count; @@ -735,13 +738,20 @@ class NonBatchedOpsStressTest : public StressTest { s = iter->status(); } - if (!s.ok()) { + uint64_t error_count = 0; + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + } + if (!s.ok() && (!fault_fs_guard || (fault_fs_guard && !error_count))) { fprintf(stderr, "TestPrefixScan error: %s\n", s.ToString().c_str()); thread->stats.AddErrors(1); return s; } + if (fault_fs_guard) { + fault_fs_guard->DisableErrorInjection(); + } thread->stats.AddPrefixes(1, count); return Status::OK(); @@ -808,7 +818,11 @@ class NonBatchedOpsStressTest : public StressTest { if (FLAGS_use_merge) { if (!FLAGS_use_txn) { - s = db_->Merge(write_opts, cfh, k, v); + if (FLAGS_user_timestamp_size == 0) { + s = db_->Merge(write_opts, cfh, k, v); + } else { + s = db_->Merge(write_opts, cfh, k, write_ts, v); + } } else { #ifndef ROCKSDB_LITE Transaction* txn; diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 68cd9496334..a30377aeb40 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -232,7 +232,7 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.15.0) multipart-post (2.1.1) - nokogiri (1.13.9) + nokogiri (1.13.10) mini_portile2 (~> 2.8.0) racc (~> 1.4) octokit (4.22.0) @@ -241,7 +241,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (4.0.6) - racc (1.6.0) + racc (1.6.1) rb-fsevent (0.11.1) rb-inotify (0.10.1) ffi (~> 1.0) diff --git a/docs/_data/authors.yml b/docs/_data/authors.yml index cad4a7c3546..210987c0b5f 100644 --- a/docs/_data/authors.yml +++ b/docs/_data/authors.yml @@ -75,3 +75,7 @@ ltamasi: cbi42: full_name: Changyu Bi fbid: 100078474793041 + +zjay: + full_name: Jay Zhuang + fbid: 100032386042884 diff --git 
a/docs/_posts/2022-10-31-align-compaction-output-file.markdown b/docs/_posts/2022-10-31-align-compaction-output-file.markdown
new file mode 100644
index 00000000000..a2db41bc35c
--- /dev/null
+++ b/docs/_posts/2022-10-31-align-compaction-output-file.markdown
@@ -0,0 +1,107 @@
+---
+title: Reduce Write Amplification by Aligning Compaction Output File Boundaries
+layout: post
+author:
+- zjay
+category: blog
+---
+## TL;DR
+By cutting compaction output files earlier, and allowing them to grow beyond the targeted file size in order to align with the next level's files, RocksDB can **reduce WA (Write Amplification) by more than 10%**. The feature is **enabled by default** once the user upgrades RocksDB to version `7.8.0+`.
+
+## Background
+RocksDB level compaction picks one file from the source level and compacts it into the next level, which is a typical partial merge compaction algorithm. Compared to a full merge compaction strategy, for example [universal compaction](https://github.com/facebook/rocksdb/wiki/Universal-Compaction), it has the benefits of smaller compaction size, better parallelism, etc. But it also has larger write amplification (typically 20-30 times the user data). One of the problems is wasted compaction at the beginning and end of each compaction:
+
+![](/static/images/align-compaction-output/file_cut_normal.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+In the diagram above, `SST11` is selected for compaction; it overlaps with `SST20` to `SST23`, so all these files are selected. But the beginning and end portions of the SSTs on Level 2 are wasted work, which also means they will be compacted again when `SST10` is compacted down. If the file boundaries were aligned, the wasted compaction size could be reduced. On average, the wasted compaction is `1` file size: `0.5` at the beginning and `0.5` at the end. Typically the average compaction fan-out is about 6 (with the default max_bytes_for_level_multiplier = 10), so `1 / (6 + 1) ~= 14%` of the compaction work is wasted.
+## Implementation
+To reduce such wasted compaction, RocksDB now tries to align each compaction output file with the next level's files, so future compactions waste less work. For example, the above case might be cut like this:
+
+![](/static/images/align-compaction-output/file_cut_align.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+The trade-off is that a file is no longer cut exactly when it exceeds target_file_size_base; instead, it is more likely to be cut when it aligns with a next-level file boundary, so file sizes vary more. A file could be as small as 50% of `target_file_size` or as large as `2x target_file_size`. This only impacts non-bottommost-level files, which should be only `~11%` of the data.
+Internally, RocksDB tries to cut each file so that its size is close to the `target_file_size` setting but also aligned with the next-level boundaries. When the compaction output file hits a next-level file boundary, either a beginning or an ending boundary, it cuts the file if:
+```
+current_size > ((5 * min(boundaries_num, 8) + 50) / 100) * target_file_size
+```
+([details](https://github.com/facebook/rocksdb/blob/23fa5b7789d6acd0c211d6bdd41448bbf1513bb6/db/compaction/compaction_outputs.cc#L270-L290))
+
+The file size is also capped at `2x target_file_size`: [details](https://github.com/facebook/rocksdb/blob/f726d29a8268ae4e2ffeec09172383cff2ab4db9/db/compaction/compaction.cc#L273-L277).
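+
+To make the heuristic concrete, here is a rough sketch of the cut decision (the names are illustrative, not the actual RocksDB internals):
+```c++
+#include <algorithm>
+#include <cstdint>
+
+// Sketch of the dynamic file-cutting heuristic described above.
+// `boundaries_num` counts the next-level file boundaries the current
+// output file has crossed so far.
+bool ShouldCutOutput(uint64_t current_size, uint64_t target_file_size,
+                     uint64_t boundaries_num, bool at_next_level_boundary) {
+  if (current_size >= 2 * target_file_size) {
+    return true;  // hard cap: never grow beyond 2x the target file size
+  }
+  if (!at_next_level_boundary) {
+    return false;  // only cut early when aligned with a next-level boundary
+  }
+  // The threshold grows from 50% toward 90% of the target file size as
+  // more boundaries are crossed: (5 * min(boundaries_num, 8) + 50) / 100.
+  uint64_t pct = 5 * std::min<uint64_t>(boundaries_num, 8) + 50;
+  return current_size > pct * target_file_size / 100;
+}
+```
+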
+Another benefit of cutting files earlier is more trivial move compaction, which moves a file from a higher level to a lower level without compacting anything. Based on a compaction simulator test, the amount of trivially moved data increases by 30% (but trivial moves still account for less than 1% of the compacted data):
+
+![](/static/images/align-compaction-output/file_cut_trival_move.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Based on a db_bench test, it can save `~12%` of the compaction load. Here are the test command and results:
+```
+TEST_TMPDIR=/data/dbbench ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
+
+# baseline:
+Flush(GB): cumulative 25.882, interval 7.216
+Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
+
+# with this change:
+Flush(GB): cumulative 25.882, interval 7.753
+Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
+```
+
+The feature is enabled by default when upgrading to RocksDB 7.8 or a later version, as it should have limited impact on file sizes while greatly improving write amplification. In the rare case where it needs to be opted out of, set
+```
+options.level_compaction_dynamic_file_size = false;
+```
+
+## Other Options and Benchmark
+We also tested a few other options: first fixed thresholds of 75% and 50% of the target_file_size, then the dynamic threshold explained above but with the file size still capped at the target_file_size:
+1. Baseline (main branch before [PR#10655](https://github.com/facebook/rocksdb/pull/10655));
+2. Fixed Threshold `75%`: after 75% of the target file size, cut the file whenever it aligns with a lower-level file boundary;
+3. Fixed Threshold `50%`: reduce the threshold to 50% of the target file size;
+4. Dynamic Threshold: `(5 * boundaries_num + 50)` percent of the target file size, capped at 90%;
+5. Dynamic Threshold + allow up to 2x the target file size (the chosen option).
+
+### Test Environment and Data
+To speed up the benchmark, we introduced a compaction simulator within RocksDB ([details](https://github.com/jay-zhuang/rocksdb/tree/compaction_sim)), which replaces the physical SSTs with in-memory data (a large bitset) and can test compaction more consistently. As it's a simulator, it has its limitations:
+1. it assumes each key-value pair has the same size;
+2. no deletions (though overwrites are supported);
+3. it doesn't consider data compression;
+4. it is single-threaded and finishes all compactions before the next flush (so no write stalls).
+
+We use 3 kinds of datasets for the tests:
+1. Random data with overwrites, evenly distributed;
+2. Zipf distribution with alpha = 1.01, moderately skewed;
+3. Zipf distribution with alpha = 1.2, highly skewed.
+
+#### Write Amplification
+
+![](/static/images/align-compaction-output/write_amp_compare.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 100%"}
+
+As we can see, all options are better than the baseline. Option 5 (brown) and option 3 (green) have similar WA improvements. (The sudden WA drop at about 40G in the Random dataset is because `level_compaction_dynamic_level_bytes` is enabled and the number of levels increases from 3 to 4; the results are similar without `level_compaction_dynamic_level_bytes`.)
+
+#### File Size Distribution at the End of Test
+This is the file size distribution at the end of the test, which loads about 100G of data.
+As this change only impacts non-bottommost file sizes, and the majority of SST files are bottommost, there are no significant differences:
+
+![](/static/images/align-compaction-output/file_size_compare.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 100%"}
+
+#### All Compaction Generated File Sizes
+High-level files are much more likely to be compacted, so the sizes of all compaction-generated files show a more significant change:
+
+![](/static/images/align-compaction-output/compaction_output_file_size_compare.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 100%"}
+
+Overall, option 5 keeps most file sizes close to the target file size, whereas option 3 produces much smaller files. Here are more detailed stats for the compaction output file sizes:
+```
+ base 50p 75p dynamic 2xdynamic
+count 1.656000e+03 1.960000e+03 1.770000e+03 1.687000e+03 1.705000e+03
+mean 3.116062e+07 2.634125e+07 2.917876e+07 3.060135e+07 3.028076e+07
+std 7.145242e+06 1.065134e+07 8.800474e+06 7.612939e+06 8.046139e+06
+```
+
+## Summary
+Allowing more dynamic file sizes and aligning compaction output files to the next level's file boundaries improves RocksDB write amplification by more than 10%. The feature is enabled by default starting with the `7.8.0` release. We picked a simple algorithm to decide when to cut the output file, which can be further improved, for example by estimating the output file size with index information. Any suggestions or PRs are welcome.
+
+## Acknowledgements
+We thank Siying Dong for initiating the file-cutting idea, Andrew Kryczka and Mark Callaghan for contributing to the ideas, and Changyu Bi for the detailed code review.
diff --git a/docs/_posts/2022-11-09-time-aware-tiered-storage.markdown b/docs/_posts/2022-11-09-time-aware-tiered-storage.markdown
new file mode 100644
index 00000000000..03a6b02ef97
--- /dev/null
+++ b/docs/_posts/2022-11-09-time-aware-tiered-storage.markdown
@@ -0,0 +1,121 @@
+---
+title: Time-Aware Tiered Storage in RocksDB
+layout: post
+author:
+- zjay
+category: blog
+---
+## TL;DR
+Tiered storage is now natively supported in RocksDB with the option [`last_level_temperature`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L910), and the time-aware tiered storage feature guarantees that recently written data is kept in hot-tier storage, controlled by the option [`preclude_last_level_data_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L927).
+
+## Background
+RocksDB tiered storage assigns a data temperature when creating each new SST, which [hints the file system](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/file_system.h#L162) to put the data on the corresponding storage media, so the data in a single DB instance can be placed on different storage media. Before this feature, the user typically created multiple DB instances for different storage media; for example, one DB instance stores the recent hot data, and the data is migrated to another, cold DB instance once it becomes cold. Tracking and migrating the data could be challenging. With the RocksDB tiered storage feature, RocksDB compaction migrates the data from hot storage to cold storage.
+
+![](/static/images/time-aware-tiered-storage/tiered_storage_overview.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Currently, RocksDB supports assigning a temperature to the last-level files. In an LSM tree, the last-level data is typically the coldest, since the most recent data is in the higher levels and is gradually compacted down to the lower levels. The higher-level data is more likely to be read, because:
+1. a RocksDB read always queries from the higher levels down to the lower levels until it finds the data;
+2. the high-level data is much more likely to be read and written by compactions.
+
+### Problem
+Generally in an LSM tree, hotter data is likely in the higher levels as mentioned before, **but it is not always the case**. For example, with a skewed dataset, recent data could be compacted to the last level first. With universal compaction, a major compaction compacts all data to the last level (the cold tier), including recent data that should be cataloged as hot. In production, **we found the majority of the compaction load is actually major compaction (more than 80%)**.
+
+![](/static/images/time-aware-tiered-storage/tiered_storage_problem.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Goal and Non-goals
+It’s hard to predict which data is hot and which is cold. Ideally, the most frequently accessed data would be cataloged as hot, but it is hard to predict which keys are going to be accessed most, and it is also hard to track per-key access history. The time-aware tiered storage feature **focuses on the use cases where more recent data is more likely to be accessed**, which covers the majority of cases, but not all.
+
+## User APIs
+Here are the 3 main tiered storage options:
+```c++
+Temperature last_level_temperature = Temperature::kUnknown;
+uint64_t preclude_last_level_data_seconds = 0;
+uint64_t preserve_internal_time_seconds = 0;
+```
+[`last_level_temperature`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L910) defines the data temperature for the last-level SST files, which is typically kCold or kWarm. RocksDB doesn’t check the option value; it just passes it to the file system API with [`FileOptions.temperature`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/file_system.h#L162) when creating last-level SST files. For all the other files, non-last-level SST files and non-SST files like manifest files, the temperature is set to kUnknown, which typically maps to hot data.
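+
+As a minimal sketch of enabling the feature (the 3-day value is illustrative):
+```c++
+#include "rocksdb/options.h"
+
+// Sketch: enable time-aware tiered storage (values are illustrative).
+rocksdb::Options MakeTieredOptions() {
+  rocksdb::Options options;
+  // Create last-level SST files with the kCold temperature hint, so the
+  // file system can place them on cold storage media.
+  options.last_level_temperature = rocksdb::Temperature::kCold;
+  // Keep data written within the last 3 days out of the last (cold) level.
+  options.preclude_last_level_data_seconds = 3 * 24 * 60 * 60;
+  return options;
+}
+```
+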
+The user can also get each SST’s temperature information through APIs:
+```c++
+db.GetLiveFilesStorageInfo();
+db.GetLiveFilesMetaData();
+db.GetColumnFamilyMetaData();
+```
+
+### User Metrics
+Here are the tiered storage related statistics:
+```c++
+HOT_FILE_READ_BYTES,
+WARM_FILE_READ_BYTES,
+COLD_FILE_READ_BYTES,
+HOT_FILE_READ_COUNT,
+WARM_FILE_READ_COUNT,
+COLD_FILE_READ_COUNT,
+// Last level and non-last level statistics
+LAST_LEVEL_READ_BYTES,
+LAST_LEVEL_READ_COUNT,
+NON_LAST_LEVEL_READ_BYTES,
+NON_LAST_LEVEL_READ_COUNT,
+```
+
+And more details from `IOStats`:
+```c++
+struct FileIOByTemperature {
+// the number of bytes read to Temperature::kHot file
+uint64_t hot_file_bytes_read;
+// the number of bytes read to Temperature::kWarm file
+uint64_t warm_file_bytes_read;
+// the number of bytes read to Temperature::kCold file
+uint64_t cold_file_bytes_read;
+// total number of reads to Temperature::kHot file
+uint64_t hot_file_read_count;
+// total number of reads to Temperature::kWarm file
+uint64_t warm_file_read_count;
+// total number of reads to Temperature::kCold file
+uint64_t cold_file_read_count;
+```
+
+## Implementation
+There are 2 main components in this feature. One is **time tracking**, and the other is **per-key placement compaction**. These 2 components are relatively independent and are linked together during the compaction initialization phase, which gets the sequence number for splitting the hot and cold data. The time-tracking component can even be enabled independently by setting the option [`preserve_internal_time_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L950). The purpose of that is to start tracking before migrating existing use cases to the tiered storage feature, to avoid compacting the existing hot data to the cold tier (detailed in the Migration section below).
+
+Unlike the user-defined timestamp feature, the time-tracking feature doesn’t have accurate time information for each key. It only samples the time information and gives a rough estimate of each key’s write time. Here is the high-level graph of the implementation:
+
+![](/static/images/time-aware-tiered-storage/tiered_storage_design.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Time Tracking
+Time-tracking information is recorded by a [periodic task](https://github.com/facebook/rocksdb/blob/d9e71fb2c53726d9c5ed73b4ec962a7ed6ef15ec/db/periodic_task_scheduler.cc#L36) which gets the latest sequence number and the current time, then stores them in an in-memory data structure. The interval of the periodic task is the user setting [`preserve_internal_time_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L950) divided by 100. For example, if 3 days of data should be precluded from the last level, then the interval of the periodic task is about 0.7 hours (3 * 24 / 100 ~= 0.72), which also means only the latest 100 seq->time pairs are needed in memory.
+
+Currently, the in-memory seq_time_mapping is only used during Flush() and is encoded into the SST properties. The data is delta-encoded and, again, at most 100 pairs are stored, so the extra data size is pretty small (far less than 1KB per SST), and only non-last-level SSTs need to carry that information. Internally, RocksDB also uses the minimum sequence number and the SST creation time from the SST metadata to improve the time accuracy.
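+
+As a rough sketch of this bookkeeping (illustrative names, not the actual RocksDB data structure):
+```c++
+#include <cstdint>
+#include <deque>
+#include <utility>
+
+// Sketch of the seq->time sampling described above. Only the latest
+// 100 (seqno, time) pairs are kept in memory.
+class SeqnoTimeSampler {
+ public:
+  explicit SeqnoTimeSampler(uint64_t preserve_internal_time_seconds)
+      : interval_seconds_(preserve_internal_time_seconds / 100) {}
+
+  // How often the periodic task should run.
+  uint64_t IntervalSeconds() const { return interval_seconds_; }
+
+  // Called by the periodic task every IntervalSeconds().
+  void Record(uint64_t latest_seqno, uint64_t now_seconds) {
+    pairs_.emplace_back(latest_seqno, now_seconds);
+    if (pairs_.size() > 100) {
+      pairs_.pop_front();  // keep the mapping bounded to ~100 entries
+    }
+  }
+
+ private:
+  uint64_t interval_seconds_;
+  std::deque<std::pair<uint64_t, uint64_t>> pairs_;  // (seqno, unix seconds)
+};
+```
+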
+**The sequence-number-to-time information is distributed across the SSTs**, ranging from the min seqno to the max seqno of each SST file, so each SST has self-contained time information. This also means the time information can be redundant; for example, if 2 SSTs have overlapping sequence number ranges (which is very likely for non-L0 files), the same seq->time pair may exist in both SSTs.
+In the future, the time information could also be useful for other potential features, like a better estimate of the oldest timestamp of an SST, which is critical for the RocksDB TTL feature.
+
+### Per-Key Placement Compaction
+
+![](/static/images/time-aware-tiered-storage/per_key_placement_compaction.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+Compared to a normal compaction, which only outputs data to a single level, per-key placement compaction can output data to 2 different levels. As per-key placement compaction only applies to last-level compaction, the 2 output levels are **always the penultimate level and the last level**. The compaction places each key in its corresponding tier by simply checking the key’s sequence number.
+
+At the beginning of the compaction, the compaction job collects the seq->time information from all input SSTs and merges it together, then uses the current time to derive the oldest sequence number that should be put into the non-last level (hot tier). During the last-level compaction, as long as a key is newer than that oldest sequence number, it is placed in the penultimate level (hot tier) instead of the last level (cold tier).
+
+Note that RocksDB also places keys that are within a user snapshot in the hot tier; there are a few reasons for that:
+1. It’s reasonable to assume snapshot-protected data is hot;
+2. It avoids mixing data whose sequence numbers are not yet zeroed out with old last-level data, which is desirable for reducing the oldest obsolete data time (defined as the time of the oldest SST that has a non-zero sequence number). It also means tombstones are always placed in the hot tier, which is also desirable, as they should be pretty small;
+3. The original motivation was to avoid moving data from a lower level to a higher level in case the user increases [`preclude_last_level_data_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L927), in which case the snapshot-protected data in the last level would become hot again and would need to move to a higher level. It’s not always safe to move data from a lower level to a higher level in an LSM tree, as it could cause key conflicts. Later we added a conflict check to allow the data to move up as long as there’s no key conflict, but then the movement is not guaranteed (see Migration for details).
+
+### Migration
+Once the user enables the feature, both time tracking and per-key placement compaction are enabled **at the same time**, but the existing data can still be mismarked as cold. To migrate to the feature smoothly, the user can enable the time-tracking feature first.
+For example, if the user plans to set [`preclude_last_level_data_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L927) to 3 days, the user can enable time tracking 3 days earlier with [`preserve_internal_time_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L950). Then, when the tiered storage feature is enabled, the time information for the last 3 days of hot data is already available, so per-key placement compaction won’t compact it to the last level.
+
+Note that just preserving the time information won’t prevent the data from being compacted to the last level (which at that point should still be on the hot tier). Once the [`preclude_last_level_data_seconds`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L927) and [`last_level_temperature`](https://github.com/facebook/rocksdb/blob/b0d9776b704af01c2b5385e9d53754e0c8176373/include/rocksdb/advanced_options.h#L910) features are enabled, some of the last-level data might need to move up. Currently, RocksDB just does a conflict check, so the hot/cold split in this case is not guaranteed.
+
+![](/static/images/time-aware-tiered-storage/compaction_moving_up_conflict.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Summary
+The time-aware tiered storage feature guarantees that new data is placed in the hot tier, which **is ideal for the tiering use cases where the most recent data is likely the hot data**. It does this by tracking write-time information and using per-key placement compaction to split the hot and cold data.
+
+The tiered storage feature is actively being developed; any suggestions or PRs are welcome.
+
+## Acknowledgements
+We thank Siying Dong and Andrew Kryczka for brainstorming and reviewing the feature design and implementation. It was my good fortune to work with the RocksDB team members!
\ No newline at end of file diff --git a/docs/static/images/align-compaction-output/compaction_output_file_size_compare.png b/docs/static/images/align-compaction-output/compaction_output_file_size_compare.png new file mode 100644 index 00000000000..2ce86fb289d Binary files /dev/null and b/docs/static/images/align-compaction-output/compaction_output_file_size_compare.png differ diff --git a/docs/static/images/align-compaction-output/file_cut_align.png b/docs/static/images/align-compaction-output/file_cut_align.png new file mode 100644 index 00000000000..bc3e8990ec2 Binary files /dev/null and b/docs/static/images/align-compaction-output/file_cut_align.png differ diff --git a/docs/static/images/align-compaction-output/file_cut_normal.png b/docs/static/images/align-compaction-output/file_cut_normal.png new file mode 100644 index 00000000000..e17133ed2a3 Binary files /dev/null and b/docs/static/images/align-compaction-output/file_cut_normal.png differ diff --git a/docs/static/images/align-compaction-output/file_cut_trival_move.png b/docs/static/images/align-compaction-output/file_cut_trival_move.png new file mode 100644 index 00000000000..7aca9aeb543 Binary files /dev/null and b/docs/static/images/align-compaction-output/file_cut_trival_move.png differ diff --git a/docs/static/images/align-compaction-output/file_size_compare.png b/docs/static/images/align-compaction-output/file_size_compare.png new file mode 100644 index 00000000000..5f39a806f51 Binary files /dev/null and b/docs/static/images/align-compaction-output/file_size_compare.png differ diff --git a/docs/static/images/align-compaction-output/write_amp_compare.png b/docs/static/images/align-compaction-output/write_amp_compare.png new file mode 100644 index 00000000000..8b20f2ae3d3 Binary files /dev/null and b/docs/static/images/align-compaction-output/write_amp_compare.png differ diff --git a/docs/static/images/time-aware-tiered-storage/compaction_moving_up_conflict.png b/docs/static/images/time-aware-tiered-storage/compaction_moving_up_conflict.png new file mode 100644 index 00000000000..8feaef203c4 Binary files /dev/null and b/docs/static/images/time-aware-tiered-storage/compaction_moving_up_conflict.png differ diff --git a/docs/static/images/time-aware-tiered-storage/per_key_placement_compaction.png b/docs/static/images/time-aware-tiered-storage/per_key_placement_compaction.png new file mode 100644 index 00000000000..0b232d1fe7f Binary files /dev/null and b/docs/static/images/time-aware-tiered-storage/per_key_placement_compaction.png differ diff --git a/docs/static/images/time-aware-tiered-storage/tiered_storage_design.png b/docs/static/images/time-aware-tiered-storage/tiered_storage_design.png new file mode 100644 index 00000000000..7e5158c18bf Binary files /dev/null and b/docs/static/images/time-aware-tiered-storage/tiered_storage_design.png differ diff --git a/docs/static/images/time-aware-tiered-storage/tiered_storage_overview.png b/docs/static/images/time-aware-tiered-storage/tiered_storage_overview.png new file mode 100644 index 00000000000..7d115e667d6 Binary files /dev/null and b/docs/static/images/time-aware-tiered-storage/tiered_storage_overview.png differ diff --git a/docs/static/images/time-aware-tiered-storage/tiered_storage_problem.png b/docs/static/images/time-aware-tiered-storage/tiered_storage_problem.png new file mode 100644 index 00000000000..dbe2ae53257 Binary files /dev/null and b/docs/static/images/time-aware-tiered-storage/tiered_storage_problem.png differ diff --git a/env/env.cc b/env/env.cc index 
2643b4a6514..fb2cb950d76 100644 --- a/env/env.cc +++ b/env/env.cc @@ -633,8 +633,7 @@ Env::Env(const std::shared_ptr& fs, const std::shared_ptr& clock) : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} -Env::~Env() { -} +Env::~Env() {} Status Env::NewLogger(const std::string& fname, std::shared_ptr* result) { @@ -841,14 +840,11 @@ std::string Env::GenerateUniqueId() { return result; } -SequentialFile::~SequentialFile() { -} +SequentialFile::~SequentialFile() {} -RandomAccessFile::~RandomAccessFile() { -} +RandomAccessFile::~RandomAccessFile() {} -WritableFile::~WritableFile() { -} +WritableFile::~WritableFile() {} MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {} @@ -865,16 +861,15 @@ Status Logger::Close() { Status Logger::CloseImpl() { return Status::NotSupported(); } -FileLock::~FileLock() { -} +FileLock::~FileLock() {} -void LogFlush(Logger *info_log) { +void LogFlush(Logger* info_log) { if (info_log) { info_log->Flush(); } } -static void Logv(Logger *info_log, const char* format, va_list ap) { +static void Logv(Logger* info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); } @@ -887,9 +882,10 @@ void Log(Logger* info_log, const char* format, ...) { va_end(ap); } -void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) { - static const char* kInfoLogLevelNames[5] = { "DEBUG", "INFO", "WARN", - "ERROR", "FATAL" }; +void Logger::Logv(const InfoLogLevel log_level, const char* format, + va_list ap) { + static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR", + "FATAL"}; if (log_level < log_level_) { return; } @@ -906,7 +902,7 @@ void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) } else { char new_format[500]; snprintf(new_format, sizeof(new_format) - 1, "[%s] %s", - kInfoLogLevelNames[log_level], format); + kInfoLogLevelNames[log_level], format); Logv(new_format, ap); } @@ -919,7 +915,8 @@ void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) } } -static void Logv(const InfoLogLevel log_level, Logger *info_log, const char *format, va_list ap) { +static void Logv(const InfoLogLevel log_level, Logger* info_log, + const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= log_level) { if (log_level == InfoLogLevel::HEADER_LEVEL) { info_log->LogHeader(format, ap); @@ -937,7 +934,7 @@ void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, va_end(ap); } -static void Headerv(Logger *info_log, const char *format, va_list ap) { +static void Headerv(Logger* info_log, const char* format, va_list ap) { if (info_log) { info_log->LogHeader(format, ap); } @@ -1106,7 +1103,7 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) { options.env->SanitizeEnvOptions(env_options); } -} +} // namespace EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const { diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 2f7da52df2a..0f18b321867 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -305,7 +305,7 @@ TEST_P(EnvBasicTestWithParam, LargeWrite) { read += result.size(); } ASSERT_TRUE(write_data == read_data); - delete [] scratch; + delete[] scratch; } TEST_P(EnvMoreTestWithParam, GetModTime) { diff --git a/env/env_chroot.cc b/env/env_chroot.cc index e7cd04031d8..a64373517f8 100644 --- a/env/env_chroot.cc +++ b/env/env_chroot.cc @@ 
-66,9 +66,9 @@ IOStatus ChrootFileSystem::GetTestDirectory(const IOOptions& options, return CreateDirIfMissing(*path, options, dbg); } - // Returns status and expanded absolute path including the chroot directory. - // Checks whether the provided path breaks out of the chroot. If it returns - // non-OK status, the returned path should not be used. +// Returns status and expanded absolute path including the chroot directory. +// Checks whether the provided path breaks out of the chroot. If it returns +// non-OK status, the returned path should not be used. std::pair ChrootFileSystem::EncodePath( const std::string& path) { if (path.empty() || path[0] != '/') { @@ -77,29 +77,29 @@ std::pair ChrootFileSystem::EncodePath( std::pair res; res.second = chroot_dir_ + path; #if defined(OS_AIX) - char resolvedName[PATH_MAX]; - char* normalized_path = realpath(res.second.c_str(), resolvedName); + char resolvedName[PATH_MAX]; + char* normalized_path = realpath(res.second.c_str(), resolvedName); #else - char* normalized_path = realpath(res.second.c_str(), nullptr); + char* normalized_path = realpath(res.second.c_str(), nullptr); #endif - if (normalized_path == nullptr) { - res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); - } else if (strlen(normalized_path) < chroot_dir_.size() || - strncmp(normalized_path, chroot_dir_.c_str(), - chroot_dir_.size()) != 0) { - res.first = IOStatus::IOError(res.second, - "Attempted to access path outside chroot"); - } else { - res.first = IOStatus::OK(); - } + if (normalized_path == nullptr) { + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); + } else if (strlen(normalized_path) < chroot_dir_.size() || + strncmp(normalized_path, chroot_dir_.c_str(), + chroot_dir_.size()) != 0) { + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); + } else { + res.first = IOStatus::OK(); + } #if !defined(OS_AIX) - free(normalized_path); + free(normalized_path); #endif - return res; + return res; } - // Similar to EncodePath() except assumes the basename in the path hasn't been - // created yet. +// Similar to EncodePath() except assumes the basename in the path hasn't been +// created yet. std::pair ChrootFileSystem::EncodePathWithNewBasename( const std::string& path) { if (path.empty() || path[0] != '/') { diff --git a/env/env_encryption.cc b/env/env_encryption.cc index 147bd8ea4de..c6b0a257dbf 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -33,14 +33,14 @@ std::shared_ptr EncryptionProvider::NewCTRProvider( return std::make_shared(cipher); } - // Read up to "n" bytes from the file. "scratch[0..n-1]" may be - // written by this routine. Sets "*result" to the data that was - // read (including if fewer than "n" bytes were successfully read). - // May set "*result" to point at data in "scratch[0..n-1]", so - // "scratch[0..n-1]" must be live when "*result" is used. - // If an error was encountered, returns a non-OK status. - // - // REQUIRES: External synchronization +// Read up to "n" bytes from the file. "scratch[0..n-1]" may be +// written by this routine. Sets "*result" to the data that was +// read (including if fewer than "n" bytes were successfully read). +// May set "*result" to point at data in "scratch[0..n-1]", so +// "scratch[0..n-1]" must be live when "*result" is used. +// If an error was encountered, returns a non-OK status. 
+// +// REQUIRES: External synchronization IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) { @@ -89,16 +89,16 @@ size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } - // Positioned Read for direct I/O - // If Direct I/O enabled, offset, n, and scratch should be properly aligned +// Positioned Read for direct I/O +// If Direct I/O enabled, offset, n, and scratch should be properly aligned IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -118,16 +118,16 @@ IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, return io_s; } - // Read up to "n" bytes from the file starting at "offset". - // "scratch[0..n-1]" may be written by this routine. Sets "*result" - // to the data that was read (including if fewer than "n" bytes were - // successfully read). May set "*result" to point at data in - // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when - // "*result" is used. If an error was encountered, returns a non-OK - // status. - // - // Safe for concurrent use by multiple threads. - // If Direct I/O enabled, offset, n, and scratch should be aligned properly. +// Read up to "n" bytes from the file starting at "offset". +// "scratch[0..n-1]" may be written by this routine. Sets "*result" +// to the data that was read (including if fewer than "n" bytes were +// successfully read). May set "*result" to point at data in +// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when +// "*result" is used. If an error was encountered, returns a non-OK +// status. +// +// Safe for concurrent use by multiple threads. +// If Direct I/O enabled, offset, n, and scratch should be aligned properly. IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -146,7 +146,7 @@ IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, return io_s; } - // Readahead the file starting from offset by n bytes for caching. +// Readahead the file starting from offset by n bytes for caching. IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) { @@ -154,21 +154,21 @@ IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, return file_->Prefetch(offset + prefixLength_, n, options, dbg); } - // Tries to get an unique ID for this file that will be the same each time - // the file is opened (and will stay the same while the file is open). - // Furthermore, it tries to make this ID at most "max_size" bytes. If such an - // ID can be created this function returns the length of the ID and places it - // in "id"; otherwise, this function returns 0, in which case "id" - // may not have been modified. 
- // - // This function guarantees, for IDs from a given environment, two unique ids - // cannot be made equal to each other by adding arbitrary bytes to one of - // them. That is, no unique ID is the prefix of another. - // - // This function guarantees that the returned ID will not be interpretable as - // a single varint. - // - // Note: these IDs are only valid for the duration of the process. +// Tries to get an unique ID for this file that will be the same each time +// the file is opened (and will stay the same while the file is open). +// Furthermore, it tries to make this ID at most "max_size" bytes. If such an +// ID can be created this function returns the length of the ID and places it +// in "id"; otherwise, this function returns 0, in which case "id" +// may not have been modified. +// +// This function guarantees, for IDs from a given environment, two unique ids +// cannot be made equal to each other by adding arbitrary bytes to one of +// them. That is, no unique ID is the prefix of another. +// +// This function guarantees that the returned ID will not be interpretable as +// a single varint. +// +// Note: these IDs are only valid for the duration of the process. size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { return file_->GetUniqueId(id, max_size); }; @@ -177,21 +177,21 @@ void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { file_->Hint(pattern); } - // Indicates the upper layers if the current RandomAccessFile implementation - // uses direct IO. +// Indicates the upper layers if the current RandomAccessFile implementation +// uses direct IO. bool EncryptedRandomAccessFile::use_direct_io() const { return file_->use_direct_io(); } - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); @@ -267,8 +267,8 @@ bool EncryptedWritableFile::IsSyncThreadSafe() const { return file_->IsSyncThreadSafe(); } - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } @@ -363,14 +363,14 @@ bool EncryptedRandomRWFile::use_direct_io() const { return file_->use_direct_io(); } - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } - // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. - // Pass aligned buffer when use_direct_io() returns true. 
+// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. +// Pass aligned buffer when use_direct_io() returns true. IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, const IOOptions& options, IODebugContext* dbg) { @@ -397,9 +397,9 @@ IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, return file_->Write(offset, dataToWrite, options, dbg); } - // Read up to `n` bytes starting from offset `offset` and store them in - // result, provided `scratch` size should be at least `n`. - // Returns Status::OK() on success. +// Read up to `n` bytes starting from offset `offset` and store them in +// result, provided `scratch` size should be at least `n`. +// Returns Status::OK() on success. IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { @@ -953,7 +953,8 @@ Env* NewEncryptedEnv(Env* base_env, // Encrypt one or more (partial) blocks of data at the file offset. // Length of data is given in dataSize. -Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t dataSize) { +Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, + size_t dataSize) { // Calculate block index auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; @@ -965,7 +966,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t // Encrypt individual blocks. while (1) { - char *block = data; + char* block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { // We're not encrypting a full block. @@ -998,7 +999,8 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t // Decrypt one or more (partial) blocks of data at the file offset. // Length of data is given in dataSize. -Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t dataSize) { +Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, + size_t dataSize) { // Calculate block index auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; @@ -1010,7 +1012,7 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // Decrypt individual blocks. while (1) { - char *block = data; + char* block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { // We're not decrypting a full block. @@ -1344,6 +1346,6 @@ Status EncryptionProvider::CreateFromString( result); } -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_encryption_ctr.h b/env/env_encryption_ctr.h index ce2d4b3e31e..cfb440c72ac 100644 --- a/env/env_encryption_ctr.h +++ b/env/env_encryption_ctr.h @@ -87,8 +87,8 @@ class CTREncryptionProvider : public EncryptionProvider { Status AddCipher(const std::string& descriptor, const char* /*cipher*/, size_t /*len*/, bool /*for_write*/) override; - protected: + protected: // PopulateSecretPrefixPart initializes the data into a new prefix block // that will be encrypted. This function will store the data in plain text. // It will be encrypted later (before written to disk). 
diff --git a/env/env_test.cc b/env/env_test.cc index 866f3eabe88..f4e9d50b239 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -91,7 +91,8 @@ extern "C" bool RocksDbIOUringEnable() { return true; } std::unique_ptr NewAligned(const size_t size, const char ch) { char* ptr = nullptr; #ifdef OS_WIN - if (nullptr == (ptr = reinterpret_cast(_aligned_malloc(size, kPageSize)))) { + if (nullptr == + (ptr = reinterpret_cast(_aligned_malloc(size, kPageSize)))) { return std::unique_ptr(nullptr, Deleter(_aligned_free)); } std::unique_ptr uptr(ptr, Deleter(_aligned_free)); @@ -183,8 +184,7 @@ TEST_F(EnvPosixTest, AreFilesSame) { std::string same_file_link_name = same_file_name + "_link"; std::unique_ptr same_file; - ASSERT_OK(env->NewWritableFile(same_file_name, - &same_file, soptions)); + ASSERT_OK(env->NewWritableFile(same_file_name, &same_file, soptions)); same_file->Append("random_data"); ASSERT_OK(same_file->Flush()); same_file.reset(); @@ -681,7 +681,7 @@ TEST_P(EnvPosixTestWithParam, TwoPools) { } TEST_P(EnvPosixTestWithParam, DecreaseNumBgThreads) { - constexpr int kWaitMicros = 60000000; // 1min + constexpr int kWaitMicros = 60000000; // 1min std::vector tasks(10); @@ -988,7 +988,6 @@ bool IsUniqueIDValid(const std::string& s) { const size_t MAX_ID_SIZE = 100; char temp_id[MAX_ID_SIZE]; - } // namespace // Determine whether we can use the FS_IOC_GETVERSION ioctl @@ -1030,12 +1029,12 @@ class IoctlFriendlyTmpdir { explicit IoctlFriendlyTmpdir() { char dir_buf[100]; - const char *fmt = "%s/rocksdb.XXXXXX"; - const char *tmp = getenv("TEST_IOCTL_FRIENDLY_TMPDIR"); + const char* fmt = "%s/rocksdb.XXXXXX"; + const char* tmp = getenv("TEST_IOCTL_FRIENDLY_TMPDIR"); #ifdef OS_WIN #define rmdir _rmdir - if(tmp == nullptr) { + if (tmp == nullptr) { tmp = getenv("TMP"); } @@ -1066,8 +1065,10 @@ class IoctlFriendlyTmpdir { // Diagnose ioctl-related failure only if this is the // directory specified via that envvar. if (tmp && tmp == d) { - fprintf(stderr, "TEST_IOCTL_FRIENDLY_TMPDIR-specified directory is " - "not suitable: %s\n", d.c_str()); + fprintf(stderr, + "TEST_IOCTL_FRIENDLY_TMPDIR-specified directory is " + "not suitable: %s\n", + d.c_str()); } rmdir(dir_buf); // ignore failure } @@ -1087,19 +1088,16 @@ class IoctlFriendlyTmpdir { return; } - fprintf(stderr, "failed to find an ioctl-friendly temporary directory;" + fprintf(stderr, + "failed to find an ioctl-friendly temporary directory;" " specify one via the TEST_IOCTL_FRIENDLY_TMPDIR envvar\n"); std::abort(); #endif } - ~IoctlFriendlyTmpdir() { - rmdir(dir_.c_str()); - } + ~IoctlFriendlyTmpdir() { rmdir(dir_.c_str()); } - const std::string& name() const { - return dir_; - } + const std::string& name() const { return dir_; } bool is_supported() const { return is_supported_; } @@ -1273,7 +1271,7 @@ TEST_P(EnvPosixTestWithParam, AllocateTest) { // Returns true if any of the strings in ss are the prefix of another string. bool HasPrefix(const std::unordered_set& ss) { - for (const std::string& s: ss) { + for (const std::string& s : ss) { if (s.empty()) { return true; } @@ -1506,19 +1504,23 @@ TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { for (int i = 0; i < num_reads; i++) { int rnd_off; // No repeat offsets. - while (start_offsets.find(rnd_off = rnd.Uniform(81920)) != start_offsets.end()) {} + while (start_offsets.find(rnd_off = rnd.Uniform(81920)) != + start_offsets.end()) { + } start_offsets.insert(rnd_off); } std::vector offsets; std::vector lens; // std::set already sorted the offsets. 
- for (int so: start_offsets) { + for (int so : start_offsets) { offsets.push_back(so); } for (size_t i = 0; i + 1 < offsets.size(); i++) { - lens.push_back(static_cast(rnd.Uniform(static_cast(offsets[i + 1] - offsets[i])) + 1)); + lens.push_back(static_cast( + rnd.Uniform(static_cast(offsets[i + 1] - offsets[i])) + 1)); } - lens.push_back(static_cast(rnd.Uniform(static_cast(kTotalSize - offsets.back())) + 1)); + lens.push_back(static_cast( + rnd.Uniform(static_cast(kTotalSize - offsets.back())) + 1)); ASSERT_EQ(num_reads, lens.size()); // Create requests @@ -1540,8 +1542,9 @@ TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { // Validate results for (int i = 0; i < num_reads; ++i) { ASSERT_OK(reqs[i].status); - ASSERT_EQ(Slice(expected_data.data() + offsets[i], lens[i]).ToString(true), - reqs[i].result.ToString(true)); + ASSERT_EQ( + Slice(expected_data.data() + offsets[i], lens[i]).ToString(true), + reqs[i].result.ToString(true)); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -1754,57 +1757,60 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Create file. { std::unique_ptr wfile; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) - if (soptions.use_direct_writes) { - soptions.use_direct_writes = false; - } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_writes) { + soptions.use_direct_writes = false; + } #endif - ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); - ASSERT_OK(wfile->Append(slice)); - ASSERT_OK(wfile->InvalidateCache(0, 0)); - ASSERT_OK(wfile->Close()); + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + ASSERT_OK(wfile->Append(slice)); + ASSERT_OK(wfile->InvalidateCache(0, 0)); + ASSERT_OK(wfile->Close()); } - // Random Read - { - std::unique_ptr file; - auto scratch = NewAligned(kSectorSize, 0); - Slice result; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) - if (soptions.use_direct_reads) { - soptions.use_direct_reads = false; - } -#endif - ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); - ASSERT_OK(file->Read(0, kSectorSize, &result, scratch.get())); - ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0); - ASSERT_OK(file->InvalidateCache(0, 11)); - ASSERT_OK(file->InvalidateCache(0, 0)); + // Random Read + { + std::unique_ptr file; + auto scratch = NewAligned(kSectorSize, 0); + Slice result; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; } +#endif + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + ASSERT_OK(file->Read(0, kSectorSize, &result, scratch.get())); + ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0); + ASSERT_OK(file->InvalidateCache(0, 11)); + ASSERT_OK(file->InvalidateCache(0, 0)); + } - // Sequential Read - { - std::unique_ptr file; - auto scratch = NewAligned(kSectorSize, 0); - Slice result; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) - if (soptions.use_direct_reads) { - soptions.use_direct_reads = false; - } + // Sequential Read + { + std::unique_ptr file; + auto scratch = NewAligned(kSectorSize, 0); + Slice result; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) + if (soptions.use_direct_reads) { + soptions.use_direct_reads = false; + } #endif - ASSERT_OK(env_->NewSequentialFile(fname, &file, 
soptions)); - if (file->use_direct_io()) { - ASSERT_OK(file->PositionedRead(0, kSectorSize, &result, scratch.get())); - } else { - ASSERT_OK(file->Read(kSectorSize, &result, scratch.get())); - } - ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0); - ASSERT_OK(file->InvalidateCache(0, 11)); - ASSERT_OK(file->InvalidateCache(0, 0)); + ASSERT_OK(env_->NewSequentialFile(fname, &file, soptions)); + if (file->use_direct_io()) { + ASSERT_OK(file->PositionedRead(0, kSectorSize, &result, scratch.get())); + } else { + ASSERT_OK(file->Read(kSectorSize, &result, scratch.get())); } - // Delete the file - ASSERT_OK(env_->DeleteFile(fname)); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + ASSERT_EQ(memcmp(scratch.get(), data.get(), kSectorSize), 0); + ASSERT_OK(file->InvalidateCache(0, 11)); + ASSERT_OK(file->InvalidateCache(0, 0)); + } + // Delete the file + ASSERT_OK(env_->DeleteFile(fname)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); } #endif // OS_LINUX || OS_WIN @@ -1931,52 +1937,53 @@ TEST_P(EnvPosixTestWithParam, Preallocation) { std::unique_ptr srcfile; EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) - if (soptions.use_direct_writes) { - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "NewWritableFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - } +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) + if (soptions.use_direct_writes) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + } #endif - ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); - srcfile->SetPreallocationBlockSize(1024 * 1024); - - // No writes should mean no preallocation - size_t block_size, last_allocated_block; - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 0UL); - - // Small write should preallocate one block - size_t kStrSize = 4096; - auto data = NewAligned(kStrSize, 'A'); - Slice str(data.get(), kStrSize); - srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize); - ASSERT_OK(srcfile->Append(str)); + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + size_t kStrSize = 4096; + auto data = NewAligned(kStrSize, 'A'); + Slice str(data.get(), kStrSize); + srcfile->PrepareWrite(srcfile->GetFileSize(), kStrSize); + ASSERT_OK(srcfile->Append(str)); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. 
+  // Write an entire preallocation block, make sure we increased by two.
+  {
+    auto buf_ptr = NewAligned(block_size, ' ');
+    Slice buf(buf_ptr.get(), block_size);
+    srcfile->PrepareWrite(srcfile->GetFileSize(), block_size);
+    ASSERT_OK(srcfile->Append(buf));
     srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-    ASSERT_EQ(last_allocated_block, 1UL);
-
-    // Write an entire preallocation block, make sure we increased by two.
-    {
-      auto buf_ptr = NewAligned(block_size, ' ');
-      Slice buf(buf_ptr.get(), block_size);
-      srcfile->PrepareWrite(srcfile->GetFileSize(), block_size);
-      ASSERT_OK(srcfile->Append(buf));
-      srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-      ASSERT_EQ(last_allocated_block, 2UL);
-    }
+    ASSERT_EQ(last_allocated_block, 2UL);
+  }

-    // Write five more blocks at once, ensure we're where we need to be.
-    {
-      auto buf_ptr = NewAligned(block_size * 5, ' ');
-      Slice buf = Slice(buf_ptr.get(), block_size * 5);
-      srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
-      ASSERT_OK(srcfile->Append(buf));
-      srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
-      ASSERT_EQ(last_allocated_block, 7UL);
-    }
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+  // Write five more blocks at once, ensure we're where we need to be.
+  {
+    auto buf_ptr = NewAligned(block_size * 5, ' ');
+    Slice buf = Slice(buf_ptr.get(), block_size * 5);
+    srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
+    ASSERT_OK(srcfile->Append(buf));
+    srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
+    ASSERT_EQ(last_allocated_block, 7UL);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
 }

 // Test that the two ways to get children file attributes (in bulk or
@@ -1993,53 +2000,50 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) {
   for (int i = 0; i < kNumChildren; ++i) {
     const std::string path = test_base_dir + "/testfile_" + std::to_string(i);
     std::unique_ptr<WritableFile> file;
-#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
-    if (soptions.use_direct_writes) {
-      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-          "NewWritableFile:O_DIRECT", [&](void* arg) {
-            int* val = static_cast<int*>(arg);
-            *val &= ~O_DIRECT;
-          });
-    }
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+    !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD)
+    if (soptions.use_direct_writes) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "NewWritableFile:O_DIRECT", [&](void* arg) {
+            int* val = static_cast<int*>(arg);
+            *val &= ~O_DIRECT;
+          });
+    }
 #endif
-    ASSERT_OK(env_->NewWritableFile(path, &file, soptions));
-    auto buf_ptr = NewAligned(data.size(), 'T');
-    Slice buf(buf_ptr.get(), data.size());
-    ASSERT_OK(file->Append(buf));
-    data.append(std::string(4096, 'T'));
+    ASSERT_OK(env_->NewWritableFile(path, &file, soptions));
+    auto buf_ptr = NewAligned(data.size(), 'T');
+    Slice buf(buf_ptr.get(), data.size());
+    ASSERT_OK(file->Append(buf));
+    data.append(std::string(4096, 'T'));
   }

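The "NewWritableFile:O_DIRECT" callback used in the hunks above is a reusable SyncPoint pattern: the production open path exposes its open(2) flags through a test hook, and the test clears O_DIRECT where the underlying filesystem (tmpfs, for example) cannot honor it. In isolation the pattern looks roughly like this (a sketch; assumes the RocksDB test headers):

    #include <fcntl.h>

    #include "test_util/sync_point.h"

    void StripODirectForTests() {
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
          "NewWritableFile:O_DIRECT", [](void* arg) {
            int* flags = static_cast<int*>(arg);  // open(2) flags, by pointer
            *flags &= ~O_DIRECT;                  // clear the direct-IO bit
          });
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    }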
-  std::vector<Env::FileAttributes> file_attrs;
-  ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs));
-  for (int i = 0; i < kNumChildren; ++i) {
-    const std::string name = "testfile_" + std::to_string(i);
-    const std::string path = test_base_dir + "/" + name;
+  std::vector<Env::FileAttributes> file_attrs;
+  ASSERT_OK(env_->GetChildrenFileAttributes(test_base_dir, &file_attrs));
+  for (int i = 0; i < kNumChildren; ++i) {
+    const std::string name = "testfile_" + std::to_string(i);
+    const std::string path = test_base_dir + "/" + name;

-    auto file_attrs_iter = std::find_if(
-        file_attrs.begin(), file_attrs.end(),
-        [&name](const Env::FileAttributes& fm) { return fm.name == name; });
-    ASSERT_TRUE(file_attrs_iter != file_attrs.end());
-    uint64_t size;
-    ASSERT_OK(env_->GetFileSize(path, &size));
-    ASSERT_EQ(size, 4096 * i);
-    ASSERT_EQ(size, file_attrs_iter->size_bytes);
-  }
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+    auto file_attrs_iter = std::find_if(
+        file_attrs.begin(), file_attrs.end(),
+        [&name](const Env::FileAttributes& fm) { return fm.name == name; });
+    ASSERT_TRUE(file_attrs_iter != file_attrs.end());
+    uint64_t size;
+    ASSERT_OK(env_->GetFileSize(path, &size));
+    ASSERT_EQ(size, 4096 * i);
+    ASSERT_EQ(size, file_attrs_iter->size_bytes);
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
 }

 // Test that all WritableFileWrapper forwards all calls to WritableFile.
 TEST_P(EnvPosixTestWithParam, WritableFileWrapper) {
   class Base : public WritableFile {
    public:
-    mutable int *step_;
+    mutable int* step_;

-    void inc(int x) const {
-      EXPECT_EQ(x, (*step_)++);
-    }
+    void inc(int x) const { EXPECT_EQ(x, (*step_)++); }

-    explicit Base(int* step) : step_(step) {
-      inc(0);
-    }
+    explicit Base(int* step) : step_(step) { inc(0); }

     Status Append(const Slice& /*data*/) override {
       inc(1);
@@ -2372,32 +2376,31 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) {
 }

 class TestEnv : public EnvWrapper {
-  public:
-    explicit TestEnv() : EnvWrapper(Env::Default()),
-      close_count(0) { }
-    const char* Name() const override { return "TestEnv"; }
-    class TestLogger : public Logger {
-      public:
-        using Logger::Logv;
-        explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
-        ~TestLogger() override {
-          if (!closed_) {
-            Status s = CloseHelper();
-            s.PermitUncheckedError();
-          }
+ public:
+  explicit TestEnv() : EnvWrapper(Env::Default()), close_count(0) {}
+  const char* Name() const override { return "TestEnv"; }
+  class TestLogger : public Logger {
+   public:
+    using Logger::Logv;
+    explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+    ~TestLogger() override {
+      if (!closed_) {
+        Status s = CloseHelper();
+        s.PermitUncheckedError();
       }
-        void Logv(const char* /*format*/, va_list /*ap*/) override {}
+    }
+    void Logv(const char* /*format*/, va_list /*ap*/) override {}

-      protected:
-        Status CloseImpl() override { return CloseHelper(); }
+   protected:
+    Status CloseImpl() override { return CloseHelper(); }

-      private:
-        Status CloseHelper() {
-          env->CloseCountInc();
-          return Status::OK();
-        }
-        TestEnv* env;
-    };
+   private:
+    Status CloseHelper() {
+      env->CloseCountInc();
+      return Status::OK();
+    }
+    TestEnv* env;
+  };

   void CloseCountInc() { close_count++; }
@@ -2504,7 +2507,8 @@ class EnvFSTestWithParam
       env_ptr_ = NewCompositeEnv(fs_);
     }
     if (env_non_null && !env_default && fs_default) {
-      env_ptr_ = std::unique_ptr<Env>(new FaultInjectionTestEnv(Env::Default()));
+      env_ptr_ =
+          std::unique_ptr<Env>(new FaultInjectionTestEnv(Env::Default()));
       fs_.reset();
     }
     if (env_non_null && !env_default && !fs_default) {
@@ -2572,17 +2576,16 @@ TEST_P(EnvFSTestWithParam, OptionsTest) {
 // 1. True means Options::env is non-null, false means null
 // 2. True means use Env::Default, false means custom
 // 3. 
True means use FileSystem::Default, false means custom -INSTANTIATE_TEST_CASE_P( - EnvFSTest, EnvFSTestWithParam, - ::testing::Combine(::testing::Bool(), ::testing::Bool(), - ::testing::Bool())); +INSTANTIATE_TEST_CASE_P(EnvFSTest, EnvFSTestWithParam, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), + ::testing::Bool())); // This test ensures that default Env and those allocated by // NewCompositeEnv() all share the same threadpool TEST_F(EnvTest, MultipleCompositeEnv) { std::shared_ptr fs1 = - std::make_shared(FileSystem::Default()); + std::make_shared(FileSystem::Default()); std::shared_ptr fs2 = - std::make_shared(FileSystem::Default()); + std::make_shared(FileSystem::Default()); std::unique_ptr env1 = NewCompositeEnv(fs1); std::unique_ptr env2 = NewCompositeEnv(fs2); Env::Default()->SetBackgroundThreads(8, Env::HIGH); diff --git a/env/file_system.cc b/env/file_system.cc index ab5b3c450df..f9dda429a33 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -136,7 +136,7 @@ IOStatus FileSystem::NewLogger(const std::string& fname, } FileOptions FileSystem::OptimizeForLogRead( - const FileOptions& file_options) const { + const FileOptions& file_options) const { FileOptions optimized_file_options(file_options); optimized_file_options.use_direct_reads = false; return optimized_file_options; @@ -150,7 +150,7 @@ FileOptions FileSystem::OptimizeForManifestRead( } FileOptions FileSystem::OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const { + const DBOptions& db_options) const { FileOptions optimized_file_options(file_options); optimized_file_options.bytes_per_sync = db_options.wal_bytes_per_sync; optimized_file_options.writable_file_max_buffer_size = @@ -220,8 +220,7 @@ IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, char* space = new char[kBufferSize]; while (true) { Slice fragment; - s = file->Read(kBufferSize, IOOptions(), &fragment, space, - nullptr); + s = file->Read(kBufferSize, IOOptions(), &fragment, space, nullptr); if (!s.ok()) { break; } diff --git a/env/io_posix.cc b/env/io_posix.cc index 59a94acd46d..0ec0e9c83b4 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -9,8 +9,10 @@ #ifdef ROCKSDB_LIB_IO_POSIX #include "env/io_posix.h" + #include #include + #include #if defined(OS_LINUX) #include @@ -601,8 +603,7 @@ IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n, return s; } -IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, - size_t num_reqs, +IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { if (use_direct_io()) { diff --git a/env/io_posix.h b/env/io_posix.h index d766427f8f5..f129668ea54 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -35,6 +35,12 @@ #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ #define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ #define POSIX_FADV_DONTNEED 4 /* [MC1] don't need these pages */ + +#define POSIX_MADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_MADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_MADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_MADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_MADV_DONTNEED 4 /* [MC1] don't need these pages */ #endif namespace ROCKSDB_NAMESPACE { @@ -284,8 +290,7 @@ class PosixRandomAccessFile : public FSRandomAccessFile { public: PosixRandomAccessFile(const std::string& fname, int fd, - size_t logical_block_size, - const 
EnvOptions& options + size_t logical_block_size, const EnvOptions& options #if defined(ROCKSDB_IOURING_PRESENT) , ThreadLocalPtr* thread_local_io_urings diff --git a/env/mock_env.h b/env/mock_env.h index a8d5283c5f7..406a31f6357 100644 --- a/env/mock_env.h +++ b/env/mock_env.h @@ -135,6 +135,7 @@ class MockEnv : public CompositeEnvWrapper { const char* Name() const override { return kClassName(); } Status CorruptBuffer(const std::string& fname); + private: MockEnv(Env* env, const std::shared_ptr& fs, const std::shared_ptr& clock); diff --git a/env/mock_env_test.cc b/env/mock_env_test.cc index bcd8ed530a5..be174bd73d2 100644 --- a/env/mock_env_test.cc +++ b/env/mock_env_test.cc @@ -51,14 +51,14 @@ TEST_F(MockEnvTest, Corrupt) { ASSERT_OK(writable_file->Append(kCorrupted)); ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size()); result.clear(); - ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), - &result, &(scratch[0]))); + ASSERT_OK( + rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0]))); ASSERT_EQ(result.compare(kCorrupted), 0); // Corrupted ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); result.clear(); - ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), - &result, &(scratch[0]))); + ASSERT_OK( + rand_file->Read(kGood.size(), kCorrupted.size(), &result, &(scratch[0]))); ASSERT_NE(result.compare(kCorrupted), 0); } diff --git a/examples/.gitignore b/examples/.gitignore index 823664ae1f5..39da06a8584 100644 --- a/examples/.gitignore +++ b/examples/.gitignore @@ -5,5 +5,6 @@ compaction_filter_example multi_processes_example optimistic_transaction_example options_file_example +rocksdb_backup_restore_example simple_example transaction_example diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc index d28b8e776e3..3828d3fb3f7 100644 --- a/examples/column_families_example.cc +++ b/examples/column_families_example.cc @@ -7,8 +7,8 @@ #include #include "rocksdb/db.h" -#include "rocksdb/slice.h" #include "rocksdb/options.h" +#include "rocksdb/slice.h" #if defined(OS_WIN) std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_column_families_example"; @@ -52,8 +52,8 @@ int main() { column_families.push_back(ColumnFamilyDescriptor( ROCKSDB_NAMESPACE::kDefaultColumnFamilyName, ColumnFamilyOptions())); // open the new one, too - column_families.push_back(ColumnFamilyDescriptor( - "new_cf", ColumnFamilyOptions())); + column_families.push_back( + ColumnFamilyDescriptor("new_cf", ColumnFamilyOptions())); std::vector handles; s = DB::Open(DBOptions(), kDBPath, column_families, &handles, &db); assert(s.ok()); diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index e56b30d592b..1ecf8c79474 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -8,6 +8,7 @@ #include #include + #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/options.h" @@ -39,29 +40,27 @@ class Compactor : public EventListener { // and column family. It is the caller's responsibility to // destroy the returned CompactionTask. Returns "nullptr" // if it cannot find a proper compaction task. - virtual CompactionTask* PickCompaction( - DB* db, const std::string& cf_name) = 0; + virtual CompactionTask* PickCompaction(DB* db, + const std::string& cf_name) = 0; // Schedule and run the specified compaction task in background. 
- virtual void ScheduleCompaction(CompactionTask *task) = 0; + virtual void ScheduleCompaction(CompactionTask* task) = 0; }; // Example structure that describes a compaction task. struct CompactionTask { - CompactionTask( - DB* _db, Compactor* _compactor, - const std::string& _column_family_name, - const std::vector& _input_file_names, - const int _output_level, - const CompactionOptions& _compact_options, - bool _retry_on_fail) - : db(_db), - compactor(_compactor), - column_family_name(_column_family_name), - input_file_names(_input_file_names), - output_level(_output_level), - compact_options(_compact_options), - retry_on_fail(_retry_on_fail) {} + CompactionTask(DB* _db, Compactor* _compactor, + const std::string& _column_family_name, + const std::vector& _input_file_names, + const int _output_level, + const CompactionOptions& _compact_options, bool _retry_on_fail) + : db(_db), + compactor(_compactor), + column_family_name(_column_family_name), + input_file_names(_input_file_names), + output_level(_output_level), + compact_options(_compact_options), + retry_on_fail(_retry_on_fail) {} DB* db; Compactor* compactor; const std::string& column_family_name; @@ -77,15 +76,13 @@ class FullCompactor : public Compactor { public: explicit FullCompactor(const Options options) : options_(options) { compact_options_.compression = options_.compression; - compact_options_.output_file_size_limit = - options_.target_file_size_base; + compact_options_.output_file_size_limit = options_.target_file_size_base; } // When flush happens, it determines whether to trigger compaction. If // triggered_writes_stop is true, it will also set the retry flag of // compaction-task to true. - void OnFlushCompleted( - DB* db, const FlushJobInfo& info) override { + void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { CompactionTask* task = PickCompaction(db, info.cf_name); if (task != nullptr) { if (info.triggered_writes_stop) { @@ -97,8 +94,7 @@ class FullCompactor : public Compactor { } // Always pick a compaction which includes all files whenever possible. - CompactionTask* PickCompaction( - DB* db, const std::string& cf_name) override { + CompactionTask* PickCompaction(DB* db, const std::string& cf_name) override { ColumnFamilyMetaData cf_meta; db->GetColumnFamilyMetaData(&cf_meta); @@ -111,9 +107,8 @@ class FullCompactor : public Compactor { input_file_names.push_back(file.name); } } - return new CompactionTask( - db, this, cf_name, input_file_names, - options_.num_levels - 1, compact_options_, false); + return new CompactionTask(db, this, cf_name, input_file_names, + options_.num_levels - 1, compact_options_, false); } // Schedule the specified compaction task in background. @@ -127,16 +122,14 @@ class FullCompactor : public Compactor { assert(task); assert(task->db); Status s = task->db->CompactFiles( - task->compact_options, - task->input_file_names, - task->output_level); + task->compact_options, task->input_file_names, task->output_level); printf("CompactFiles() finished with status %s\n", s.ToString().c_str()); if (!s.ok() && !s.IsIOError() && task->retry_on_fail) { // If a compaction task with its retry_on_fail=true failed, // try to schedule another compaction in case the reason // is not an IO error. 
- CompactionTask* new_task = task->compactor->PickCompaction( - task->db, task->column_family_name); + CompactionTask* new_task = + task->compactor->PickCompaction(task->db, task->column_family_name); task->compactor->ScheduleCompaction(new_task); } } @@ -167,14 +160,13 @@ int main() { // because of options.level0_stop_writes_trigger for (int i = 1000; i < 99999; ++i) { db->Put(WriteOptions(), std::to_string(i), - std::string(500, 'a' + (i % 26))); + std::string(500, 'a' + (i % 26))); } // verify the values are still there std::string value; for (int i = 1000; i < 99999; ++i) { - db->Get(ReadOptions(), std::to_string(i), - &value); + db->Get(ReadOptions(), std::to_string(i), &value); assert(value == std::string(500, 'a' + (i % 26))); } diff --git a/examples/optimistic_transaction_example.cc b/examples/optimistic_transaction_example.cc index e0398f66e58..fb0514a694a 100644 --- a/examples/optimistic_transaction_example.cc +++ b/examples/optimistic_transaction_example.cc @@ -8,8 +8,8 @@ #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" -#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" using ROCKSDB_NAMESPACE::DB; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 24e97506e78..2d49c4d14da 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -7,8 +7,8 @@ #include #include "rocksdb/db.h" -#include "rocksdb/slice.h" #include "rocksdb/options.h" +#include "rocksdb/slice.h" using ROCKSDB_NAMESPACE::DB; using ROCKSDB_NAMESPACE::Options; diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 300bf0f8f91..b97a0f224d5 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -61,9 +61,10 @@ DeleteScheduler::~DeleteScheduler() { Status DeleteScheduler::DeleteFile(const std::string& file_path, const std::string& dir_to_sync, const bool force_bg) { - if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && - total_trash_size_.load() > - sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { + if (rate_bytes_per_sec_.load() <= 0 || + (!force_bg && + total_trash_size_.load() > + sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); @@ -318,8 +319,8 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, if (my_status.ok()) { if (num_hard_links == 1) { std::unique_ptr wf; - my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), - &wf, nullptr); + my_status = fs_->ReopenWritableFile(path_in_trash, FileOptions(), &wf, + nullptr); if (my_status.ok()) { my_status = wf->Truncate(file_size - bytes_max_delete_chunk_, IOOptions(), nullptr); diff --git a/file/delete_scheduler.h b/file/delete_scheduler.h index 6d3f6b4a4f4..2904ec62186 100644 --- a/file/delete_scheduler.h +++ b/file/delete_scheduler.h @@ -14,7 +14,6 @@ #include "monitoring/instrumented_mutex.h" #include "port/port.h" - #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { @@ -54,7 +53,7 @@ class DeleteScheduler { // set, it forces the file to always be deleted in the background thread, // except when rate limiting is disabled Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, - const bool force_bg = false); + const bool force_bg = false); // Wait for all files 
being deleteing in the background to finish or for // destructor to be called. @@ -67,9 +66,7 @@ class DeleteScheduler { uint64_t GetTotalTrashSize() { return total_trash_size_.load(); } // Return trash/DB size ratio where new files will be deleted immediately - double GetMaxTrashDBRatio() { - return max_trash_db_ratio_.load(); - } + double GetMaxTrashDBRatio() { return max_trash_db_ratio_.load(); } // Update trash/DB size ratio where new files will be deleted immediately void SetMaxTrashDBRatio(double r) { diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 19672115559..d825da32a28 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -136,7 +136,7 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { EXPECT_EQ(dummy_files_dirs_[0], *dir); }); - int num_files = 100; // 100 files + int num_files = 100; // 100 files uint64_t file_size = 1024; // every file is 1 kb std::vector delete_kbs_per_sec = {512, 200, 100, 50, 25}; @@ -249,7 +249,7 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); int thread_cnt = 10; - int num_files = 10; // 10 files per thread + int num_files = 10; // 10 files per thread uint64_t file_size = 1024; // every file is 1 kb std::vector delete_kbs_per_sec = {512, 200, 100, 50, 25}; @@ -591,8 +591,7 @@ TEST_F(DeleteSchedulerTest, DISABLED_DynamicRateLimiting1) { rate_bytes_per_sec_ = 0; // Disable rate limiting initially NewDeleteScheduler(); - - int num_files = 10; // 10 files + int num_files = 10; // 10 files uint64_t file_size = 1024; // every file is 1 kb std::vector delete_kbs_per_sec = {512, 200, 0, 100, 50, -2, 25}; @@ -662,9 +661,9 @@ TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - int num_files = 100; // 100 files - uint64_t file_size = 1024 * 10; // 100 KB as a file size - rate_bytes_per_sec_ = 1; // 1 byte per sec (very slow trash delete) + int num_files = 100; // 100 files + uint64_t file_size = 1024 * 10; // 100 KB as a file size + rate_bytes_per_sec_ = 1; // 1 byte per sec (very slow trash delete) NewDeleteScheduler(); delete_scheduler_->SetMaxTrashDBRatio(0.25); diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index 9ea9129e2da..f7d4c959135 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -109,6 +109,7 @@ Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t read_len, uint64_t rounddown_start, uint32_t index) { + TEST_SYNC_POINT("FilePrefetchBuffer::ReadAsync"); // callback for async read request. auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this, std::placeholders::_1, std::placeholders::_2); @@ -247,10 +248,14 @@ void FilePrefetchBuffer::AbortAllIOs() { // Release io_handles. 
if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) { DestroyAndClearIOHandle(curr_); + } else { + bufs_[curr_].async_read_in_progress_ = false; } if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) { DestroyAndClearIOHandle(second); + } else { + bufs_[second].async_read_in_progress_ = false; } } @@ -325,7 +330,16 @@ Status FilePrefetchBuffer::HandleOverlappingData( uint64_t& tmp_offset, size_t& tmp_length) { Status s; size_t alignment = reader->file()->GetRequiredBufferAlignment(); - uint32_t second = curr_ ^ 1; + uint32_t second; + + // Check if the first buffer has the required offset and the async read is + // still in progress. This should only happen if a prefetch was initiated + // by Seek, but the next access is at another offset. + if (bufs_[curr_].async_read_in_progress_ && + IsOffsetInBufferWithAsyncProgress(offset, curr_)) { + PollAndUpdateBuffersIfNeeded(offset); + } + second = curr_ ^ 1; // If data is overlapping over two buffers, copy the data from curr_ and // call ReadAsync on curr_. diff --git a/file/file_util.cc b/file/file_util.cc index d7858f3c805..7997d6e11eb 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -5,8 +5,8 @@ // #include "file/file_util.h" -#include #include +#include #include "file/random_access_file_reader.h" #include "file/sequence_file_reader.h" diff --git a/file/filename.cc b/file/filename.cc index b771e081396..1e04c73395e 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -7,11 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "file/filename.h" -#include #include #include + +#include #include + #include "file/writable_file_writer.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" @@ -42,10 +44,8 @@ static size_t GetInfoLogPrefix(const std::string& path, char* dest, int len) { while (i < src_len && write_idx < len - sizeof(suffix)) { if ((path[i] >= 'a' && path[i] <= 'z') || (path[i] >= '0' && path[i] <= '9') || - (path[i] >= 'A' && path[i] <= 'Z') || - path[i] == '-' || - path[i] == '.' || - path[i] == '_'){ + (path[i] >= 'A' && path[i] <= 'Z') || path[i] == '-' || + path[i] == '.' || path[i] == '_') { dest[write_idx++] = path[i]; } else { if (i > 0) { @@ -153,9 +153,10 @@ void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf, if (path_id == 0) { snprintf(out_buf, out_buf_size, "%" PRIu64, number); } else { - snprintf(out_buf, out_buf_size, "%" PRIu64 - "(path " - "%" PRIu32 ")", + snprintf(out_buf, out_buf_size, + "%" PRIu64 + "(path " + "%" PRIu32 ")", number, path_id); } } @@ -176,9 +177,7 @@ std::string CurrentFileName(const std::string& dbname) { return dbname + "/" + kCurrentFileName; } -std::string LockFileName(const std::string& dbname) { - return dbname + "/LOCK"; -} +std::string LockFileName(const std::string& dbname) { return dbname + "/LOCK"; } std::string TempFileName(const std::string& dbname, uint64_t number) { return MakeFileName(dbname, number, kTempFileNameSuffix.c_str()); @@ -199,7 +198,8 @@ InfoLogPrefix::InfoLogPrefix(bool has_log_dir, } std::string InfoLogFileName(const std::string& dbname, - const std::string& db_path, const std::string& log_dir) { + const std::string& db_path, + const std::string& log_dir) { if (log_dir.empty()) { return dbname + "/LOG"; } @@ -210,7 +210,8 @@ std::string InfoLogFileName(const std::string& dbname, // Return the name of the old info log file for "dbname". 
std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts, - const std::string& db_path, const std::string& log_dir) { + const std::string& db_path, + const std::string& log_dir) { char buf[50]; snprintf(buf, sizeof(buf), "%llu", static_cast(ts)); @@ -263,9 +264,7 @@ std::string IdentityFileName(const std::string& dbname) { // dbname/OPTIONS-[0-9]+ // dbname/OPTIONS-[0-9]+.dbtmp // Disregards / at the beginning -bool ParseFileName(const std::string& fname, - uint64_t* number, - FileType* type, +bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type, WalFileType* log_type) { return ParseFileName(fname, number, "", type, log_type); } @@ -370,7 +369,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number, *log_type = kAliveLogFile; } } else if (archive_dir_found) { - return false; // Archive dir can contain only log files + return false; // Archive dir can contain only log files } else if (suffix == Slice(kRocksDbTFileExt) || suffix == Slice(kLevelDbTFileExt)) { *type = kTableFile; diff --git a/file/filename.h b/file/filename.h index 6d45e521073..2eb125b6a17 100644 --- a/file/filename.h +++ b/file/filename.h @@ -11,8 +11,9 @@ #pragma once #include -#include + #include +#include #include #include "options/db_options.h" @@ -54,8 +55,7 @@ extern std::string ArchivalDirectory(const std::string& dbname); // Return the name of the archived log file with the specified number // in the db named by "dbname". The result will be prefixed with "dbname". -extern std::string ArchivedLogFileName(const std::string& dbname, - uint64_t num); +extern std::string ArchivedLogFileName(const std::string& dbname, uint64_t num); extern std::string MakeTableFileName(const std::string& name, uint64_t number); @@ -140,8 +140,7 @@ extern std::string TempOptionsFileName(const std::string& dbname, // Return the name to use for a metadatabase. The result will be prefixed with // "dbname". -extern std::string MetaDatabaseName(const std::string& dbname, - uint64_t number); +extern std::string MetaDatabaseName(const std::string& dbname, uint64_t number); // Return the name of the Identity file which stores a unique number for the db // that will get regenerated if the db loses all its data and is recreated fresh diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index ec06ef0d898..23e7454ed11 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -4,10 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "db/db_test_util.h" +#include "file/file_prefetch_buffer.h" +#include "file/file_util.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" #ifdef GFLAGS #include "tools/io_tracer_parser_tool.h" #endif +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -75,6 +79,27 @@ class PrefetchTest public ::testing::WithParamInterface> { public: PrefetchTest() : DBTestBase("prefetch_test", true) {} + + void SetGenericOptions(Env* env, bool use_direct_io, Options& options) { + options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env; + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + } + + void SetBlockBasedTableOptions(BlockBasedTableOptions& table_options) { + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } }; INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, @@ -85,28 +110,23 @@ std::string BuildKey(int num, std::string postfix = "") { return "my_key_" + std::to_string(num) + postfix; } +// This test verifies the basic functionality of prefetching. TEST_P(PrefetchTest, Basic) { // First param is if the mockFS support_prefetch or not bool support_prefetch = std::get<0>(GetParam()) && test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - const int kNumKeys = 1100; - std::shared_ptr fs = - std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + const int kNumKeys = 1100; int buff_prefetch_count = 0; SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); @@ -188,35 +208,24 @@ TEST_P(PrefetchTest, Basic) { } #ifndef ROCKSDB_LITE +// This test verifies BlockBasedTableOptions.max_auto_readahead_size is +// configured dynamically. 
TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { // First param is if the mockFS support_prefetch or not bool support_prefetch = std::get<0>(GetParam()) && test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - std::shared_ptr fs = - std::make_shared(env_->GetFileSystem(), support_prefetch); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - options.disable_auto_compactions = true; - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } + Options options; + SetGenericOptions(env.get(), use_direct_io, options); BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); table_options.max_auto_readahead_size = 0; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -325,6 +334,8 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { Close(); } +// This test verifies BlockBasedTableOptions.initial_auto_readahead_size is +// configured dynamically. TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { // First param is if the mockFS support_prefetch or not bool support_prefetch = @@ -337,23 +348,10 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { std::shared_ptr fs = std::make_shared(env_->GetFileSystem(), support_prefetch); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - options.disable_auto_compactions = true; - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } + Options options; + SetGenericOptions(env.get(), use_direct_io, options); BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); table_options.initial_auto_readahead_size = 0; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -464,6 +462,8 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { Close(); } +// This test verifies BlockBasedTableOptions.num_file_reads_for_auto_readahead +// is configured dynamically. 
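As background for the next test: implicit auto-readahead only arms after a threshold number of reads of the same file (num_file_reads_for_auto_readahead), then grows the readahead geometrically from the initial size up to the configured maximum. A rough model of the sizing, offered as a sketch rather than the actual implementation:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    size_t NextReadaheadSize(size_t cur, size_t initial, size_t max_size,
                             uint64_t num_file_reads,
                             uint64_t reads_threshold) {
      if (num_file_reads <= reads_threshold) {
        return 0;  // not enough reads of this file yet; prefetching stays off
      }
      return cur == 0 ? initial : std::min(cur * 2, max_size);
    }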
TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) { // First param is if the mockFS support_prefetch or not bool support_prefetch = @@ -478,26 +478,13 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) { // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - + Options options; + SetGenericOptions(env.get(), use_direct_io, options); BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); table_options.num_file_reads_for_auto_readahead = 0; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } - int buff_prefetch_count = 0; SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); @@ -574,6 +561,13 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) { } #endif // !ROCKSDB_LITE +// This test verifies the basic functionality of implicit autoreadahead: +// - Enable implicit autoreadahead and prefetch only if sequential blocks are +// read, +// - If data is already in buffer and few blocks are not requested to read, +// don't reset, +// - If data blocks are sequential during read after enabling implicit +// autoreadahead, reset readahead parameters. TEST_P(PrefetchTest, PrefetchWhenReseek) { // First param is if the mockFS support_prefetch or not bool support_prefetch = @@ -588,25 +582,12 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - + Options options; + SetGenericOptions(env.get(), use_direct_io, options); BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } - int buff_prefetch_count = 0; SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); @@ -660,7 +641,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { iter->Seek(BuildKey(1019)); ASSERT_TRUE(iter->Valid()); // Missed 2 blocks but they are already in buffer so no reset. - iter->Seek(BuildKey(103)); // Already in buffer. + iter->Seek(BuildKey(103)); // Already in buffer. 
ASSERT_TRUE(iter->Valid()); iter->Seek(BuildKey(1033)); // Prefetch Data ASSERT_TRUE(iter->Valid()); @@ -842,6 +823,12 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { Close(); } +// This test verifies the functionality of implicit autoreadahead when caching +// is enabled: +// - If data is already in buffer and few blocks are not requested to read, +// don't reset, +// - If block was eligible for prefetching/in buffer but found in cache, don't +// prefetch and reset. TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { // First param is if the mockFS support_prefetch or not bool support_prefetch = @@ -856,26 +843,15 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - + Options options; + SetGenericOptions(env.get(), use_direct_io, options); BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB table_options.block_cache = cache; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.no_block_cache = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } - int buff_prefetch_count = 0; SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); @@ -974,6 +950,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { } #ifndef ROCKSDB_LITE +// This test verifies the functionality of ReadOptions.adaptive_readahead. 
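For reference, enabling the option under test from application code is a one-line change on ReadOptions; a usage sketch assuming an open DB* db:

    ReadOptions ro;
    ro.adaptive_readahead = true;  // readahead adapts and carries across files
    std::unique_ptr<Iterator> it(db->NewIterator(ro));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
      // long sequential scans are where the adaptation pays off
    }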
TEST_P(PrefetchTest, DBIterLevelReadAhead) { const int kNumKeys = 1000; // Set options @@ -984,23 +961,11 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { bool use_direct_io = std::get<0>(GetParam()); bool is_adaptive_readahead = std::get<1>(GetParam()); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; + Options options; + SetGenericOptions(env.get(), use_direct_io, options); options.statistics = CreateDBStatistics(); - options.env = env.get(); - - if (use_direct_io) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Status s = TryReopen(options); @@ -1024,7 +989,6 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { } MoveFilesToLevel(2); int buff_prefetch_count = 0; - int buff_async_prefetch_count = 0; int readahead_carry_over_count = 0; int num_sst_files = NumTableFilesAtLevel(2); size_t current_readahead_size = 0; @@ -1035,6 +999,101 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { "FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); + // The callback checks, since reads are sequential, readahead_size doesn't + // start from 8KB when iterator moves to next file and its called + // num_sst_files-1 times (excluding for first file). + SyncPoint::GetInstance()->SetCallBack( + "BlockPrefetcher::SetReadaheadState", [&](void* arg) { + readahead_carry_over_count++; + size_t readahead_size = *reinterpret_cast(arg); + if (readahead_carry_over_count) { + ASSERT_GT(readahead_size, 8 * 1024); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::TryReadFromCache", [&](void* arg) { + current_readahead_size = *reinterpret_cast(arg); + ASSERT_GT(current_readahead_size, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ro; + if (is_adaptive_readahead) { + ro.adaptive_readahead = true; + } + + ASSERT_OK(options.statistics->Reset()); + + auto iter = std::unique_ptr(db_->NewIterator(ro)); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + num_keys++; + } + ASSERT_EQ(num_keys, total_keys); + + // For index and data blocks. + if (is_adaptive_readahead) { + ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1)); + } else { + ASSERT_GT(buff_prefetch_count, 0); + ASSERT_EQ(readahead_carry_over_count, 0); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } + Close(); +} + +// This test verifies the functionality of ReadOptions.adaptive_readahead when +// async_io is enabled. 
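The async variant below additionally sets ReadOptions::async_io. Whether prefetches actually took the asynchronous path can be confirmed from the ASYNC_READ_BYTES histogram, which is what the test asserts; a sketch, assuming options.statistics was set:

    ReadOptions ro;
    ro.adaptive_readahead = true;
    ro.async_io = true;  // prefetch through the FileSystem's ReadAsync path
    std::unique_ptr<Iterator> it(db->NewIterator(ro));
    for (it->SeekToFirst(); it->Valid(); it->Next()) {
    }
    HistogramData async_read_bytes;
    options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
    // async_read_bytes.count stays 0 on platforms without io_uring support.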
+TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { + const int kNumKeys = 1000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + bool use_direct_io = std::get<0>(GetParam()); + bool is_adaptive_readahead = std::get<1>(GetParam()); + + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + int total_keys = 0; + for (int j = 0; j < 5; j++) { + for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + total_keys++; + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + int buff_async_prefetch_count = 0; + int readahead_carry_over_count = 0; + int num_sst_files = NumTableFilesAtLevel(2); + size_t current_readahead_size = 0; + + // Test - Iterate over the keys sequentially. + { SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::PrefetchAsyncInternal:Start", [&](void*) { buff_async_prefetch_count++; }); @@ -1062,8 +1121,8 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { ReadOptions ro; if (is_adaptive_readahead) { ro.adaptive_readahead = true; - ro.async_io = true; } + ro.async_io = true; ASSERT_OK(options.statistics->Reset()); @@ -1078,11 +1137,10 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { // For index and data blocks. if (is_adaptive_readahead) { ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1)); - ASSERT_GT(buff_async_prefetch_count, 0); } else { - ASSERT_GT(buff_prefetch_count, 0); ASSERT_EQ(readahead_carry_over_count, 0); } + ASSERT_GT(buff_async_prefetch_count, 0); // Check stats to make sure async prefetch is done. { @@ -1106,11 +1164,34 @@ class PrefetchTest1 : public DBTestBase, public ::testing::WithParamInterface { public: PrefetchTest1() : DBTestBase("prefetch_test1", true) {} + + void SetGenericOptions(Env* env, bool use_direct_io, Options& options) { + options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env; + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + } + + void SetBlockBasedTableOptions(BlockBasedTableOptions& table_options) { + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } }; INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, ::testing::Bool()); #ifndef ROCKSDB_LITE +// This test verifies the functionality of ReadOptions.adaptive_readahead when +// reads are not sequential. 
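The property checked next follows from how the readahead state machine treats discontinuities: a non-sequential access resets the readahead size to its initial value instead of letting it keep growing. Schematically (a sketch, not the actual code):

    #include <cstddef>
    #include <cstdint>

    size_t OnRead(uint64_t offset, uint64_t prev_read_end, size_t cur,
                  size_t initial, size_t max_size) {
      if (offset != prev_read_end) {
        return initial;  // discontinuity: start over from the initial size
      }
      return cur * 2 > max_size ? max_size : cur * 2;  // sequential: grow
    }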
TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) { const int kNumKeys = 1000; // Set options @@ -1118,21 +1199,10 @@ TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) { std::make_shared(env_->GetFileSystem(), false); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - if (GetParam()) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } + Options options; + SetGenericOptions(env.get(), GetParam(), options); BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Status s = TryReopen(options); @@ -1204,6 +1274,16 @@ TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) { } #endif //! ROCKSDB_LITE +// This test verifies the functionality of adaptive_readaheadsize with cache and +// if block is found in cache, decrease the readahead_size if +// - its enabled internally by RocksDB (implicit_auto_readahead_) and, +// - readahead_size is greater than 0 and, +// - the block would have called prefetch API if not found in cache for +// which conditions are: +// - few/no bytes are in buffer and, +// - block is sequential with the previous read and, +// - num_file_reads_ + 1 (including this read) > +// num_file_reads_for_auto_readahead_ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) { const int kNumKeys = 2000; // Set options @@ -1211,24 +1291,14 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) { std::make_shared(env_->GetFileSystem(), false); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - if (GetParam()) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } - + Options options; + SetGenericOptions(env.get(), GetParam(), options); options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB table_options.block_cache = cache; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.no_block_cache = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Status s = TryReopen(options); @@ -1348,6 +1418,8 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) { Close(); } +// This test verifies the basic functionality of seek parallelization for +// async_io. 
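Seek parallelization here means that on a Seek with async_io enabled, the prefetch buffer issues a synchronous read for the block it needs immediately and, concurrently, an async read for the block after it, so subsequent Next() calls find data already in flight. Conceptually (hypothetical types, not the real FilePrefetchBuffer API):

    #include <cstddef>
    #include <cstdint>

    struct TwoBufferPrefetcher {
      // Fill one buffer now; start filling the second in the background.
      void OnSeek(uint64_t offset, size_t block_size) {
        ReadIntoBufferSync(offset, block_size);
        ReadIntoBufferAsync(offset + block_size, block_size);
      }
      void ReadIntoBufferSync(uint64_t /*offset*/, size_t /*n*/) {}
      void ReadIntoBufferAsync(uint64_t /*offset*/, size_t /*n*/) {}
    };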
TEST_P(PrefetchTest1, SeekParallelizationTest) { const int kNumKeys = 2000; // Set options @@ -1355,23 +1427,11 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) { std::make_shared(env_->GetFileSystem(), false); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); - Options options = CurrentOptions(); - options.write_buffer_size = 1024; - options.create_if_missing = true; - options.compression = kNoCompression; - options.env = env.get(); - if (GetParam()) { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - } - + Options options; + SetGenericOptions(env.get(), GetParam(), options); options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.cache_index_and_filter_blocks = false; - table_options.metadata_block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + SetBlockBasedTableOptions(table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Status s = TryReopen(options); @@ -1455,626 +1515,658 @@ extern "C" bool RocksDbIOUringEnable() { return true; } namespace { #ifndef ROCKSDB_LITE #ifdef GFLAGS - const int kMaxArgCount = 100; - const size_t kArgBufferSize = 100000; +const int kMaxArgCount = 100; +const size_t kArgBufferSize = 100000; - void RunIOTracerParserTool(std::string trace_file) { - std::vector params = {"./io_tracer_parser", - "-io_trace_file=" + trace_file}; +void RunIOTracerParserTool(std::string trace_file) { + std::vector params = {"./io_tracer_parser", + "-io_trace_file=" + trace_file}; - char arg_buffer[kArgBufferSize]; - char* argv[kMaxArgCount]; - int argc = 0; - int cursor = 0; - for (const auto& arg : params) { - ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); - ASSERT_LE(argc + 1, kMaxArgCount); + char arg_buffer[kArgBufferSize]; + char* argv[kMaxArgCount]; + int argc = 0; + int cursor = 0; + for (const auto& arg : params) { + ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize); + ASSERT_LE(argc + 1, kMaxArgCount); - snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); + snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str()); - argv[argc++] = arg_buffer + cursor; - cursor += static_cast(arg.size()) + 1; - } - ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv)); + argv[argc++] = arg_buffer + cursor; + cursor += static_cast(arg.size()) + 1; } + ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv)); +} #endif // GFLAGS #endif // ROCKSDB_LITE - } // namespace +} // namespace -// Tests the default implementation of ReadAsync API with PosixFileSystem. - TEST_P(PrefetchTest, ReadAsyncWithPosixFS) { - if (mem_env_ || encrypted_env_) { - ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); - return; - } +// Tests the default implementation of ReadAsync API with PosixFileSystem during +// prefetching. 
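A note on the PosixFileSystem behavior this test accounts for: where io_uring is unavailable, the default ReadAsync still satisfies the request, it just performs the read synchronously and completes the callback before returning. A self-contained sketch of that fallback shape (hypothetical function, not the FSRandomAccessFile signature):

    #include <unistd.h>

    #include <cstddef>
    #include <cstdint>
    #include <functional>

    using ReadCallback = std::function<void(const char* data, size_t len)>;

    // Fallback "async" read: do the pread inline, then complete the callback.
    void ReadAsyncFallback(int fd, uint64_t offset, size_t n, char* scratch,
                           const ReadCallback& cb) {
      ssize_t r = pread(fd, scratch, n, static_cast<off_t>(offset));
      cb(scratch, r > 0 ? static_cast<size_t>(r) : 0);
    }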
+TEST_P(PrefetchTest, ReadAsyncWithPosixFS) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
-    const int kNumKeys = 1000;
-    std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
-        FileSystem::Default(), /*support_prefetch=*/false);
-    std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  const int kNumKeys = 1000;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
-    bool use_direct_io = std::get<0>(GetParam());
-    Options options = CurrentOptions();
-    options.write_buffer_size = 1024;
-    options.create_if_missing = true;
-    options.compression = kNoCompression;
-    options.env = env.get();
-    options.statistics = CreateDBStatistics();
-    if (use_direct_io) {
-      options.use_direct_reads = true;
-      options.use_direct_io_for_flush_and_compaction = true;
-    }
-    BlockBasedTableOptions table_options;
-    table_options.no_block_cache = true;
-    table_options.cache_index_and_filter_blocks = false;
-    table_options.metadata_block_size = 1024;
-    table_options.index_type =
-        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options;
+  SetGenericOptions(env.get(), use_direct_io, options);
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    Status s = TryReopen(options);
-    if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
-      // If direct IO is not supported, skip the test
-      return;
-    } else {
-      ASSERT_OK(s);
-    }
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
-    int total_keys = 0;
-    // Write the keys.
-    {
-      WriteBatch batch;
-      Random rnd(309);
-      for (int j = 0; j < 5; j++) {
-        for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
-          ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
-          total_keys++;
-        }
-        ASSERT_OK(db_->Write(WriteOptions(), &batch));
-        ASSERT_OK(Flush());
+  int total_keys = 0;
+  // Write the keys.
+  {
+    WriteBatch batch;
+    Random rnd(309);
+    for (int j = 0; j < 5; j++) {
+      for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+        ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+        total_keys++;
       }
-      MoveFilesToLevel(2);
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(Flush());
     }
+    MoveFilesToLevel(2);
+  }
-    int buff_prefetch_count = 0;
-    bool read_async_called = false;
-    ReadOptions ro;
-    ro.adaptive_readahead = true;
-    ro.async_io = true;
+  int buff_prefetch_count = 0;
+  bool read_async_called = false;
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
-    if (std::get<1>(GetParam())) {
-      ro.readahead_size = 16 * 1024;
-    }
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
-    SyncPoint::GetInstance()->SetCallBack(
-        "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
-        [&](void*) { buff_prefetch_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
-    SyncPoint::GetInstance()->SetCallBack(
-        "UpdateResults::io_uring_result",
-        [&](void* /*arg*/) { read_async_called = true; });
-    SyncPoint::GetInstance()->EnableProcessing();
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
-    // Read the keys.
-    {
-      ASSERT_OK(options.statistics->Reset());
-      get_perf_context()->Reset();
+  // Read the keys.
+  {
+    ASSERT_OK(options.statistics->Reset());
+    get_perf_context()->Reset();
-      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
-      int num_keys = 0;
-      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-        ASSERT_OK(iter->status());
-        num_keys++;
-      }
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      num_keys++;
+    }
-      ASSERT_EQ(num_keys, total_keys);
-      ASSERT_GT(buff_prefetch_count, 0);
+    ASSERT_EQ(num_keys, total_keys);
+    ASSERT_GT(buff_prefetch_count, 0);
-      // Check stats to make sure async prefetch is done.
-      {
-        HistogramData async_read_bytes;
-        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
-        HistogramData prefetched_bytes_discarded;
-        options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
-                                          &prefetched_bytes_discarded);
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      HistogramData prefetched_bytes_discarded;
+      options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
+                                        &prefetched_bytes_discarded);
-        // Not all platforms support iouring. In that case, ReadAsync in posix
-        // won't submit async requests.
-        if (read_async_called) {
-          ASSERT_GT(async_read_bytes.count, 0);
-        } else {
-          ASSERT_EQ(async_read_bytes.count, 0);
-        }
-        ASSERT_GT(prefetched_bytes_discarded.count, 0);
+      // Not all platforms support iouring. In that case, ReadAsync in posix
+      // won't submit async requests.
+      if (read_async_called) {
+        ASSERT_GT(async_read_bytes.count, 0);
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
       }
-      ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+      ASSERT_GT(prefetched_bytes_discarded.count, 0);
     }
+    ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+  }
-    {
-      // Read the keys using seek.
-      {
-        ASSERT_OK(options.statistics->Reset());
-        get_perf_context()->Reset();
-
-        auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
-        int num_keys = 0;
-        iter->Seek(BuildKey(450));
-        while (iter->Valid()) {
-          ASSERT_OK(iter->status());
-          num_keys++;
-          iter->Next();
-        }
-        ASSERT_OK(iter->status());
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
-        iter->Seek(BuildKey(450));
-        while (iter->Valid()) {
-          ASSERT_OK(iter->status());
-          num_keys++;
-          iter->Prev();
-        }
+  Close();
+}
-        ASSERT_EQ(num_keys, total_keys + 1);
-        ASSERT_GT(buff_prefetch_count, 0);
-
-        // Check stats to make sure async prefetch is done.
-        {
-          HistogramData async_read_bytes;
-          options.statistics->histogramData(ASYNC_READ_BYTES,
-                                            &async_read_bytes);
-          HistogramData prefetched_bytes_discarded;
-          options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
-                                            &prefetched_bytes_discarded);
-
-          // Not all platforms support iouring. In that case, ReadAsync in posix
-          // won't submit async requests.
-          if (read_async_called) {
-            ASSERT_GT(async_read_bytes.count, 0);
-            ASSERT_GT(get_perf_context()->number_async_seek, 0);
-          } else {
-            ASSERT_EQ(async_read_bytes.count, 0);
-            ASSERT_EQ(get_perf_context()->number_async_seek, 0);
-          }
-          ASSERT_GT(prefetched_bytes_discarded.count, 0);
-        }
-      }
-    }
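The tests above drive RocksDB's async prefetch path purely through `ReadOptions`. A minimal application-side sketch of the same switches (assuming a DB already opened by the caller; whether ReadAsync is actually used still depends on platform io_uring support, exactly as the tests allow for — the C API gains the same switch via `rocksdb_readoptions_set_async_io`, see the c.h hunk further below):

#include <cassert>
#include <memory>

#include "rocksdb/db.h"

// Sketch: opt a scan into asynchronous prefetching.
void ScanWithAsyncIO(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  ro.adaptive_readahead = true;  // readahead grows as the scan proceeds
  ro.async_io = true;            // permit asynchronous prefetch reads
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  assert(it->status().ok());
}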
+// This test verifies implementation of seek parallelization with
+// PosixFileSystem during prefetching.
+TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
-    SyncPoint::GetInstance()->DisableProcessing();
-    SyncPoint::GetInstance()->ClearAllCallBacks();
+  const int kNumKeys = 1000;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options;
+  SetGenericOptions(env.get(), use_direct_io, options);
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    Close();
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
   }
-  TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
-    if (mem_env_ || encrypted_env_) {
-      ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
-      return;
+  int total_keys = 0;
+  // Write the keys.
+  {
+    WriteBatch batch;
+    Random rnd(309);
+    for (int j = 0; j < 5; j++) {
+      for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+        ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+        total_keys++;
+      }
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(Flush());
     }
+    MoveFilesToLevel(2);
+  }
-
-    const int kNumKeys = 1000;
-    std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
-        FileSystem::Default(), /*support_prefetch=*/false);
-    std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
-
-    bool use_direct_io = std::get<0>(GetParam());
-    Options options = CurrentOptions();
-    options.write_buffer_size = 1024;
-    options.create_if_missing = true;
-    options.compression = kNoCompression;
-    options.env = env.get();
-    options.statistics = CreateDBStatistics();
-    if (use_direct_io) {
-      options.use_direct_reads = true;
-      options.use_direct_io_for_flush_and_compaction = true;
+  int num_keys_first_batch = 0;
+  int num_keys_second_batch = 0;
+  // Calculate number of keys without async_io for correctness validation.
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    // First Seek.
+    iter->Seek(BuildKey(450));
+    while (iter->Valid() && num_keys_first_batch < 100) {
+      ASSERT_OK(iter->status());
+      num_keys_first_batch++;
+      iter->Next();
     }
-    BlockBasedTableOptions table_options;
-    table_options.no_block_cache = true;
-    table_options.cache_index_and_filter_blocks = false;
-    table_options.metadata_block_size = 1024;
-    table_options.index_type =
-        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    ASSERT_OK(iter->status());
-    Status s = TryReopen(options);
-    if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
-      // If direct IO is not supported, skip the test
-      return;
-    } else {
-      ASSERT_OK(s);
+    iter->Seek(BuildKey(942));
+    while (iter->Valid()) {
+      ASSERT_OK(iter->status());
+      num_keys_second_batch++;
+      iter->Next();
     }
+    ASSERT_OK(iter->status());
+  }
-    int total_keys = 0;
-    // Write the keys.
-    {
-      WriteBatch batch;
-      Random rnd(309);
-      for (int j = 0; j < 5; j++) {
-        for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
-          ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
-          total_keys++;
-        }
-        ASSERT_OK(db_->Write(WriteOptions(), &batch));
-        ASSERT_OK(Flush());
-      }
-      MoveFilesToLevel(2);
-    }
+  int buff_prefetch_count = 0;
+  bool read_async_called = false;
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Read the keys using seek.
+  {
+    ASSERT_OK(options.statistics->Reset());
+    get_perf_context()->Reset();
-    int num_keys_first_batch = 0;
-    int num_keys_second_batch = 0;
-    // Calculate number of keys without async_io for correctness validation.
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    // First Seek.
     {
-      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
-      // First Seek.
       iter->Seek(BuildKey(450));
-      while (iter->Valid() && num_keys_first_batch < 100) {
+      while (iter->Valid() && num_keys < 100) {
        ASSERT_OK(iter->status());
-        num_keys_first_batch++;
+        num_keys++;
        iter->Next();
      }
      ASSERT_OK(iter->status());
+      ASSERT_EQ(num_keys, num_keys_first_batch);
+      // Check stats to make sure async prefetch is done.
+      {
+        HistogramData async_read_bytes;
+        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
-      iter->Seek(BuildKey(942));
-      while (iter->Valid()) {
-        ASSERT_OK(iter->status());
-        num_keys_second_batch++;
-        iter->Next();
+        // Not all platforms support iouring. In that case, ReadAsync in posix
+        // won't submit async requests.
+        if (read_async_called) {
+          ASSERT_GT(async_read_bytes.count, 0);
+          ASSERT_GT(get_perf_context()->number_async_seek, 0);
+        } else {
+          ASSERT_EQ(async_read_bytes.count, 0);
+          ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+        }
       }
-      ASSERT_OK(iter->status());
-    }
-
-    int buff_prefetch_count = 0;
-    bool read_async_called = false;
-    ReadOptions ro;
-    ro.adaptive_readahead = true;
-    ro.async_io = true;
-
-    if (std::get<1>(GetParam())) {
-      ro.readahead_size = 16 * 1024;
     }
-    SyncPoint::GetInstance()->SetCallBack(
-        "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
-        [&](void*) { buff_prefetch_count++; });
-
-    SyncPoint::GetInstance()->SetCallBack(
-        "UpdateResults::io_uring_result",
-        [&](void* /*arg*/) { read_async_called = true; });
-    SyncPoint::GetInstance()->EnableProcessing();
-
-    // Read the keys using seek.
+    // Second Seek.
     {
+      num_keys = 0;
       ASSERT_OK(options.statistics->Reset());
       get_perf_context()->Reset();
-      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
-      int num_keys = 0;
-      // First Seek.
-      {
-        iter->Seek(BuildKey(450));
-        while (iter->Valid() && num_keys < 100) {
-          ASSERT_OK(iter->status());
-          num_keys++;
-          iter->Next();
-        }
+      iter->Seek(BuildKey(942));
+      while (iter->Valid()) {
         ASSERT_OK(iter->status());
-        ASSERT_EQ(num_keys, num_keys_first_batch);
-        // Check stats to make sure async prefetch is done.
-        {
-          HistogramData async_read_bytes;
-          options.statistics->histogramData(ASYNC_READ_BYTES,
-                                            &async_read_bytes);
-
-          // Not all platforms support iouring. In that case, ReadAsync in posix
-          // won't submit async requests.
-          if (read_async_called) {
-            ASSERT_GT(async_read_bytes.count, 0);
-            ASSERT_GT(get_perf_context()->number_async_seek, 0);
-          } else {
-            ASSERT_EQ(async_read_bytes.count, 0);
-            ASSERT_EQ(get_perf_context()->number_async_seek, 0);
-          }
-        }
+        num_keys++;
+        iter->Next();
       }
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(num_keys, num_keys_second_batch);
+
+      ASSERT_GT(buff_prefetch_count, 0);
-      // Second Seek.
+      // Check stats to make sure async prefetch is done.
       {
-        num_keys = 0;
-        ASSERT_OK(options.statistics->Reset());
-        get_perf_context()->Reset();
-
-        iter->Seek(BuildKey(942));
-        while (iter->Valid()) {
-          ASSERT_OK(iter->status());
-          num_keys++;
-          iter->Next();
-        }
-        ASSERT_OK(iter->status());
-        ASSERT_EQ(num_keys, num_keys_second_batch);
-
-        ASSERT_GT(buff_prefetch_count, 0);
-
-        // Check stats to make sure async prefetch is done.
-        {
-          HistogramData async_read_bytes;
-          options.statistics->histogramData(ASYNC_READ_BYTES,
-                                            &async_read_bytes);
-          HistogramData prefetched_bytes_discarded;
-          options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
-                                            &prefetched_bytes_discarded);
-
-          // Not all platforms support iouring. In that case, ReadAsync in posix
-          // won't submit async requests.
-          if (read_async_called) {
-            ASSERT_GT(async_read_bytes.count, 0);
-            ASSERT_GT(get_perf_context()->number_async_seek, 0);
-          } else {
-            ASSERT_EQ(async_read_bytes.count, 0);
-            ASSERT_EQ(get_perf_context()->number_async_seek, 0);
-          }
-          ASSERT_GT(prefetched_bytes_discarded.count, 0);
+        HistogramData async_read_bytes;
+        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+        HistogramData prefetched_bytes_discarded;
+        options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
+                                          &prefetched_bytes_discarded);
+
+        // Not all platforms support iouring. In that case, ReadAsync in posix
+        // won't submit async requests.
+        if (read_async_called) {
+          ASSERT_GT(async_read_bytes.count, 0);
+          ASSERT_GT(get_perf_context()->number_async_seek, 0);
+        } else {
+          ASSERT_EQ(async_read_bytes.count, 0);
+          ASSERT_EQ(get_perf_context()->number_async_seek, 0);
         }
+        ASSERT_GT(prefetched_bytes_discarded.count, 0);
       }
     }
-
-    SyncPoint::GetInstance()->DisableProcessing();
-    SyncPoint::GetInstance()->ClearAllCallBacks();
-    Close();
   }
-  TEST_P(PrefetchTest, SeekParallelizationTest1) {
-    if (mem_env_ || encrypted_env_) {
-      ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
-      return;
-    }
-    const int kNumKeys = 2000;
-    // Set options
-    std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
-        FileSystem::Default(), /*support_prefetch=*/false);
-    std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
-
-    bool use_direct_io = std::get<0>(GetParam());
-    Options options = CurrentOptions();
-    options.write_buffer_size = 1024;
-    options.create_if_missing = true;
-    options.compression = kNoCompression;
-    options.env = env.get();
-    if (use_direct_io) {
-      options.use_direct_reads = true;
-      options.use_direct_io_for_flush_and_compaction = true;
-    }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
-    options.statistics = CreateDBStatistics();
-    BlockBasedTableOptions table_options;
-    table_options.no_block_cache = true;
-    table_options.cache_index_and_filter_blocks = false;
-    table_options.metadata_block_size = 1024;
-    table_options.index_type =
-        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+// This test verifies implementation of seek parallelization with
+// PosixFileSystem during prefetching.
+TEST_P(PrefetchTest, SeekParallelizationTestWithPosix) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  const int kNumKeys = 2000;
+  // Set options
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
-    Status s = TryReopen(options);
-    if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
-      // If direct IO is not supported, skip the test
-      return;
-    } else {
-      ASSERT_OK(s);
-    }
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options;
+  SetGenericOptions(env.get(), use_direct_io, options);
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    WriteBatch batch;
-    Random rnd(309);
-    for (int i = 0; i < kNumKeys; i++) {
-      ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
-    }
-    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
-    std::string start_key = BuildKey(0);
-    std::string end_key = BuildKey(kNumKeys - 1);
-    Slice least(start_key.data(), start_key.size());
-    Slice greatest(end_key.data(), end_key.size());
+  WriteBatch batch;
+  Random rnd(309);
+  for (int i = 0; i < kNumKeys; i++) {
+    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(db_->Write(WriteOptions(), &batch));
-    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
-    int buff_prefetch_count = 0;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
-    SyncPoint::GetInstance()->SetCallBack(
-        "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
-        [&](void*) { buff_prefetch_count++; });
+  int buff_prefetch_count = 0;
-    bool read_async_called = false;
-    SyncPoint::GetInstance()->SetCallBack(
-        "UpdateResults::io_uring_result",
-        [&](void* /*arg*/) { read_async_called = true; });
-    SyncPoint::GetInstance()->EnableProcessing();
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
-    SyncPoint::GetInstance()->EnableProcessing();
-    ReadOptions ro;
-    ro.adaptive_readahead = true;
-    ro.async_io = true;
+  bool read_async_called = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
-    if (std::get<1>(GetParam())) {
-      ro.readahead_size = 16 * 1024;
-    }
+  SyncPoint::GetInstance()->EnableProcessing();
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
-    {
-      ASSERT_OK(options.statistics->Reset());
-      // Each block contains around 4 keys.
-      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
-      iter->Seek(
-          BuildKey(0));  // Prefetch data because of seek parallelization.
-      ASSERT_TRUE(iter->Valid());
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
-      // New data block. Since num_file_reads in FilePrefetch after this read is
-      // 2, it won't go for prefetching.
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
+  {
+    ASSERT_OK(options.statistics->Reset());
+    // Each block contains around 4 keys.
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    iter->Seek(BuildKey(0));  // Prefetch data because of seek parallelization.
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
-      // Prefetch data.
-      iter->Next();
-      ASSERT_TRUE(iter->Valid());
+    // New data block. Since num_file_reads in FilePrefetch after this read is
+    // 2, it won't go for prefetching.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
-      // Check stats to make sure async prefetch is done.
-      {
-        HistogramData async_read_bytes;
-        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
-        // Not all platforms support iouring. In that case, ReadAsync in posix
-        // won't submit async requests.
-        if (read_async_called) {
-          ASSERT_GT(async_read_bytes.count, 0);
-          ASSERT_GT(get_perf_context()->number_async_seek, 0);
-          if (std::get<1>(GetParam())) {
-            ASSERT_EQ(buff_prefetch_count, 1);
-          } else {
-            ASSERT_EQ(buff_prefetch_count, 2);
-          }
-        } else {
-          ASSERT_EQ(async_read_bytes.count, 0);
-          ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+    // Prefetch data.
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+
+    // Check stats to make sure async prefetch is done.
+    {
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      // Not all platforms support iouring. In that case, ReadAsync in posix
+      // won't submit async requests.
+      if (read_async_called) {
+        ASSERT_GT(async_read_bytes.count, 0);
+        ASSERT_GT(get_perf_context()->number_async_seek, 0);
+        if (std::get<1>(GetParam())) {
          ASSERT_EQ(buff_prefetch_count, 1);
+        } else {
+          ASSERT_EQ(buff_prefetch_count, 2);
        }
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
+        ASSERT_EQ(get_perf_context()->number_async_seek, 0);
+        ASSERT_EQ(buff_prefetch_count, 1);
+      }
     }
-
-      buff_prefetch_count = 0;
-    }
-    Close();
+
+    buff_prefetch_count = 0;
   }
+  Close();
+}
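A rough worked example of the `buff_prefetch_count` expectations in the seek-parallelization test above (illustrative arithmetic derived from the test's own constants, not an authoritative model of FilePrefetchBuffer's internals):

// Values are ~1000 bytes and metadata_block_size is 1024, so a data block
// holds roughly 4 keys ("Each block contains around 4 keys"). The loop
// touches 9 keys, i.e. about 3 data blocks of data.
constexpr int kKeysTouched = 9;
constexpr int kKeysPerBlock = 4;  // approximate, per the test comment
constexpr int kBlocksTouched =
    (kKeysTouched + kKeysPerBlock - 1) / kKeysPerBlock;
static_assert(kBlocksTouched == 3, "scan spans about three data blocks");
// A fixed ro.readahead_size of 16 KB can cover that span with a single async
// prefetch (buff_prefetch_count == 1), while the adaptive ramp starts smaller
// and needs one more round (buff_prefetch_count == 2).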
 #ifndef ROCKSDB_LITE
 #ifdef GFLAGS
-  TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) {
-    if (mem_env_ || encrypted_env_) {
-      ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
-      return;
-    }
+// This test verifies io_tracing with PosixFileSystem during prefetching.
+TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
-    const int kNumKeys = 1000;
-    std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
-        FileSystem::Default(), /*support_prefetch=*/false);
-    std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  const int kNumKeys = 1000;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
-    bool use_direct_io = std::get<0>(GetParam());
-    Options options = CurrentOptions();
-    options.write_buffer_size = 1024;
-    options.create_if_missing = true;
-    options.compression = kNoCompression;
-    options.env = env.get();
-    options.statistics = CreateDBStatistics();
-    if (use_direct_io) {
-      options.use_direct_reads = true;
-      options.use_direct_io_for_flush_and_compaction = true;
+  bool use_direct_io = std::get<0>(GetParam());
+  Options options;
+  SetGenericOptions(env.get(), use_direct_io, options);
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  int total_keys = 0;
+  // Write the keys.
+  {
+    WriteBatch batch;
+    Random rnd(309);
+    for (int j = 0; j < 5; j++) {
+      for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
+        ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
+        total_keys++;
+      }
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+      ASSERT_OK(Flush());
     }
-    BlockBasedTableOptions table_options;
-    table_options.no_block_cache = true;
-    table_options.cache_index_and_filter_blocks = false;
-    table_options.metadata_block_size = 1024;
-    table_options.index_type =
-        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    MoveFilesToLevel(2);
+  }
-    Status s = TryReopen(options);
-    if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
-      // If direct IO is not supported, skip the test
-      return;
-    } else {
-      ASSERT_OK(s);
+  int buff_prefetch_count = 0;
+  bool read_async_called = false;
+  ReadOptions ro;
+  ro.adaptive_readahead = true;
+  ro.async_io = true;
+
+  if (std::get<1>(GetParam())) {
+    ro.readahead_size = 16 * 1024;
+  }
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
+      [&](void*) { buff_prefetch_count++; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Read the keys.
+  {
+    // Start io_tracing.
+    WriteOptions write_opt;
+    TraceOptions trace_opt;
+    std::unique_ptr<TraceWriter> trace_writer;
+    std::string trace_file_path = dbname_ + "/io_trace_file";
+
+    ASSERT_OK(
+        NewFileTraceWriter(env_, EnvOptions(), trace_file_path, &trace_writer));
+    ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
+    ASSERT_OK(options.statistics->Reset());
+
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    int num_keys = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      num_keys++;
    }
-    int total_keys = 0;
-    // Write the keys.
+    // End the tracing.
+    ASSERT_OK(db_->EndIOTrace());
+    ASSERT_OK(env_->FileExists(trace_file_path));
+
+    ASSERT_EQ(num_keys, total_keys);
+    ASSERT_GT(buff_prefetch_count, 0);
+
+    // Check stats to make sure async prefetch is done.
     {
-      WriteBatch batch;
-      Random rnd(309);
-      for (int j = 0; j < 5; j++) {
-        for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
-          ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
-          total_keys++;
-        }
-        ASSERT_OK(db_->Write(WriteOptions(), &batch));
-        ASSERT_OK(Flush());
+      HistogramData async_read_bytes;
+      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
+      // Not all platforms support iouring. In that case, ReadAsync in posix
+      // won't submit async requests.
+      if (read_async_called) {
+        ASSERT_GT(async_read_bytes.count, 0);
+      } else {
+        ASSERT_EQ(async_read_bytes.count, 0);
       }
     }
-      MoveFilesToLevel(2);
-    }
-    int buff_prefetch_count = 0;
-    bool read_async_called = false;
-    ReadOptions ro;
-    ro.adaptive_readahead = true;
-    ro.async_io = true;
+    // Check the file to see if ReadAsync is logged.
+    RunIOTracerParserTool(trace_file_path);
+  }
-    if (std::get<1>(GetParam())) {
-      ro.readahead_size = 16 * 1024;
-    }
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  Close();
+}
+#endif  // GFLAGS
+
+class FilePrefetchBufferTest : public testing::Test {
+ public:
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    env_ = Env::Default();
+    fs_ = FileSystem::Default();
+    test_dir_ = test::PerThreadDBPath("file_prefetch_buffer_test");
+    ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+    stats_ = CreateDBStatistics();
+  }
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  void Write(const std::string& fname, const std::string& content) {
+    std::unique_ptr<FSWritableFile> f;
+    ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr));
+    ASSERT_OK(f->Append(content, IOOptions(), nullptr));
+    ASSERT_OK(f->Close(IOOptions(), nullptr));
+  }
+
+  void Read(const std::string& fname, const FileOptions& opts,
+            std::unique_ptr<RandomAccessFileReader>* reader) {
+    std::string fpath = Path(fname);
+    std::unique_ptr<FSRandomAccessFile> f;
+    ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(
+        std::move(f), fpath, env_->GetSystemClock().get(),
+        /*io_tracer=*/nullptr, stats_.get()));
+  }
+
+  void AssertResult(const std::string& content,
+                    const std::vector<FSReadRequest>& reqs) {
+    for (const auto& r : reqs) {
+      ASSERT_OK(r.status);
+      ASSERT_EQ(r.len, r.result.size());
+      ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString());
+    }
+  }
-    SyncPoint::GetInstance()->SetCallBack(
-        "FilePrefetchBuffer::PrefetchAsyncInternal:Start",
-        [&](void*) { buff_prefetch_count++; });
+  FileSystem* fs() { return fs_.get(); }
+  Statistics* stats() { return stats_.get(); }
-    SyncPoint::GetInstance()->SetCallBack(
-        "UpdateResults::io_uring_result",
-        [&](void* /*arg*/) { read_async_called = true; });
-    SyncPoint::GetInstance()->EnableProcessing();
+ private:
+  Env* env_;
+  std::shared_ptr<FileSystem> fs_;
+  std::string test_dir_;
+  std::shared_ptr<Statistics> stats_;
-    // Read the keys.
-    {
-      // Start io_tracing.
-      WriteOptions write_opt;
-      TraceOptions trace_opt;
-      std::unique_ptr<TraceWriter> trace_writer;
-      std::string trace_file_path = dbname_ + "/io_trace_file";
-
-      ASSERT_OK(NewFileTraceWriter(env_, EnvOptions(), trace_file_path,
-                                   &trace_writer));
-      ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
-      ASSERT_OK(options.statistics->Reset());
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
-      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
-      int num_keys = 0;
-      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-        ASSERT_OK(iter->status());
-        num_keys++;
-      }
+TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) {
+  std::string fname = "seek-with-block-cache-hit";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
+
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+
+  FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, fs());
+  Slice result;
+  // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
+  // it will do two reads of 4096+8192 and 8192
+  Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
+  // Platforms that don't have IO uring may not support async IO
+  ASSERT_TRUE(s.IsTryAgain() || s.IsNotSupported());
+  // Simulate a block cache hit
+  fpb.UpdateReadPattern(0, 4096, false);
+  // Now read some data that straddles the two prefetch buffers - offset 8192 to
+  // 16384
+  ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), 8192, 8192,
+                                        &result, &s, Env::IOPriority::IO_LOW));
+}
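The size comments in this test and in the NoSyncWithAsyncIO test that follows ("two reads of 4096+8192 and 8192", "two reads of 4000+4096 and 4096") are both consistent with the readahead being split evenly across the buffer pair used for async prefetch. A sketch of that arithmetic (an inference from the test comments, not the library's internal formula):

#include <cstddef>
#include <utility>

// First async read covers the requested n bytes plus half the readahead;
// the second read covers the remaining half (inferred from the comments).
std::pair<size_t, size_t> ExpectedAsyncReadSizes(size_t n, size_t readahead) {
  return {n + readahead / 2, readahead / 2};
}
// SeekWithBlockCacheHit: n = 4096, readahead = 16384 -> {12288, 8192}
// NoSyncWithAsyncIO:     n = 4000, readahead = 8192  -> { 8096, 4096}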
-      // End the tracing.
-      ASSERT_OK(db_->EndIOTrace());
-      ASSERT_OK(env_->FileExists(trace_file_path));
+TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) {
+  std::string fname = "seek-with-block-cache-hit";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
-      ASSERT_EQ(num_keys, total_keys);
-      ASSERT_GT(buff_prefetch_count, 0);
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
-      // Check stats to make sure async prefetch is done.
-      {
-        HistogramData async_read_bytes;
-        options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
-        // Not all platforms support iouring. In that case, ReadAsync in posix
-        // won't submit async requests.
-        if (read_async_called) {
-          ASSERT_GT(async_read_bytes.count, 0);
-        } else {
-          ASSERT_EQ(async_read_bytes.count, 0);
-        }
-      }
+  FilePrefetchBuffer fpb(
+      /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true,
+      /*track_min_offset=*/false, /*implicit_auto_readahead=*/false,
+      /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, fs());
-      // Check the file to see if ReadAsync is logged.
-      RunIOTracerParserTool(trace_file_path);
-    }
+  int read_async_called = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::ReadAsync",
+      [&](void* /*arg*/) { read_async_called++; });
+  SyncPoint::GetInstance()->EnableProcessing();
-    SyncPoint::GetInstance()->DisableProcessing();
-    SyncPoint::GetInstance()->ClearAllCallBacks();
+  Slice async_result;
+  // Simulate a seek of 4000 bytes at offset 3000. Due to the readahead
+  // settings, it will do two reads of 4000+4096 and 4096
+  Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 3000, 4000, &async_result);
+  // Platforms that don't have IO uring may not support async IO
+  ASSERT_TRUE(s.IsTryAgain() || s.IsNotSupported());
+
+  ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), /*offset=*/3000,
+                                        /*length=*/4000, &async_result, &s,
+                                        Env::IOPriority::IO_LOW));
+  // No sync call should be made.
+  HistogramData sst_read_micros;
+  stats()->histogramData(SST_READ_MICROS, &sst_read_micros);
+  ASSERT_EQ(sst_read_micros.count, 0);
+
+  // Number of async calls should be 2.
+  ASSERT_EQ(read_async_called, 2);
+  // Length should be 4000.
+  ASSERT_EQ(async_result.size(), 4000);
+  // Data correctness.
+  Slice result(content.c_str() + 3000, 4000);
+  ASSERT_EQ(result.size(), 4000);
+  ASSERT_EQ(result, async_result);
+}
-    Close();
-  }
-#endif  // GFLAGS
 #endif  // ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index 20e933924bd..030cd8d07a2 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -248,7 +248,7 @@ size_t End(const FSReadRequest& r) {
 FSReadRequest Align(const FSReadRequest& r, size_t alignment) {
   FSReadRequest req;
   req.offset = static_cast<uint64_t>(
-      TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
+      TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
   req.len = Roundup(End(r), alignment) - req.offset;
   req.scratch = nullptr;
   return req;
diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc
index 0f54026869a..ac0e9e57a1f 100644
--- a/file/random_access_file_reader_test.cc
+++ b/file/random_access_file_reader_test.cc
@@ -60,9 +60,7 @@ class RandomAccessFileReaderTest : public testing::Test {
   std::shared_ptr<FileSystem> fs_;
   std::string test_dir_;
 
-  std::string Path(const std::string& fname) {
-    return test_dir_ + "/" + fname;
-  }
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
 };
 
 // Skip the following tests in lite mode since direct I/O is unsupported.
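The `Align()` hunk above rounds a read request outward to alignment boundaries: the offset is truncated down to a page boundary and the end is rounded up. A self-contained sketch of that math (helper names mirror RocksDB's, but this is an illustration, not the library code):

#include <cstddef>
#include <cstdint>

// Round s down to a multiple of page_size (page_size must be a power of two).
size_t TruncateToPageBoundary(size_t page_size, size_t s) {
  return s - (s & (page_size - 1));
}
// Round x up to a multiple of y.
size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

struct ReadRequest {
  uint64_t offset;
  size_t len;
};

// The aligned request covers [rounddown(offset), roundup(offset + len)).
ReadRequest Align(const ReadRequest& r, size_t alignment) {
  size_t offset = static_cast<size_t>(r.offset);
  ReadRequest req;
  req.offset = TruncateToPageBoundary(alignment, offset);
  req.len = Roundup(offset + r.len, alignment) - req.offset;
  return req;
}
// e.g. alignment 4096, {offset=5000, len=2000}: the span [5000, 7000)
// expands to [4096, 8192), i.e. {offset=4096, len=4096}.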
diff --git a/file/read_write_util.cc b/file/read_write_util.cc
index cc4f6b84974..3617a35e330 100644
--- a/file/read_write_util.cc
+++ b/file/read_write_util.cc
@@ -10,6 +10,7 @@
 #include "file/read_write_util.h"
 
 #include <sstream>
+
 #include "test_util/sync_point.h"
 
 namespace ROCKSDB_NAMESPACE {
diff --git a/file/read_write_util.h b/file/read_write_util.h
index 718135c9885..9f034b705f1 100644
--- a/file/read_write_util.h
+++ b/file/read_write_util.h
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <atomic>
+
 #include "file/sequence_file_reader.h"
 #include "rocksdb/env.h"
 #include "rocksdb/file_system.h"
diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc
index c4c4114880f..7053e6a0738 100644
--- a/file/sst_file_manager_impl.cc
+++ b/file/sst_file_manager_impl.cc
@@ -161,9 +161,8 @@ bool SstFileManagerImpl::EnoughRoomForCompaction(
 
   // Update cur_compactions_reserved_size_ so concurrent compaction
   // don't max out space
-  size_t needed_headroom =
-      cur_compactions_reserved_size_ + size_added_by_compaction +
-      compaction_buffer_size_;
+  size_t needed_headroom = cur_compactions_reserved_size_ +
+                           size_added_by_compaction + compaction_buffer_size_;
   if (max_allowed_space_ != 0 &&
       (needed_headroom + total_files_size_ > max_allowed_space_)) {
     return false;
@@ -415,13 +414,12 @@ bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) {
   return false;
 }
 
-Status SstFileManagerImpl::ScheduleFileDeletion(
-    const std::string& file_path, const std::string& path_to_sync,
-    const bool force_bg) {
+Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
+                                                const std::string& path_to_sync,
+                                                const bool force_bg) {
   TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::ScheduleFileDeletion",
                            const_cast<std::string*>(&file_path));
-  return delete_scheduler_.DeleteFile(file_path, path_to_sync,
-                                      force_bg);
+  return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
 }
 
 void SstFileManagerImpl::WaitForEmptyTrash() {
diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h
index 066c75e15bf..b21b47b8684 100644
--- a/file/sst_file_manager_impl.h
+++ b/file/sst_file_manager_impl.h
@@ -9,10 +9,9 @@
 
 #include <string>
 
-#include "port/port.h"
-
 #include "db/compaction/compaction.h"
 #include "file/delete_scheduler.h"
+#include "port/port.h"
 #include "rocksdb/sst_file_manager.h"
 
 namespace ROCKSDB_NAMESPACE {
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 5cd52223f55..258cf82a105 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -240,10 +240,10 @@ enum class CacheTier : uint8_t {
   kNonVolatileBlockTier = 0x01,
 };
 
-enum UpdateStatus {    // Return status For inplace update callback
-  UPDATE_FAILED   = 0, // Nothing to update
-  UPDATED_INPLACE = 1, // Value updated inplace
-  UPDATED         = 2, // No inplace update. Merged value set
+enum UpdateStatus {     // Return status For inplace update callback
+  UPDATE_FAILED = 0,    // Nothing to update
+  UPDATED_INPLACE = 1,  // Value updated inplace
+  UPDATED = 2,          // No inplace update. Merged value set
 };
 
 enum class PrepopulateBlobCache : uint8_t {
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 2db9d406edf..11e4d268688 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -69,9 +69,9 @@ extern "C" {
 
 /* Exported types */
 
-typedef struct rocksdb_t                 rocksdb_t;
-typedef struct rocksdb_backup_engine_t   rocksdb_backup_engine_t;
-typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
+typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
+typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
 typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
 typedef struct rocksdb_restore_options_t rocksdb_restore_options_t;
 typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t;
@@ -82,43 +82,45 @@ typedef struct rocksdb_compactionfiltercontext_t
     rocksdb_compactionfiltercontext_t;
 typedef struct rocksdb_compactionfilterfactory_t
     rocksdb_compactionfilterfactory_t;
-typedef struct rocksdb_comparator_t      rocksdb_comparator_t;
-typedef struct rocksdb_dbpath_t          rocksdb_dbpath_t;
-typedef struct rocksdb_env_t             rocksdb_env_t;
-typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t;
-typedef struct rocksdb_filelock_t        rocksdb_filelock_t;
-typedef struct rocksdb_filterpolicy_t    rocksdb_filterpolicy_t;
-typedef struct rocksdb_flushoptions_t    rocksdb_flushoptions_t;
-typedef struct rocksdb_iterator_t        rocksdb_iterator_t;
-typedef struct rocksdb_logger_t          rocksdb_logger_t;
-typedef struct rocksdb_mergeoperator_t   rocksdb_mergeoperator_t;
-typedef struct rocksdb_options_t         rocksdb_options_t;
+typedef struct rocksdb_comparator_t rocksdb_comparator_t;
+typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
+typedef struct rocksdb_env_t rocksdb_env_t;
+typedef struct rocksdb_fifo_compaction_options_t
+    rocksdb_fifo_compaction_options_t;
+typedef struct rocksdb_filelock_t rocksdb_filelock_t;
+typedef struct rocksdb_filterpolicy_t rocksdb_filterpolicy_t;
+typedef struct rocksdb_flushoptions_t rocksdb_flushoptions_t;
+typedef struct rocksdb_iterator_t rocksdb_iterator_t;
+typedef struct rocksdb_logger_t rocksdb_logger_t;
+typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
+typedef struct rocksdb_options_t rocksdb_options_t;
 typedef struct rocksdb_compactoptions_t rocksdb_compactoptions_t;
 typedef struct rocksdb_block_based_table_options_t
     rocksdb_block_based_table_options_t;
-typedef struct rocksdb_cuckoo_table_options_t
-    rocksdb_cuckoo_table_options_t;
-typedef struct rocksdb_randomfile_t      rocksdb_randomfile_t;
-typedef struct rocksdb_readoptions_t     rocksdb_readoptions_t;
-typedef struct rocksdb_seqfile_t         rocksdb_seqfile_t;
-typedef struct rocksdb_slicetransform_t  rocksdb_slicetransform_t;
-typedef struct rocksdb_snapshot_t        rocksdb_snapshot_t;
-typedef struct rocksdb_writablefile_t    rocksdb_writablefile_t;
-typedef struct rocksdb_writebatch_t      rocksdb_writebatch_t;
-typedef struct rocksdb_writebatch_wi_t   rocksdb_writebatch_wi_t;
-typedef struct rocksdb_writeoptions_t    rocksdb_writeoptions_t;
-typedef struct rocksdb_universal_compaction_options_t rocksdb_universal_compaction_options_t;
-typedef struct rocksdb_livefiles_t       rocksdb_livefiles_t;
+typedef struct rocksdb_cuckoo_table_options_t rocksdb_cuckoo_table_options_t;
+typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
+typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
+typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
+typedef struct rocksdb_slicetransform_t rocksdb_slicetransform_t;
+typedef struct rocksdb_snapshot_t rocksdb_snapshot_t;
+typedef struct rocksdb_writablefile_t rocksdb_writablefile_t;
+typedef struct rocksdb_writebatch_t rocksdb_writebatch_t;
+typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t;
+typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
+typedef struct rocksdb_universal_compaction_options_t
+    rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
 typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
 typedef struct rocksdb_column_family_metadata_t
     rocksdb_column_family_metadata_t;
 typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t;
 typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t;
-typedef struct rocksdb_envoptions_t      rocksdb_envoptions_t;
-typedef struct rocksdb_ingestexternalfileoptions_t rocksdb_ingestexternalfileoptions_t;
-typedef struct rocksdb_sstfilewriter_t   rocksdb_sstfilewriter_t;
-typedef struct rocksdb_ratelimiter_t     rocksdb_ratelimiter_t;
-typedef struct rocksdb_perfcontext_t     rocksdb_perfcontext_t;
+typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
+typedef struct rocksdb_ingestexternalfileoptions_t
+    rocksdb_ingestexternalfileoptions_t;
+typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t;
+typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t;
+typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t;
 typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t;
 typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t;
 typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t;
@@ -161,8 +163,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup(
     rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr);
 
 extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush(
-    rocksdb_backup_engine_t* be, rocksdb_t* db, unsigned char flush_before_backup,
-    char** errptr);
+    rocksdb_backup_engine_t* be, rocksdb_t* db,
+    unsigned char flush_before_backup, char** errptr);
 
 extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups(
     rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr);
@@ -174,9 +176,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy(
 extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files(
     rocksdb_restore_options_t* opt, int v);
 
-extern ROCKSDB_LIBRARY_API void
-rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
-                                    uint32_t backup_id, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_verify_backup(
+    rocksdb_backup_engine_t* be, uint32_t backup_id, char** errptr);
 
 extern ROCKSDB_LIBRARY_API void
 rocksdb_backup_engine_restore_db_from_latest_backup(
@@ -194,17 +195,14 @@ rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be);
 extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count(
     const rocksdb_backup_engine_info_t* info);
 
-extern ROCKSDB_LIBRARY_API int64_t
-rocksdb_backup_engine_info_timestamp(const rocksdb_backup_engine_info_t* info,
-                                     int index);
+extern ROCKSDB_LIBRARY_API int64_t rocksdb_backup_engine_info_timestamp(
+    const rocksdb_backup_engine_info_t* info, int index);
 
-extern ROCKSDB_LIBRARY_API uint32_t
-rocksdb_backup_engine_info_backup_id(const rocksdb_backup_engine_info_t* info,
-                                     int index);
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_backup_id(
+    const rocksdb_backup_engine_info_t* info, int index);
 
-extern ROCKSDB_LIBRARY_API uint64_t
-rocksdb_backup_engine_info_size(const rocksdb_backup_engine_info_t* info,
-                                int index);
+extern ROCKSDB_LIBRARY_API uint64_t rocksdb_backup_engine_info_size(
+    const rocksdb_backup_engine_info_t* info, int index);
 
 extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files(
     const rocksdb_backup_engine_info_t* info, int index);
@@ -576,17 +574,15 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator(
     rocksdb_t* db, const rocksdb_readoptions_t* options);
 
 extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since(
-    rocksdb_t* db, uint64_t seq_number,
-    const rocksdb_wal_readoptions_t* options,
-    char** errptr
-);
+    rocksdb_t* db, uint64_t seq_number,
+    const rocksdb_wal_readoptions_t* options, char** errptr);
 
 extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf(
     rocksdb_t* db, const rocksdb_readoptions_t* options,
     rocksdb_column_family_handle_t* column_family);
 
 extern ROCKSDB_LIBRARY_API void rocksdb_create_iterators(
-    rocksdb_t *db, rocksdb_readoptions_t* opts,
+    rocksdb_t* db, rocksdb_readoptions_t* opts,
     rocksdb_column_family_handle_t** column_families,
     rocksdb_iterator_t** iterators, size_t size, char** errptr);
 
@@ -601,14 +597,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot(
 extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db,
                                                         const char* propname);
 /* returns 0 on success, -1 otherwise */
-int rocksdb_property_int(
-    rocksdb_t* db,
-    const char* propname, uint64_t *out_val);
+int rocksdb_property_int(rocksdb_t* db, const char* propname,
+                         uint64_t* out_val);
 
 /* returns 0 on success, -1 otherwise */
-int rocksdb_property_int_cf(
-    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
-    const char* propname, uint64_t *out_val);
+int rocksdb_property_int_cf(rocksdb_t* db,
+                            rocksdb_column_family_handle_t* column_family,
+                            const char* propname, uint64_t* out_val);
 
 extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
     rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
@@ -708,13 +703,18 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp(
 extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
     const rocksdb_iterator_t*, char** errptr);
 
-extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(
+    rocksdb_wal_iterator_t* iter);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid(
-    const rocksdb_wal_iterator_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status (const rocksdb_wal_iterator_t* iter, char** errptr) ;
-extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) ;
-extern ROCKSDB_LIBRARY_API uint64_t rocksdb_get_latest_sequence_number (rocksdb_t *db);
-extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) ;
+    const rocksdb_wal_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status(
+    const rocksdb_wal_iterator_t* iter, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+    const rocksdb_wal_iterator_t* iter, uint64_t* seq);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_get_latest_sequence_number(rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy(
+    const rocksdb_wal_iterator_t* iter);
 
 /* Write batch */
 
@@ -820,20 +820,20 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_pop_save_point(
 
 /* Write batch with index */
 
-extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
-    size_t reserved_bytes,
-    unsigned char overwrite_keys);
-extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create_from(
-    const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_writebatch_wi_create(size_t reserved_bytes,
+                             unsigned char overwrite_keys);
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_wi_t*
+rocksdb_writebatch_wi_create_from(const char* rep, size_t size);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_destroy(
     rocksdb_writebatch_wi_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t*);
-extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b);
-extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t*,
-                                                          const char* key,
-                                                          size_t klen,
-                                                          const char* val,
-                                                          size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_clear(
+    rocksdb_writebatch_wi_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_wi_count(
+    rocksdb_writebatch_wi_t* b);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val,
+    size_t vlen);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_cf(
     rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
     const char* key, size_t klen, const char* val, size_t vlen);
@@ -846,11 +846,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_putv_cf(
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
    int num_values, const char* const* values_list,
    const size_t* values_list_sizes);
-extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t*,
-                                                            const char* key,
-                                                            size_t klen,
-                                                            const char* val,
-                                                            size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen, const char* val,
+    size_t vlen);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_merge_cf(
     rocksdb_writebatch_wi_t*, rocksdb_column_family_handle_t* column_family,
     const char* key, size_t klen, const char* val, size_t vlen);
@@ -863,9 +861,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_mergev_cf(
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
    int num_values, const char* const* values_list,
    const size_t* values_list_sizes);
-extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t*,
-                                                             const char* key,
-                                                             size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete(
+    rocksdb_writebatch_wi_t*, const char* key, size_t klen);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_singledelete(
     rocksdb_writebatch_wi_t*, const char* key, size_t klen);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_cf(
@@ -891,9 +888,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_range_cf(
     size_t end_key_len);
 // DO NOT USE - rocksdb_writebatch_wi_delete_rangev is not yet supported
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev(
-    rocksdb_writebatch_wi_t* b, int num_keys, const char* const* start_keys_list,
-    const size_t* start_keys_list_sizes, const char* const* end_keys_list,
-    const size_t* end_keys_list_sizes);
+    rocksdb_writebatch_wi_t* b, int num_keys,
+    const char* const* start_keys_list, const size_t* start_keys_list_sizes,
+    const char* const* end_keys_list, const size_t* end_keys_list_sizes);
 // DO NOT USE - rocksdb_writebatch_wi_delete_rangev_cf is not yet supported
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf(
     rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
@@ -903,56 +900,40 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_delete_rangev_cf(
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_put_log_data(
     rocksdb_writebatch_wi_t*, const char* blob, size_t len);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_iterate(
-    rocksdb_writebatch_wi_t* b,
-    void* state,
+    rocksdb_writebatch_wi_t* b, void* state,
     void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
     void (*deleted)(void*, const char* k, size_t klen));
 extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_wi_data(
-    rocksdb_writebatch_wi_t* b,
-    size_t* size);
+    rocksdb_writebatch_wi_t* b, size_t* size);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_set_save_point(
     rocksdb_writebatch_wi_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_rollback_to_save_point(
     rocksdb_writebatch_wi_t*, char** errptr);
 extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch(
-    rocksdb_writebatch_wi_t* wbwi,
-    const rocksdb_options_t* options,
-    const char* key, size_t keylen,
-    size_t* vallen,
-    char** errptr);
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    const char* key, size_t keylen, size_t* vallen, char** errptr);
 extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_cf(
-    rocksdb_writebatch_wi_t* wbwi,
-    const rocksdb_options_t* options,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t keylen,
-    size_t* vallen,
-    char** errptr);
+    rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
 extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
-    rocksdb_writebatch_wi_t* wbwi,
-    rocksdb_t* db,
-    const rocksdb_readoptions_t* options,
-    const char* key, size_t keylen,
-    size_t* vallen,
-    char** errptr);
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    size_t* vallen, char** errptr);
 extern ROCKSDB_LIBRARY_API char*
 rocksdb_writebatch_wi_get_from_batch_and_db_cf(
-    rocksdb_writebatch_wi_t* wbwi,
-    rocksdb_t* db,
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t keylen,
-    size_t* vallen,
-    char** errptr);
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
 extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    rocksdb_writebatch_wi_t* wbwi,
-    char** errptr);
-extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
-    rocksdb_writebatch_wi_t* wbwi,
-    rocksdb_iterator_t* base_iterator);
-extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
-    rocksdb_writebatch_wi_t* wbwi,
-    rocksdb_iterator_t* base_iterator,
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_wi_t* wbwi, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
     rocksdb_column_family_handle_t* cf);
 
 /* Options utils */
@@ -995,16 +976,23 @@ rocksdb_block_based_options_set_block_restart_interval(
     rocksdb_block_based_table_options_t* options, int block_restart_interval);
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_index_block_restart_interval(
-    rocksdb_block_based_table_options_t* options, int index_block_restart_interval);
+    rocksdb_block_based_table_options_t* options,
+    int index_block_restart_interval);
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_metadata_block_size(
     rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size);
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_partition_filters(
-    rocksdb_block_based_table_options_t* options, unsigned char partition_filters);
+    rocksdb_block_based_table_options_t* options,
+    unsigned char partition_filters);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_optimize_filters_for_memory(
+    rocksdb_block_based_table_options_t* options,
+    unsigned char optimize_filters_for_memory);
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_use_delta_encoding(
-    rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding);
+    rocksdb_block_based_table_options_t* options,
+    unsigned char use_delta_encoding);
 extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy(
     rocksdb_block_based_table_options_t* options,
     rocksdb_filterpolicy_t* filter_policy);
@@ -1032,9 +1020,11 @@ enum {
   rocksdb_block_based_table_data_block_index_type_binary_search = 0,
   rocksdb_block_based_table_data_block_index_type_binary_search_and_hash = 1,
 };
-extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_data_block_index_type(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_index_type(
     rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
-extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_data_block_hash_ratio(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_data_block_hash_ratio(
     rocksdb_block_based_table_options_t* options, double v);
 // rocksdb_block_based_options_set_hash_index_allow_collision()
 // is removed since BlockBasedTableOptions.hash_index_allow_collision()
@@ -1075,11 +1065,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory(
     rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options);
 
 /* Options */
-extern ROCKSDB_LIBRARY_API void rocksdb_set_options(
-    rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_set_options(rocksdb_t* db, int count,
+                                                    const char* const keys[],
+                                                    const char* const values[],
+                                                    char** errptr);
 
 extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf(
-    rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr);
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count,
+    const char* const keys[], const char* const values[], char** errptr);
 
 extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*);
@@ -1094,9 +1087,8 @@ extern ROCKSDB_LIBRARY_API void
 rocksdb_options_optimize_level_style_compaction(
     rocksdb_options_t* opt, uint64_t memtable_memory_budget);
 extern ROCKSDB_LIBRARY_API void
 rocksdb_options_optimize_universal_style_compaction(
     rocksdb_options_t* opt, uint64_t memtable_memory_budget);
-extern ROCKSDB_LIBRARY_API void
-rocksdb_options_set_allow_ingest_behind(rocksdb_options_t*,
-                                        unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_ingest_behind(
+    rocksdb_options_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char
 rocksdb_options_get_allow_ingest_behind(rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter(
@@ -1132,9 +1124,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks(
     rocksdb_options_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks(
     rocksdb_options_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*,
-                                                             const rocksdb_dbpath_t** path_values,
-                                                             size_t num_paths);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(
+    rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*,
                                                         rocksdb_env_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*,
@@ -1383,12 +1374,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num(
     rocksdb_options_t*, size_t);
 extern ROCKSDB_LIBRARY_API size_t
 rocksdb_options_get_recycle_log_file_num(rocksdb_options_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_pending_compaction_bytes_limit(
-    rocksdb_options_t* opt, size_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt,
+                                                        size_t v);
 extern ROCKSDB_LIBRARY_API size_t
 rocksdb_options_get_soft_pending_compaction_bytes_limit(rocksdb_options_t* opt);
-extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_pending_compaction_bytes_limit(
-    rocksdb_options_t* opt, size_t v);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt,
+                                                        size_t v);
 extern ROCKSDB_LIBRARY_API size_t
 rocksdb_options_get_hard_pending_compaction_bytes_limit(rocksdb_options_t* opt);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size(
@@ -1469,7 +1462,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_options_get_bytes_per_sync(rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync(
-        rocksdb_options_t*, uint64_t);
+    rocksdb_options_t*, uint64_t);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void
@@ -1508,8 +1501,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load(
     rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep(
     rocksdb_options_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
-    rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_prefix_bloom_size_ratio(rocksdb_options_t*,
+                                                     double);
 extern ROCKSDB_LIBRARY_API double
 rocksdb_options_get_memtable_prefix_bloom_size_ratio(rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_compaction_bytes(
@@ -1614,8 +1608,7 @@ extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_atomic_flush(
     rocksdb_options_t* opt);
 
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache(
-    rocksdb_options_t* opt, rocksdb_cache_t* cache
-);
+    rocksdb_options_t* opt, rocksdb_cache_t* cache);
 
 extern ROCKSDB_LIBRARY_API void
 rocksdb_options_add_compact_on_deletion_collector_factory(
@@ -1632,7 +1625,8 @@ extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression(
 /* RateLimiter */
 extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(
     int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness);
-extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(
+    rocksdb_ratelimiter_t*);
 
 /* PerfContext */
 enum {
@@ -1731,8 +1725,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset(
     rocksdb_perfcontext_t* context);
 extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report(
     rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters);
-extern ROCKSDB_LIBRARY_API uint64_t rocksdb_perfcontext_metric(
-    rocksdb_perfcontext_t* context, int metric);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, int metric);
 extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy(
     rocksdb_perfcontext_t* context);
@@ -1877,11 +1871,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek(
     rocksdb_readoptions_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char
 rocksdb_readoptions_get_total_order_seek(rocksdb_readoptions_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_max_skippable_internal_keys(
-    rocksdb_readoptions_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_readoptions_set_max_skippable_internal_keys(rocksdb_readoptions_t*,
+                                                    uint64_t);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_readoptions_get_max_skippable_internal_keys(rocksdb_readoptions_t*);
-extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
     rocksdb_readoptions_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char
 rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
@@ -1898,6 +1894,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout(
     rocksdb_readoptions_t*, uint64_t microseconds);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_async_io(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_async_io(
+    rocksdb_readoptions_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp(
     rocksdb_readoptions_t*, const char* ts, size_t tslen);
 extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts(
@@ -1917,8 +1917,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL(
     rocksdb_writeoptions_t* opt, int disable);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_writeoptions_get_disable_WAL(
     rocksdb_writeoptions_t* opt);
-extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_ignore_missing_column_families(
-    rocksdb_writeoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_writeoptions_set_ignore_missing_column_families(rocksdb_writeoptions_t*,
+                                                        unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char
 rocksdb_writeoptions_get_ignore_missing_column_families(
     rocksdb_writeoptions_t*);
@@ -2017,7 +2018,8 @@ rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache);
 
 /* DBPath */
 
-extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t
target_size); +extern ROCKSDB_LIBRARY_API rocksdb_dbpath_t* rocksdb_dbpath_create( + const char* path, uint64_t target_size); extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); /* Env */ @@ -2042,10 +2044,14 @@ extern ROCKSDB_LIBRARY_API int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); -extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); -extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); -extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env); -extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_io_priority(rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_cpu_priority( + rocksdb_env_t* env); +extern ROCKSDB_LIBRARY_API void +rocksdb_env_lower_high_priority_thread_pool_cpu_priority(rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); @@ -2218,10 +2224,10 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( const rocksdb_livefiles_t*, int index, size_t* size); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( const rocksdb_livefiles_t*, int index, size_t* size); -extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_entries( - const rocksdb_livefiles_t*, int index); -extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_deletions( - const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( const rocksdb_livefiles_t*); @@ -2371,7 +2377,8 @@ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( const rocksdb_transactiondb_options_t* txn_db_options, const char* name, char** errptr); -extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* +rocksdb_transactiondb_open_column_families( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, int num_column_families, const char* const* column_family_names, @@ -2555,7 +2562,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_put_cf( extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_write( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, - rocksdb_writebatch_t *batch, char** errptr); + rocksdb_writebatch_t* batch, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, @@ -2757,7 +2764,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy( rocksdb_memory_consumers_t* consumers); extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers, - char** errptr); + char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy( rocksdb_memory_usage_t* usage); @@ -2790,5 +2797,5 @@ extern 
ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction( extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db); #ifdef __cplusplus -} /* end extern "C" */ +} /* end extern "C" */ #endif diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 11c7da77e29..584e119bc84 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -7,18 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // -// A Cache is an interface that maps keys to values. It has internal -// synchronization and may be safely accessed concurrently from -// multiple threads. It may automatically evict entries to make room -// for new entries. Values have a specified charge against the cache -// capacity. For example, a cache where the values are variable -// length strings, may use the length of the string as the charge for -// the string. -// -// A builtin cache implementation with a least-recently-used eviction -// policy is provided. Clients may use their own implementations if -// they want something more sophisticated (like scan-resistance, a -// custom eviction policy, variable cache sizing, etc.) +// Various APIs for creating and customizing read caches in RocksDB. #pragma once @@ -37,8 +26,69 @@ namespace ROCKSDB_NAMESPACE { class Cache; struct ConfigOptions; +class Logger; class SecondaryCache; +// Classifications of block cache entries. +// +// Developer notes: Adding a new enum to this class requires corresponding +// updates to `kCacheEntryRoleToCamelString` and +// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since +// `kNumCacheEntryRoles` assumes `kMisc` comes last. +enum class CacheEntryRole { + // Block-based table data block + kDataBlock, + // Block-based table filter block (full or partitioned) + kFilterBlock, + // Block-based table metadata block for partitioned filter + kFilterMetaBlock, + // OBSOLETE / DEPRECATED: old/removed block-based filter + kDeprecatedFilterBlock, + // Block-based table index block + kIndexBlock, + // Other kinds of block-based table block + kOtherBlock, + // WriteBufferManager's charge to account for its memtable usage + kWriteBuffer, + // Compression dictionary building buffer's charge to account for + // its memory usage + kCompressionDictionaryBuildingBuffer, + // Filter's charge to account for + // (new) bloom and ribbon filter construction's memory usage + kFilterConstruction, + // BlockBasedTableReader's charge to account for its memory usage + kBlockBasedTableReader, + // FileMetadata's charge to account for its memory usage + kFileMetadata, + // Blob value (when using the same cache as block cache and blob cache) + kBlobValue, + // Blob cache's charge to account for its memory usage (when using a + // separate block cache and blob cache) + kBlobCache, + // Default bucket, for miscellaneous cache entries. Do not use for + // entries that could potentially add up to large usage. + kMisc, +}; +constexpr uint32_t kNumCacheEntryRoles = + static_cast<uint32_t>(CacheEntryRole::kMisc) + 1; + +// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`. +const std::string& GetCacheEntryRoleName(CacheEntryRole); + +// For use with `GetMapProperty()` for property +// `DB::Properties::kBlockCacheEntryStats`. On success, the map will +// be populated with all keys that can be obtained from these functions.
+struct BlockCacheEntryStatsMapKeys { + static const std::string& CacheId(); + static const std::string& CacheCapacityBytes(); + static const std::string& LastCollectionDurationSeconds(); + static const std::string& LastCollectionAgeSeconds(); + + static std::string EntryCount(CacheEntryRole); + static std::string UsedBytes(CacheEntryRole); + static std::string UsedPercent(CacheEntryRole); +}; + extern const bool kDefaultToAdaptiveMutex; enum CacheMetadataChargePolicy { @@ -227,11 +277,21 @@ extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache( extern std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache( const CompressedSecondaryCacheOptions& opts); -// HyperClockCache - EXPERIMENTAL -// -// A lock-free Cache alternative for RocksDB block cache that offers much -// improved CPU efficiency under high parallel load or high contention, with -// some caveats. +// HyperClockCache - A lock-free Cache alternative for RocksDB block cache +// that offers much improved CPU efficiency vs. LRUCache under high parallel +// load or high contention, with some caveats: +// * Not a general Cache implementation: can only be used for +// BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is +// compatible with HyperClockCache. +// * Requires an extra tuning parameter: see estimated_entry_charge below. +// Similarly, substantially changing the capacity with SetCapacity could +// harm efficiency. +// * SecondaryCache is not yet supported. +// * Cache priorities are less aggressively enforced, which could cause +// cache dilution from long range scans (unless they use fill_cache=false). +// * Can be worse for small caches, because if almost all of a cache shard is +// pinned (more likely with non-partitioned filters), then CLOCK eviction +// becomes very CPU intensive. // // See internal cache/clock_cache.h for full description. struct HyperClockCacheOptions : public ShardedCacheOptions { @@ -292,8 +352,34 @@ extern std::shared_ptr<Cache> NewClockCache( CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy); +// A Cache maps keys to objects resident in memory, tracks reference counts +// on those key-object entries, and is able to remove unreferenced entries +// whenever it wants. All operations are fully thread safe except as noted. +// Inserted entries have a specified "charge" which is some quantity in +// unspecified units, typically bytes of memory used. A Cache will typically +// have a finite capacity in units of charge, and evict entries as needed +// to stay at or below that capacity. +// +// NOTE: This API is for expert use only and is more intended for providing +// custom implementations than for calling into. It is subject to change +// as RocksDB evolves, especially the RocksDB block cache. +// +// INTERNAL: See typed_cache.h for convenient wrappers on top of this API. class Cache { - public: + public: // types hidden from API client + // Opaque handle to an entry stored in the cache. + struct Handle {}; + + public: // types hidden from Cache implementation + // Pointer to cached object of unspecified type. (This type alias is + // provided for clarity, not really for type checking.) + using ObjectPtr = void*; + + // Opaque object providing context (settings, etc.) to create objects + // for primary cache from saved (serialized) secondary cache entries. + struct CreateContext {}; + + public: // type defs // Depending on implementation, cache entries with higher priority levels // could be less likely to get evicted than entries with lower priority // levels.
The "high" priority level applies to certain SST metablocks (e.g. @@ -325,54 +411,94 @@ class Cache { // so anything required for these operations should be contained in the // object itself. // - // The SizeCallback takes a void* pointer to the object and returns the size + // The SizeCallback takes a pointer to the object and returns the size // of the persistable data. It can be used by the secondary cache to allocate // memory if needed. // // RocksDB callbacks are NOT exception-safe. A callback completing with an // exception can lead to undefined behavior in RocksDB, including data loss, // unreported corruption, deadlocks, and more. - using SizeCallback = size_t (*)(void* obj); + using SizeCallback = size_t (*)(ObjectPtr obj); - // The SaveToCallback takes a void* object pointer and saves the persistable + // The SaveToCallback takes an object pointer and saves the persistable // data into a buffer. The secondary cache may decide to not store it in a // contiguous buffer, in which case this callback will be called multiple // times with increasing offset - using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, - size_t length, void* out); - - // A function pointer type for custom destruction of an entry's - // value. The Cache is responsible for copying and reclaiming space - // for the key, but values are managed by the caller. - using DeleterFn = void (*)(const Slice& key, void* value); + using SaveToCallback = Status (*)(ObjectPtr from_obj, size_t from_offset, + size_t length, char* out_buf); + + // A function pointer type for destruction of a cache object. This will + // typically call the destructor for the appropriate type of the object. + // The Cache is responsible for copying and reclaiming space for the key, + // but objects are managed in part using this callback. Generally a DeleterFn + // can be nullptr if the ObjectPtr does not need destruction (e.g. nullptr or + // pointer into static data). + using DeleterFn = void (*)(ObjectPtr obj, MemoryAllocator* allocator); + + // The CreateCallback is takes in a buffer from the NVM cache and constructs + // an object using it. The callback doesn't have ownership of the buffer and + // should copy the contents into its own buffer. The CreateContext* is + // provided by Lookup and may be used to follow DB- or CF-specific settings. + // In case of some error, non-OK is returned and the caller should ignore + // any result in out_obj. (The implementation must clean up after itself.) + using CreateCallback = Status (*)(const Slice& data, CreateContext* context, + MemoryAllocator* allocator, + ObjectPtr* out_obj, size_t* out_charge); // A struct with pointers to helper functions for spilling items from the // cache into the secondary cache. May be extended in the future. An // instance of this struct is expected to outlive the cache. struct CacheItemHelper { + // Function for deleting an object on its removal from the Cache. + // nullptr is only for entries that require no destruction, such as + // "placeholder" cache entries with nullptr object. + DeleterFn del_cb; // (<- Most performance critical) + // Next three are used for persisting values as described above. + // If any is nullptr, then all three should be nullptr and persisting the + // entry to/from secondary cache is not supported. 
SizeCallback size_cb; SaveToCallback saveto_cb; - DeleterFn del_cb; - - CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} - CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, - DeleterFn _del_cb) - : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + CreateCallback create_cb; + // Classification of the entry for monitoring purposes in block cache. + CacheEntryRole role; + + constexpr CacheItemHelper() + : del_cb(nullptr), + size_cb(nullptr), + saveto_cb(nullptr), + create_cb(nullptr), + role(CacheEntryRole::kMisc) {} + + explicit constexpr CacheItemHelper(CacheEntryRole _role, + DeleterFn _del_cb = nullptr, + SizeCallback _size_cb = nullptr, + SaveToCallback _saveto_cb = nullptr, + CreateCallback _create_cb = nullptr) + : del_cb(_del_cb), + size_cb(_size_cb), + saveto_cb(_saveto_cb), + create_cb(_create_cb), + role(_role) { + // Either all three secondary cache callbacks are non-nullptr or + // all three are nullptr + assert((size_cb != nullptr) == (saveto_cb != nullptr)); + assert((size_cb != nullptr) == (create_cb != nullptr)); + } + inline bool IsSecondaryCacheCompatible() const { + return size_cb != nullptr; + } }; - // The CreateCallback is passed by the block cache user to Lookup(). It - // takes in a buffer from the NVM cache and constructs an object using - // it. The callback doesn't have ownership of the buffer and should - // copy the contents into its own buffer. - using CreateCallback = std::function; - + public: // ctor/dtor/create Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr) : memory_allocator_(std::move(allocator)) {} // No copying allowed Cache(const Cache&) = delete; Cache& operator=(const Cache&) = delete; + // Destroys all remaining entries by calling the associated "deleter" + virtual ~Cache() {} + // Creates a new Cache based on the input value string and returns the result. // Currently, this method can be used to create LRUCaches only // @param config_options @@ -388,47 +514,103 @@ class Cache { const std::string& value, std::shared_ptr<Cache>* result); - // Destroys all existing entries by calling the "deleter" - // function that was passed via the Insert() function. - // - // @See Insert - virtual ~Cache() {} - - // Opaque handle to an entry stored in the cache. - struct Handle {}; - + public: // functions // The type of the Cache virtual const char* Name() const = 0; - // Insert a mapping from key->value into the volatile cache only - // and assign it with the specified charge against the total cache capacity. - // If strict_capacity_limit is true and cache reaches its full capacity, - // return Status::MemoryLimit. + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation. They rely on a + // per object CacheItemHelper to do the conversions. + // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->object into the cache and assign it + // the specified charge against the total cache capacity. If + // strict_capacity_limit is true and cache reaches its full capacity, + // return Status::MemoryLimit.
`obj` must be non-nullptr if compatible + // with secondary cache (helper->size_cb != nullptr), because Value() == + // nullptr is reserved for indicating some secondary cache failure cases. + // On success, returns OK and takes ownership of `obj`, eventually deleting + // it with helper->del_cb. On non-OK return, the caller maintains ownership + // of `obj` so will often need to delete it in such cases. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or considered for promotion to the secondary + // cache. Promotion to secondary cache is only enabled if helper->size_cb + // != nullptr. The helper must outlive the cache. Callers may use + // &kNoopCacheItemHelper as a trivial helper (no deleter for the object, + // no secondary cache). `helper` must not be nullptr (efficiency). // - // If handle is not nullptr, returns a handle that corresponds to the - // mapping. The caller must call this->Release(handle) when the returned - // mapping is no longer needed. In case of error caller is responsible to - // cleanup the value (i.e. calling "deleter"). + // If `handle` is not nullptr and return status is OK, `handle` is set + // to a Handle* for the entry. The caller must call this->Release(handle) + // when the returned entry is no longer needed. If `handle` is nullptr, it is + // as if Release is called immediately after Insert. // - // If handle is nullptr, it is as if Release is called immediately after - // insert. In case of error value will be cleanup. + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. // - // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter" which must delete the value. - // (The Cache is responsible for copying and reclaiming space for - // the key.) - virtual Status Insert(const Slice& key, void* value, size_t charge, - DeleterFn deleter, Handle** handle = nullptr, + // When the inserted entry is no longer needed, it will be destroyed using + // helper->del_cb (if non-nullptr). + virtual Status Insert(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, Priority priority = Priority::LOW) = 0; - // If the cache has no mapping for "key", returns nullptr. + // Lookup the key, returning nullptr if not found. If found, returns + // a handle to the mapping that must eventually be passed to Release(). + // + // If a non-nullptr helper argument is provided with a non-nullptr + // create_cb, and a secondary cache is configured, then the secondary + // cache is also queried if lookup in the primary cache fails. If found + // in secondary cache, the provided create_cb and create_context are + // used to promote the entry to an object in the primary cache. + // In that case, the helper may be saved and used later when the object + // is evicted, so as usual, the pointed-to helper must outlive the cache. + // + // ======================== Async Lookup (wait=false) ====================== + // When wait=false, the handle returned might be in any of three states: + // * Present - If Value() != nullptr, then the result is present and + // the handle can be used just as if wait=true.
+ // * Pending, not ready (IsReady() == false) - secondary cache is still + // working to retrieve the value. Might become ready any time. + // * Pending, ready (IsReady() == true) - secondary cache has the value + // but it has not been loaded as an object into primary cache. Call to + // Wait()/WaitAll() will not block. + // + // IMPORTANT: Pending handles are not thread-safe, and only these functions + // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release() + // can only come after Wait() or WaitAll() even though a reference is held. + // + // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is + // safe and has no effect on other handle states.) After waiting on a Handle, + // it is in one of two states: + // * Present - if Value() != nullptr + // * Failed - if Value() == nullptr, such as if the secondary cache + // initially thought it had the value but actually did not. // - // Else return a handle that corresponds to the mapping. The caller - // must call this->Release(handle) when the returned mapping is no - // longer needed. - // If stats is not nullptr, relative tickers could be used inside the - // function. - virtual Handle* Lookup(const Slice& key, Statistics* stats = nullptr) = 0; + // Note that given an arbitrary Handle, the only way to distinguish the + // Pending+ready state from the Failed state is to Wait() on it. A cache + // entry not compatible with secondary cache can also have Value()==nullptr + // like the Failed state, but this is not generally a concern. + virtual Handle* Lookup(const Slice& key, + const CacheItemHelper* helper = nullptr, + CreateContext* create_context = nullptr, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr) = 0; + + // Convenience wrapper when secondary cache not supported + inline Handle* BasicLookup(const Slice& key, Statistics* stats) { + return Lookup(key, nullptr, nullptr, Priority::LOW, true, stats); + } // Increments the reference count for the handle if it refers to an entry in // the cache. Returns true if refcount was incremented; otherwise, returns @@ -449,11 +631,12 @@ class Cache { // REQUIRES: handle must have been returned by a method on *this. virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0; - // Return the value encapsulated in a handle returned by a - // successful Lookup(). + // Return the object associated with a handle returned by a successful + // Lookup(). For historical reasons, this is also known as the "value" + // associated with the key. // REQUIRES: handle must not have been released yet. // REQUIRES: handle must have been returned by a method on *this. - virtual void* Value(Handle* handle) = 0; + virtual ObjectPtr Value(Handle* handle) = 0; // If the cache contains the entry for the key, erase it. Note that the // underlying entry will be kept around until all existing handles @@ -504,11 +687,8 @@ class Cache { // Returns the charge for the specific entry in the cache. virtual size_t GetCharge(Handle* handle) const = 0; - // Returns the deleter for the specified entry. This might seem useless - // as the Cache itself is responsible for calling the deleter, but - // the deleter can essentially verify that a cache entry is of an - // expected type from an expected code source. - virtual DeleterFn GetDeleter(Handle* handle) const = 0; + // Returns the helper for the specified entry. + virtual const CacheItemHelper* GetCacheItemHelper(Handle* handle) const = 0; // Call this on shutdown if you want to speed it up.
Cache will disown // any underlying data and will not free it on delete. This call will leak @@ -534,124 +714,27 @@ class Cache { // entries is iterated over if other threads are operating on the Cache // also. virtual void ApplyToAllEntries( - const std::function<void(const Slice& key, void* value, size_t charge, DeleterFn deleter)>& callback, + const std::function<void(const Slice& key, ObjectPtr obj, size_t charge, const CacheItemHelper* helper)>& callback, const ApplyToAllEntriesOptions& opts) = 0; - // DEPRECATED version of above. (Default implementation uses above.) - virtual void ApplyToAllCacheEntries(void (*callback)(void* value, - size_t charge), - bool /*thread_safe*/) { - ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, - DeleterFn) { callback(value, charge); }, - {}); - } - // Remove all entries. // Prerequisite: no entry is referenced. virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } + // Check for any warnings or errors in the operation of the cache and + // report them to the logger. This is intended only to be called + // periodically so does not need to be very efficient. (Obscure calling + // conventions for Logger inherited from env.h) + virtual void ReportProblems( + const std::shared_ptr<Logger>& /*info_log*/) const {} + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } // EXPERIMENTAL // The following APIs are experimental and might change in the future. - // The Insert and Lookup APIs below are intended to allow cached objects - // to be demoted/promoted between the primary block cache and a secondary - // cache. The secondary cache could be a non-volatile cache, and will - // likely store the object in a different representation. They rely on a - // per object CacheItemHelper to do the conversions. - // The secondary cache may persist across process and system restarts, - // and may even be moved between hosts. Therefore, the cache key must - // be repeatable across restarts/reboots, and globally unique if - // multiple DBs share the same cache and the set of DBs can change - // over time. - - // Insert a mapping from key->value into the cache and assign it - // the specified charge against the total cache capacity. If - // strict_capacity_limit is true and cache reaches its full capacity, - // return Status::MemoryLimit. `value` must be non-nullptr for this - // Insert() because Value() == nullptr is reserved for indicating failure - // with secondary-cache-compatible mappings. - // - // The helper argument is saved by the cache and will be used when the - // inserted object is evicted or promoted to the secondary cache. It, - // therefore, must outlive the cache. - // - // If handle is not nullptr, returns a handle that corresponds to the - // mapping. The caller must call this->Release(handle) when the returned - // mapping is no longer needed. In case of error caller is responsible to - // cleanup the value (i.e. calling "deleter"). - // - // If handle is nullptr, it is as if Release is called immediately after - // insert. In case of error value will be cleanup. - // - // Regardless of whether the item was inserted into the cache, - // it will attempt to insert it into the secondary cache if one is - // configured, and the helper supports it. - // The cache implementation must support a secondary cache, otherwise - // the item is only inserted into the primary cache. It may - // defer the insertion to the secondary cache as it sees fit. - // - // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter".
- virtual Status Insert(const Slice& key, void* value, - const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) { - if (!helper) { - return Status::InvalidArgument(); - } - return Insert(key, value, charge, helper->del_cb, handle, priority); - } - - // Lookup the key in the primary and secondary caches (if one is configured). - // The create_cb callback function object will be used to contruct the - // cached object. - // If none of the caches have the mapping for the key, returns nullptr. - // Else, returns a handle that corresponds to the mapping. - // - // This call may promote the object from the secondary cache (if one is - // configured, and has the given key) to the primary cache. - // - // The helper argument should be provided if the caller wants the lookup - // to include the secondary cache (if one is configured) and the object, - // if it exists, to be promoted to the primary cache. The helper may be - // saved and used later when the object is evicted. Therefore, it must - // outlive the cache. - // - // ======================== Async Lookup (wait=false) ====================== - // When wait=false, the handle returned might be in any of three states: - // * Present - If Value() != nullptr, then the result is present and - // the handle can be used just as if wait=true. - // * Pending, not ready (IsReady() == false) - secondary cache is still - // working to retrieve the value. Might become ready any time. - // * Pending, ready (IsReady() == true) - secondary cache has the value - // but it has not been loaded into primary cache. Call to Wait()/WaitAll() - // will not block. - // - // IMPORTANT: Pending handles are not thread-safe, and only these functions - // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release() - // can only come after Wait() or WaitAll() even though a reference is held. - // - // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is - // safe and has no effect on other handle states.) After waiting on a Handle, - // it is in one of two states: - // * Present - if Value() != nullptr - // * Failed - if Value() == nullptr, such as if the secondary cache - // initially thought it had the value but actually did not. - // - // Note that given an arbitrary Handle, the only way to distinguish the - // Pending+ready state from the Failed state is to Wait() on it. A cache - // entry not compatible with secondary cache can also have Value()==nullptr - // like the Failed state, but this is not generally a concern. - virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, - const CreateCallback& /*create_cb*/, - Priority /*priority*/, bool /*wait*/, - Statistics* stats = nullptr) { - return Lookup(key, stats); - } // Release a mapping returned by a previous Lookup(). The "useful" // parameter specifies whether the data was actually used or not, @@ -671,7 +754,7 @@ class Cache { // Convert a "pending" handle into a full thread-shareable handle by // * If necessary, wait until secondary cache finishes loading the value. - // * Construct the value for primary cache and set it in the handle. + // * Construct the object for primary cache and set it in the handle. // Even after Wait() on a pending handle, the caller must check for // Value() == nullptr in case of failure. This call is not thread-safe // on pending handles. This call has no effect on non-pending handles. 
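The hunks above fold the old deleter-based Insert()/Lookup() into the helper-based overloads. A minimal sketch of driving the revised interface follows, assuming a cache obtained from the existing NewLRUCache() factory; the DeleteString, kStringHelper, and CacheDemo names and the heap-allocated string payload are illustrative only, not part of this diff:

    #include <cassert>
    #include <memory>
    #include <string>
    #include "rocksdb/cache.h"

    using namespace ROCKSDB_NAMESPACE;

    // Deleter matching the new DeleterFn signature: destroys the cached object.
    static void DeleteString(Cache::ObjectPtr obj, MemoryAllocator* /*allocator*/) {
      delete static_cast<std::string*>(obj);
    }

    // Deleter-only helper: role kMisc, no secondary-cache callbacks.
    static constexpr Cache::CacheItemHelper kStringHelper{
        CacheEntryRole::kMisc, &DeleteString};

    void CacheDemo(const std::shared_ptr<Cache>& cache) {
      auto* obj = new std::string("payload");
      Cache::Handle* handle = nullptr;
      // On OK the cache takes ownership of obj; on non-OK the caller keeps it.
      Status s = cache->Insert("key", obj, &kStringHelper, obj->size(), &handle);
      if (!s.ok()) {
        delete obj;
        return;
      }
      assert(cache->Value(handle) == obj);
      cache->Release(handle);
    }

A deleter-only helper like this is not secondary-cache compatible (size_cb == nullptr), consistent with the IsSecondaryCacheCompatible() check added to CacheItemHelper above.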
@@ -687,64 +770,8 @@ class Cache { std::shared_ptr<MemoryAllocator> memory_allocator_; }; -// Classifications of block cache entries. -// -// Developer notes: Adding a new enum to this class requires corresponding -// updates to `kCacheEntryRoleToCamelString` and -// `kCacheEntryRoleToHyphenString`. Do not add to this enum after `kMisc` since -// `kNumCacheEntryRoles` assumes `kMisc` comes last. -enum class CacheEntryRole { - // Block-based table data block - kDataBlock, - // Block-based table filter block (full or partitioned) - kFilterBlock, - // Block-based table metadata block for partitioned filter - kFilterMetaBlock, - // OBSOLETE / DEPRECATED: old/removed block-based filter - kDeprecatedFilterBlock, - // Block-based table index block - kIndexBlock, - // Other kinds of block-based table block - kOtherBlock, - // WriteBufferManager's charge to account for its memtable usage - kWriteBuffer, - // Compression dictionary building buffer's charge to account for - // its memory usage - kCompressionDictionaryBuildingBuffer, - // Filter's charge to account for - // (new) bloom and ribbon filter construction's memory usage - kFilterConstruction, - // BlockBasedTableReader's charge to account for its memory usage - kBlockBasedTableReader, - // FileMetadata's charge to account for its memory usage - kFileMetadata, - // Blob value (when using the same cache as block cache and blob cache) - kBlobValue, - // Blob cache's charge to account for its memory usage (when using a - // separate block cache and blob cache) - kBlobCache, - // Default bucket, for miscellaneous cache entries. Do not use for - // entries that could potentially add up to large usage. - kMisc, -}; -constexpr uint32_t kNumCacheEntryRoles = - static_cast<uint32_t>(CacheEntryRole::kMisc) + 1; - -// Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`. -const std::string& GetCacheEntryRoleName(CacheEntryRole); - -// For use with `GetMapProperty()` for property -// `DB::Properties::kBlockCacheEntryStats`. On success, the map will -// be populated with all keys that can be obtained from these functions. -struct BlockCacheEntryStatsMapKeys { - static const std::string& CacheId(); - static const std::string& CacheCapacityBytes(); - static const std::string& LastCollectionDurationSeconds(); - static const std::string& LastCollectionAgeSeconds(); - - static std::string EntryCount(CacheEntryRole); - static std::string UsedBytes(CacheEntryRole); - static std::string UsedPercent(CacheEntryRole); -}; +// Useful for cache entries requiring no clean-up, such as for cache +// reservations +inline constexpr Cache::CacheItemHelper kNoopCacheItemHelper{}; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 57668a24e17..9c6a9c30d68 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -163,6 +163,7 @@ class CompactionFilter : public Customizable { // is a write conflict and may allow a Transaction to Commit that should have // failed. Instead, it is better to implement any Merge filtering inside the // MergeOperator. + // key includes timestamp if user-defined timestamp is enabled.
virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, const Slice& existing_value, std::string* new_value, std::string* /*skip_until*/) const { diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 0d6fbd02812..5ff8eccc8bf 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -6,6 +6,7 @@ #pragma once #include #include + #include #include "rocksdb/rocksdb_namespace.h" diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index b1635aa1441..cde95556099 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -284,7 +284,6 @@ class DB { const std::vector& column_families, std::vector* handles, DB** dbptr); - // Open DB and run the compaction. // It's a read-only operation, the result won't be installed to the DB, it // will be output to the `output_directory`. The API should only be used with @@ -502,10 +501,7 @@ class DB { virtual Status Merge(const WriteOptions& /*options*/, ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, const Slice& /*ts*/, - const Slice& /*value*/) { - return Status::NotSupported( - "Merge does not support user-defined timestamp yet"); - } + const Slice& /*value*/); // Apply the specified updates to the database. // If `updates` contains no update, WAL will still be synced if @@ -1517,11 +1513,17 @@ class DB { virtual Status SyncWAL() = 0; // Lock the WAL. Also flushes the WAL after locking. + // After this method returns ok, writes to the database will be stopped until + // UnlockWAL() is called. + // This method may internally acquire and release DB mutex and the WAL write + // mutex, but after it returns, neither mutex is held by caller. virtual Status LockWAL() { return Status::NotSupported("LockWAL not implemented"); } // Unlock the WAL. + // The write stop on the database will be cleared. + // This method may internally acquire and release DB mutex. virtual Status UnlockWAL() { return Status::NotSupported("UnlockWAL not implemented"); } diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 547faa5a455..91ad47218e6 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -344,8 +344,7 @@ class FileSystem : public Customizable { // The returned file may be concurrently accessed by multiple threads. virtual IOStatus NewRandomAccessFile( const std::string& fname, const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* dbg) = 0; + std::unique_ptr* result, IODebugContext* dbg) = 0; // These values match Linux definition // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 enum WriteLifeTimeHint { @@ -497,7 +496,8 @@ class FileSystem : public Customizable { virtual IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported("Truncate is not supported for this FileSystem"); + return IOStatus::NotSupported( + "Truncate is not supported for this FileSystem"); } // Create the specified directory. Returns error if directory exists. 
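The LockWAL()/UnlockWAL() contract documented in the db.h hunk above (an OK return stops writes until UnlockWAL() clears the write stop) lends itself to a bracket pattern. A hedged sketch follows, assuming only the two virtuals shown; the WithWalLocked wrapper and the callback shape are illustrative, not part of this diff:

    #include <functional>
    #include "rocksdb/db.h"

    using namespace ROCKSDB_NAMESPACE;

    // Run `body` (e.g. copying live WAL files) while writers are stopped.
    Status WithWalLocked(DB* db, const std::function<Status()>& body) {
      Status s = db->LockWAL();  // flushes the WAL, then stops writes
      if (!s.ok()) {
        return s;  // e.g. Status::NotSupported from the default implementation
      }
      Status body_status = body();
      Status unlock_status = db->UnlockWAL();  // clears the write stop
      return body_status.ok() ? unlock_status : body_status;
    }

Always pairing the unlock with the lock matters here because, per the comment above, the write stop persists until UnlockWAL() is called.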
@@ -534,7 +534,8 @@ class FileSystem : public Customizable { const std::string& /*target*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported("LinkFile is not supported for this FileSystem"); + return IOStatus::NotSupported( + "LinkFile is not supported for this FileSystem"); } virtual IOStatus NumFileLinks(const std::string& /*fname*/, @@ -548,7 +549,8 @@ class FileSystem : public Customizable { const std::string& /*second*/, const IOOptions& /*options*/, bool* /*res*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported("AreFilesSame is not supported for this FileSystem"); + return IOStatus::NotSupported( + "AreFilesSame is not supported for this FileSystem"); } // Lock the specified file. Used to prevent concurrent access to @@ -612,7 +614,7 @@ class FileSystem : public Customizable { // the FileOptions in the parameters, but is optimized for writing log files. // Default implementation returns the copy of the same object. virtual FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const; + const DBOptions& db_options) const; // OptimizeForManifestWrite will create a new FileOptions object that is a // copy of the FileOptions in the parameters, but is optimized for writing @@ -1328,8 +1330,7 @@ class FileSystemWrapper : public FileSystem { FileSystem* target() const { return target_.get(); } // The following text is boilerplate that forwards all methods to target() - IOStatus NewSequentialFile(const std::string& f, - const FileOptions& file_opts, + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, std::unique_ptr* r, IODebugContext* dbg) override { return target_->NewSequentialFile(f, file_opts, r, dbg); @@ -1356,8 +1357,7 @@ class FileSystemWrapper : public FileSystem { const FileOptions& file_opts, std::unique_ptr* r, IODebugContext* dbg) override { - return target_->ReuseWritableFile(fname, old_fname, file_opts, r, - dbg); + return target_->ReuseWritableFile(fname, old_fname, file_opts, r, dbg); } IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& file_opts, @@ -1474,7 +1474,7 @@ class FileSystemWrapper : public FileSystem { } FileOptions OptimizeForLogRead( - const FileOptions& file_options) const override { + const FileOptions& file_options) const override { return target_->OptimizeForLogRead(file_options); } FileOptions OptimizeForManifestRead( @@ -1482,7 +1482,7 @@ class FileSystemWrapper : public FileSystem { return target_->OptimizeForManifestRead(file_options); } FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { + const DBOptions& db_options) const override { return target_->OptimizeForLogWrite(file_options, db_options); } FileOptions OptimizeForManifestWrite( diff --git a/include/rocksdb/io_status.h b/include/rocksdb/io_status.h index 51ee47384fd..0bf5e939a69 100644 --- a/include/rocksdb/io_status.h +++ b/include/rocksdb/io_status.h @@ -14,11 +14,13 @@ #pragma once #include + #include "rocksdb/slice.h" #ifdef OS_WIN #include #endif #include + #include "status.h" namespace ROCKSDB_NAMESPACE { diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index 0437b80dc80..559d44c57a6 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -5,6 +5,7 @@ #pragma once #include + #include #include "rocksdb/perf_level.h" diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h index 22ea7734f60..7408cbc8738 100644 --- 
a/include/rocksdb/ldb_tool.h +++ b/include/rocksdb/ldb_tool.h @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include #include + #include "rocksdb/db.h" #include "rocksdb/options.h" diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 8644fcf3f5d..853b587581b 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -140,7 +140,10 @@ enum class CompactionReason : int { // According to the comments in flush_job.cc, RocksDB treats flush as // a level 0 compaction in internal stats. kFlush, - // Compaction caused by external sst file ingestion + // [InternalOnly] External sst file ingestion treated as a compaction + // with placeholder input level L0, as file ingestion + // technically does not have an input level like other compactions. + // Used only for internal stats and conflict checking with other compactions. kExternalSstIngestion, // Compaction due to SST file being too old kPeriodicCompaction, @@ -151,6 +154,9 @@ enum class CompactionReason : int { // A special TTL compaction for RoundRobin policy, which is basically the same // as kLevelMaxLevelSize, but the goal is to compact TTLed files. kRoundRobinTtl, + // [InternalOnly] DBImpl::ReFitLevel treated as a compaction. + // Used only for internal conflict checking with other compactions. kRefitLevel, // total number of compaction reasons, new reasons must be added above this. kNumOfReasons, }; diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index e1e88bbdf29..077130475da 100755 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -82,6 +82,7 @@ class MergeOperator : public Customizable { } struct MergeOperationInput { + // If user-defined timestamp is enabled, `_key` includes timestamp. explicit MergeOperationInput(const Slice& _key, const Slice* _existing_value, const std::vector<Slice>& _operand_list, @@ -103,6 +104,13 @@ class MergeOperator : public Customizable { Logger* logger; }; + enum class OpFailureScope { + kDefault, + kTryMerge, + kMustMerge, + kOpFailureScopeMax, + }; + struct MergeOperationOutput { explicit MergeOperationOutput(std::string& _new_value, Slice& _existing_operand) @@ -114,6 +122,20 @@ ... Slice& existing_operand; + // Indicates the blast radius of the failure. It is only meaningful to + // provide a failure scope when returning `false` from the API populating + // the `MergeOperationOutput`. Currently RocksDB operations handle these + // values as follows: + // + // - `OpFailureScope::kDefault`: fallback to default + // (`OpFailureScope::kTryMerge`) + // - `OpFailureScope::kTryMerge`: operations that try to merge that key will + // fail. This includes flush and compaction, which puts the DB in + // read-only mode. + // - `OpFailureScope::kMustMerge`: operations that must merge that key will + // fail (e.g., `Get()`, `MultiGet()`, iteration). Flushes/compactions can + // still proceed by copying the original input operands to the output.
+ OpFailureScope op_failure_scope = OpFailureScope::kDefault; }; // This function applies a stack of merge operands in chronological order diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 0cdffcd5fd6..3cdd8bd8a38 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -82,7 +82,7 @@ struct SstFileMetaData : public FileStorageInfo { bool _being_compacted, Temperature _temperature, uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - std::string& _file_checksum, + uint64_t _epoch_number, std::string& _file_checksum, std::string& _file_checksum_func_name) : smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), @@ -94,7 +94,8 @@ struct SstFileMetaData : public FileStorageInfo { num_deletions(0), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), - file_creation_time(_file_creation_time) { + file_creation_time(_file_creation_time), + epoch_number(_epoch_number) { if (!_file_name.empty()) { if (_file_name[0] == '/') { relative_filename = _file_name.substr(1); @@ -141,7 +142,12 @@ struct SstFileMetaData : public FileStorageInfo { // Timestamp when the SST file is created, provided by // SystemClock::GetCurrentTime(). 0 if the information is not available. uint64_t file_creation_time = 0; - + // The order of a file being flushed or ingested/imported. + // Compaction output file will be assigned the minimum `epoch_number` + // among input files'. + // For L0, larger `epoch_number` indicates newer L0 file. + // 0 if the information is not available. + uint64_t epoch_number = 0; // DEPRECATED: The name of the file within its directory with a // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct // instead. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 20cf196f0b8..1dec10a6c94 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -2033,7 +2033,8 @@ struct IngestExternalFileOptions { // that were created before the file was ingested. bool snapshot_consistency = true; // If set to false, IngestExternalFile() will fail if the file key range - // overlaps with existing keys or tombstones in the DB. + // overlaps with existing keys or tombstones, or with the output of an + // ongoing compaction, in the DB. bool allow_global_seqno = true; // If set to false and the file key range overlaps with the memtable key range // (memtable flush required), IngestExternalFile will fail.
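To make the new op_failure_scope field concrete, here is a hedged sketch of a merge operator that reports a bounded failure scope for operands it cannot handle. The AppendingMerge class and its empty-operand check are illustrative only, and FullMergeV2 is the pre-existing MergeOperator entry point rather than something added by this diff:

    #include "rocksdb/merge_operator.h"

    using namespace ROCKSDB_NAMESPACE;

    class AppendingMerge : public MergeOperator {
     public:
      const char* Name() const override { return "AppendingMerge"; }

      bool FullMergeV2(const MergeOperationInput& merge_in,
                       MergeOperationOutput* merge_out) const override {
        merge_out->new_value.clear();
        if (merge_in.existing_value != nullptr) {
          merge_out->new_value.assign(merge_in.existing_value->data(),
                                      merge_in.existing_value->size());
        }
        for (const Slice& op : merge_in.operand_list) {
          if (op.empty()) {  // illustrative "cannot parse" condition
            // Only operations that must merge this key (Get, MultiGet,
            // iteration) fail; flush/compaction can still copy the original
            // operands through, per the kMustMerge comment above.
            merge_out->op_failure_scope = MergeOperator::OpFailureScope::kMustMerge;
            return false;
          }
          merge_out->new_value.append(op.data(), op.size());
        }
        return true;
      }
    };

Returning false with kTryMerge instead would make flush and compaction fail too, putting the DB in read-only mode, so kMustMerge is the narrower choice when the operator can tolerate deferring the merge.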
diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 3c890cbcc83..cd1dd99f06d 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -6,6 +6,7 @@ #pragma once #include + #include #include @@ -68,7 +69,7 @@ struct PerfContext { uint64_t block_read_count; // total number of block reads (with IO) uint64_t block_read_byte; // total number of bytes from block reads uint64_t block_read_time; // total nanos spent on block reads - uint64_t block_cache_index_hit_count; // total number of index block hits + uint64_t block_cache_index_hit_count; // total number of index block hits // total number of standalone handles lookup from secondary cache uint64_t block_cache_standalone_handle_count; // total number of real handles lookup from secondary cache that are inserted diff --git a/include/rocksdb/perf_level.h b/include/rocksdb/perf_level.h index e6a76890460..e7dded0e321 100644 --- a/include/rocksdb/perf_level.h +++ b/include/rocksdb/perf_level.h @@ -6,6 +6,7 @@ #pragma once #include + #include #include "rocksdb/rocksdb_namespace.h" diff --git a/include/rocksdb/persistent_cache.h b/include/rocksdb/persistent_cache.h index d60bf4233ab..f14f0199935 100644 --- a/include/rocksdb/persistent_cache.h +++ b/include/rocksdb/persistent_cache.h @@ -8,6 +8,7 @@ #pragma once #include + #include #include diff --git a/include/rocksdb/pluggable_compaction.h b/include/rocksdb/pluggable_compaction.h index 80e3a794b70..8ee3beae4ab 100644 --- a/include/rocksdb/pluggable_compaction.h +++ b/include/rocksdb/pluggable_compaction.h @@ -98,15 +98,18 @@ struct FilesInOneLevel { struct OutputFile { std::string pathname; TableProperties table_properties; - uint64_t file_size; - uint64_t num_entries; - uint64_t num_deletions; - uint64_t raw_key_size; - uint64_t raw_value_size; + uint64_t file_size{0}; + uint64_t num_entries{0}; + uint64_t num_deletions{0}; + uint64_t raw_key_size{0}; + uint64_t raw_value_size{0}; std::string smallest_internal_key; std::string largest_internal_key; SequenceNumber smallest_seqno; SequenceNumber largest_seqno; + uint64_t unique_id_lo{0}; + uint64_t unique_id_hi{0}; + uint64_t epoch_number{0}; }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h index 3e637efb3f7..cb6f74450a5 100644 --- a/include/rocksdb/secondary_cache.h +++ b/include/rocksdb/secondary_cache.h @@ -20,7 +20,7 @@ namespace ROCKSDB_NAMESPACE { // A handle for lookup result. The handle may not be immediately ready or // have a valid value. The caller must call isReady() to determine if it is // ready, and call Wait() in order to block until it becomes ready. -// The caller must call value() after it becomes ready to determine if the +// The caller must call Value() after it becomes ready to determine if the // handle successfully read the item. class SecondaryCacheResultHandle { public: @@ -32,8 +32,9 @@ class SecondaryCacheResultHandle { // Block until handle becomes ready virtual void Wait() = 0; - // Return the value. If nullptr, it means the lookup was unsuccessful - virtual void* Value() = 0; + // Return the cache entry object (also known as value). If nullptr, it means + // the lookup was unsuccessful. + virtual Cache::ObjectPtr Value() = 0; // Return the size of value virtual size_t Size() = 0; @@ -56,15 +57,37 @@ class SecondaryCache : public Customizable { const std::string& id, std::shared_ptr<SecondaryCache>* result); - // Insert the given value into this cache. The value is not written - // directly.
Rather, the SaveToCallback provided by helper_cb will be - // used to extract the persistable data in value, which will be written + // Insert the given value into this cache. Ownership of `value` is + // transferred to the callee, who is responsible for deleting the value + // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(), + // the callee is responsible for such cleanup even in case of non-OK + // Status. + // Typically, the value is not saved directly but the implementation + // uses the SaveToCallback provided by helper to extract value's + // persistable data (typically uncompressed block), which will be written // to this tier. The implementation may or may not write it to cache - // depending on the admission control policy, even if the return status is - // success. - virtual Status Insert(const Slice& key, void* value, + // depending on the admission control policy, even if the return status + // is success (OK). + // + // If the implementation is asynchronous or otherwise uses `value` after + // the call returns, then InsertSaved() must be overridden not to rely on + // Insert(). For example, there could be a "holding area" in memory where + // Lookup() might return the same parsed value back. But more typically, if + // the implementation only uses `value` for getting persistable data during + // the call, then the default implementation of `InsertSaved()` suffices. + virtual Status Insert(const Slice& key, Cache::ObjectPtr obj, const Cache::CacheItemHelper* helper) = 0; + // Insert a value from its saved/persistable data (typically uncompressed + // block), as if generated by SaveToCallback/SizeCallback. This can be used + // in "warming up" the cache from some auxiliary source, and like Insert() + // may or may not write it to cache depending on the admission control + // policy, even if the return status is success. + // + // The default implementation assumes synchronous, non-escaping Insert(), + // wherein `value` is not used after return of Insert(). See Insert(). + virtual Status InsertSaved(const Slice& key, const Slice& saved); + // Lookup the data for the given key in this cache. The create_cb // will be used to create the object. The handle returned may not be // ready yet, unless wait=true, in which case Lookup() will block until @@ -79,8 +102,9 @@ // is_in_sec_cache is to indicate whether the handle is possibly erased // from the secondary cache after the Lookup. virtual std::unique_ptr<SecondaryCacheResultHandle> Lookup( - const Slice& key, const Cache::CreateCallback& create_cb, bool wait, - bool advise_erase, bool& is_in_sec_cache) = 0; + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& is_in_sec_cache) = 0; // Indicate whether a handle can be erased in this secondary cache.
[[nodiscard]] virtual bool SupportForceErase() const = 0; diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index 9754ec1b978..e0c7c9fe713 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -68,9 +68,9 @@ struct ExternalSstFileInfo { std::string largest_range_del_key; // largest range deletion user key in file std::string file_checksum; // sst file checksum; std::string file_checksum_func_name; // The name of file checksum function - SequenceNumber sequence_number; // sequence number of all keys in file - uint64_t file_size; // file size in bytes - uint64_t num_entries; // number of entries in file + SequenceNumber sequence_number; // sequence number of all keys in file + uint64_t file_size; // file size in bytes + uint64_t num_entries; // number of entries in file uint64_t num_range_del_entries; // number of range deletion entries in file int32_t version; // file version }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 1ab3dc4cba8..39af9455991 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -135,6 +135,9 @@ class Status { Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) : Status(_code, _subcode, msg, "", _sev) {} + static Status CopyAppendMessage(const Status& s, const Slice& delim, + const Slice& msg); + Severity severity() const { MarkChecked(); return sev_; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index c5a545eefcd..3a2bf26299e 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -266,6 +266,9 @@ struct BlockBasedTableOptions { // IF NULL, no page cache is used std::shared_ptr persistent_cache = nullptr; + // DEPRECATED: This feature is planned for removal in a future release. + // Use SecondaryCache instead. + // // If non-NULL use the specified cache for compressed blocks. // If NULL, rocksdb will not use a compressed block cache. // Note: though it looks similar to `block_cache`, RocksDB doesn't put the diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h index 2519f3a5877..e13ad8f80a5 100644 --- a/include/rocksdb/transaction_log.h +++ b/include/rocksdb/transaction_log.h @@ -7,6 +7,7 @@ #include #include + #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/write_batch.h" diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index 421abf3cd4c..6fb53d8466f 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -6,6 +6,7 @@ #pragma once #include + #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h index f28ad96180a..892c9493240 100644 --- a/include/rocksdb/utilities/backup_engine.h +++ b/include/rocksdb/utilities/backup_engine.h @@ -11,6 +11,7 @@ #ifndef ROCKSDB_LITE #include +#include #include #include #include @@ -23,6 +24,8 @@ #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { +class BackupEngineReadOnlyBase; +class BackupEngine; // The default DB file checksum function name. constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; @@ -270,6 +273,28 @@ inline BackupEngineOptions::ShareFilesNaming operator|( return static_cast(l | r); } +// Identifying information about a backup shared file that is (or might be) +// excluded from a backup using exclude_files_callback. 
+struct BackupExcludedFileInfo { + explicit BackupExcludedFileInfo(const std::string& _relative_file) + : relative_file(_relative_file) {} + + // File name and path relative to the backup dir. + std::string relative_file; +}; + +// An auxiliary structure for exclude_files_callback +struct MaybeExcludeBackupFile { + explicit MaybeExcludeBackupFile(BackupExcludedFileInfo&& _info) + : info(std::move(_info)) {} + + // Identifying information about a backup shared file that could be excluded + const BackupExcludedFileInfo info; + + // API user sets to true if the file should be excluded from this backup + bool exclude_decision = false; +}; + struct CreateBackupOptions { // Flush will always trigger if 2PC is enabled. // If write-ahead logs are disabled, set flush_before_backup=true to @@ -278,10 +303,31 @@ struct CreateBackupOptions { // Callback for reporting progress, based on callback_trigger_interval_size. // - // RocksDB callbacks are NOT exception-safe. A callback completing with an - // exception can lead to undefined behavior in RocksDB, including data loss, - // unreported corruption, deadlocks, and more. - std::function<void()> progress_callback = []() {}; + // An exception thrown from the callback will result in Status::Aborted from + // the operation. + std::function<void()> progress_callback = {}; + + // A callback that allows the API user to select files for exclusion, such + // as if the files are known to exist in an alternate backup directory. + // Only "shared" files can be excluded from backups. This is an advanced + // feature because the BackupEngine user is trusted to keep track of files + // such that the DB can be restored. + // + // Input to the callback is a [begin,end) range of sharable files that are + // live in the DB being backed up, and the callback implementation sets + // exclude_decision=true for files to exclude. A callback offers maximum + // flexibility, e.g. for files that are unavailable at backup time but + // whose existence has been recorded somewhere. In case of an empty or + // no-op callback, all files are included in the backup. + // + // To restore the DB, RestoreOptions::alternate_dirs must be used to provide + // the excluded files. + // + // An exception thrown from the callback will result in Status::Aborted from + // the operation. + std::function<void(MaybeExcludeBackupFile* files_begin, MaybeExcludeBackupFile* files_end)> + exclude_files_callback = {}; // If false, background_thread_cpu_priority is ignored. // Otherwise, the cpu priority can be decreased, @@ -300,6 +346,11 @@ struct RestoreOptions { // Default: false bool keep_log_files; + // For backups that were created using exclude_files_callback, this + // option enables restoring those backups by providing BackupEngines on + // directories known to contain the required files. + std::forward_list<BackupEngineReadOnlyBase*> alternate_dirs; + explicit RestoreOptions(bool _keep_log_files = false) : keep_log_files(_keep_log_files) {} }; @@ -324,9 +375,15 @@ struct BackupInfo { // Backup API user metadata std::string app_metadata; - // Backup file details, if requested with include_file_details=true + // Backup file details, if requested with include_file_details=true. + // Does not include excluded_files. std::vector<BackupFileInfo> file_details; + // Identifying information about shared files that were excluded from the + // created backup. See exclude_files_callback and alternate_dirs. + // This information is only provided if include_file_details=true. + std::vector<BackupExcludedFileInfo> excluded_files; + // DB "name" (a directory in the backup_env) for opening this backup as a // read-only DB.
This should also be used as the DBOptions::wal_dir, such // as by default setting wal_dir="". See also env_for_open. @@ -348,8 +405,8 @@ struct BackupInfo { BackupInfo() {} - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files, const std::string& _app_metadata) + explicit BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) : backup_id(_backup_id), timestamp(_timestamp), size(_size), @@ -364,8 +421,8 @@ class BackupStatistics { number_fail_backup = 0; } - BackupStatistics(uint32_t _number_success_backup, - uint32_t _number_fail_backup) + explicit BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) : number_success_backup(_number_success_backup), number_fail_backup(_number_fail_backup) {} @@ -462,6 +519,9 @@ class BackupEngineReadOnlyBase { // Returns Status::OK() if all checks are good virtual IOStatus VerifyBackup(BackupID backup_id, bool verify_with_checksum = false) const = 0; + + // Internal use only + virtual BackupEngine* AsBackupEngine() = 0; }; // Append-only functions of a BackupEngine. See BackupEngine comment for diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index 7fb9d489a34..6046513aba4 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -10,6 +10,7 @@ #include #include + #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h index 8e96ac4108d..ffde5effad0 100644 --- a/include/rocksdb/utilities/env_mirror.h +++ b/include/rocksdb/utilities/env_mirror.h @@ -23,6 +23,7 @@ #include #include #include + #include "rocksdb/env.h" namespace ROCKSDB_NAMESPACE { diff --git a/include/rocksdb/utilities/option_change_migration.h b/include/rocksdb/utilities/option_change_migration.h index 5d9ddda6270..a73324a9e34 100644 --- a/include/rocksdb/utilities/option_change_migration.h +++ b/include/rocksdb/utilities/option_change_migration.h @@ -6,6 +6,7 @@ #pragma once #include + #include "rocksdb/options.h" #include "rocksdb/status.h" diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index 3948ea344a9..cd340ed5967 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -111,14 +111,14 @@ enum class OptionTypeFlags : uint32_t { kStringNameOnly = 0x8000, // The option serializes to a name only }; -inline OptionTypeFlags operator|(const OptionTypeFlags &a, - const OptionTypeFlags &b) { +inline OptionTypeFlags operator|(const OptionTypeFlags& a, + const OptionTypeFlags& b) { return static_cast(static_cast(a) | static_cast(b)); } -inline OptionTypeFlags operator&(const OptionTypeFlags &a, - const OptionTypeFlags &b) { +inline OptionTypeFlags operator&(const OptionTypeFlags& a, + const OptionTypeFlags& b) { return static_cast(static_cast(a) & static_cast(b)); } diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index 17143916b30..a682c774841 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -6,8 +6,10 @@ #pragma once #include + #include #include + #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index ac4835d5afc..dbb6bb31a72 100644 --- 
a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -7,6 +7,7 @@ #include #include #include + #include "rocksdb/db.h" #ifdef _WIN32 @@ -131,8 +132,8 @@ class StackableDB : public DB { const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input = false) override { - return db_->MultiGet(options, column_family, num_keys, keys, - values, statuses, sorted_input); + return db_->MultiGet(options, column_family, num_keys, keys, values, + statuses, sorted_input); } using DB::IngestExternalFile; @@ -214,6 +215,10 @@ class StackableDB : public DB { const Slice& value) override { return db_->Merge(options, column_family, key, value); } + Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, const Slice& ts, const Slice& value) override { + return db_->Merge(options, column_family, key, ts, value); + } virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { return db_->Write(opts, updates); @@ -473,8 +478,7 @@ class StackableDB : public DB { return db_->GetCurrentWalFile(current_log_file); } - virtual Status GetCreationTimeOfOldestFile( - uint64_t* creation_time) override { + virtual Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override { return db_->GetCreationTimeOfOldestFile(creation_time); } diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index b8f7076339b..1d2822988fc 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -627,7 +627,7 @@ class Transaction { PREPARED = 2, AWAITING_COMMIT = 3, COMMITTED = 4, - COMMITED = COMMITTED, // old misspelled name + COMMITED = COMMITTED, // old misspelled name AWAITING_ROLLBACK = 5, ROLLEDBACK = 6, LOCKS_STOLEN = 7, diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index aefcd6de151..741c5957474 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -25,8 +25,8 @@ class TransactionDBMutexFactory; enum TxnDBWritePolicy { WRITE_COMMITTED = 0, // write only the committed data - WRITE_PREPARED, // write data after the prepare phase of 2pc - WRITE_UNPREPARED // write data before the prepare phase of 2pc + WRITE_PREPARED, // write data after the prepare phase of 2pc + WRITE_UNPREPARED // write data before the prepare phase of 2pc }; constexpr uint32_t kInitialMaxDeadlocks = 5; diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 4c5cfe5f5ef..18f6e11190f 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -12,8 +12,8 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. #define ROCKSDB_MAJOR 7 -#define ROCKSDB_MINOR 8 -#define ROCKSDB_PATCH 3 +#define ROCKSDB_MINOR 10 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index d0ddcf49ebd..6f6079a1276 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -172,10 +172,7 @@ class WriteBatch : public WriteBatchBase { return Merge(nullptr, key, value); } Status Merge(ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, - const Slice& /*ts*/, const Slice& /*value*/) override { - return Status::NotSupported( - "Merge does not support user-defined timestamp"); - } + const Slice& /*ts*/, const Slice& /*value*/) override; // variant that takes SliceParts Status Merge(ColumnFamilyHandle* column_family, const SliceParts& key, @@ -219,6 +216,7 @@ class WriteBatch : public WriteBatchBase { Status PopSavePoint() override; // Support for iterating over the contents of a batch. + // Objects of subclasses of Handler will be used by WriteBatch::Iterate(). class Handler { public: virtual ~Handler(); @@ -229,6 +227,7 @@ class WriteBatch : public WriteBatchBase { // default implementation will just call Put without column family for // backwards compatibility. If the column family is not default, // the function is noop + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) { if (column_family_id == 0) { @@ -241,14 +240,17 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and PutCF not implemented"); } + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual void Put(const Slice& /*key*/, const Slice& /*value*/) {} + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */, const Slice& /* entity */) { return Status::NotSupported("PutEntityCF not implemented"); } + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { if (column_family_id == 0) { Delete(key); @@ -257,8 +259,10 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and DeleteCF not implemented"); } + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual void Delete(const Slice& /*key*/) {} + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) { if (column_family_id == 0) { SingleDelete(key); @@ -267,14 +271,18 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and SingleDeleteCF not implemented"); } + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual void SingleDelete(const Slice& /*key*/) {} + // If user-defined timestamp is enabled, then `begin_key` and `end_key` + // both include timestamp. virtual Status DeleteRangeCF(uint32_t /*column_family_id*/, const Slice& /*begin_key*/, const Slice& /*end_key*/) { return Status::InvalidArgument("DeleteRangeCF not implemented"); } + // If user-defined timestamp is enabled, then `key` includes timestamp. 
virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) { if (column_family_id == 0) { @@ -284,8 +292,10 @@ class WriteBatch : public WriteBatchBase { return Status::InvalidArgument( "non-default column family and MergeCF not implemented"); } + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual void Merge(const Slice& /*key*/, const Slice& /*value*/) {} + // If user-defined timestamp is enabled, then `key` includes timestamp. virtual Status PutBlobIndexCF(uint32_t /*column_family_id*/, const Slice& /*key*/, const Slice& /*value*/) { diff --git a/java/GetBenchmarks.md b/java/GetBenchmarks.md new file mode 100644 index 00000000000..b66a897e212 --- /dev/null +++ b/java/GetBenchmarks.md @@ -0,0 +1,161 @@
+# RocksDB Get Performance Benchmarks
+
+Results associated with [Improve Java API `get()` performance by reducing copies](https://github.com/facebook/rocksdb/pull/10970)
+
+## Build/Run
+
+Mac
+```
+make clean jclean
+DEBUG_LEVEL=0 make -j12 rocksdbjava
+(cd java/target; cp rocksdbjni-7.9.0-osx.jar rocksdbjni-7.9.0-SNAPSHOT-osx.jar)
+mvn install:install-file -Dfile=./java/target/rocksdbjni-7.9.0-SNAPSHOT-osx.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.9.0-SNAPSHOT -Dpackaging=jar
+```
+
+Linux
+```
+make clean jclean
+DEBUG_LEVEL=0 make -j12 rocksdbjava
+(cd java/target; cp rocksdbjni-7.9.0-linux64.jar rocksdbjni-7.9.0-SNAPSHOT-linux64.jar)
+mvn install:install-file -Dfile=./java/target/rocksdbjni-7.9.0-SNAPSHOT-linux64.jar -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.9.0-SNAPSHOT -Dpackaging=jar
+```
+
+Build the jmh test package, on either platform
+```
+pushd java/jmh
+mvn clean package
+```
+
+A quick test run, just as a sanity check, using a small number of keys, would be
+```
+java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000 -p keySize=128 -p valueSize=32768 -p columnFamilyTestType="no_column_family" GetBenchmarks
+```
+The long performance run (as big as we can make it on our Ubuntu box without filling the disk) is
+```
+java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,50000 -p keySize=128 -p valueSize=1024,16384 -p columnFamilyTestType="1_column_family","20_column_families" GetBenchmarks.get GetBenchmarks.preallocatedByteBufferGet GetBenchmarks.preallocatedGet
+```
+
+## Results (small runs, Mac)
+
+These are run on a 10-core M1 with 64GB of memory and 2TB of SSD.
+They probably reflect the absolute best case for this optimization, hitting in-memory buffers and completely eliminating a buffer copy.
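+
+For orientation, the three benchmarks exercise three RocksJava read paths. Here is an illustrative sketch of the call shapes (the class name and DB path are made up; the `get` overloads are the ones used by `GetBenchmarks.java` later in this diff):
+```
+import java.nio.ByteBuffer;
+
+import org.rocksdb.*;
+
+public class GetVariants {
+  public static void main(final String[] args) throws RocksDBException {
+    RocksDB.loadLibrary();
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, "/tmp/get-variants-db");
+         final ReadOptions readOptions = new ReadOptions()) {
+      db.put("key1".getBytes(), "value1".getBytes());
+
+      // GetBenchmarks.get: allocates a fresh byte[] for every value read
+      final byte[] value = db.get("key1".getBytes());
+
+      // GetBenchmarks.preallocatedGet: caller supplies the value buffer;
+      // the return is the full value size, even if the copy was truncated
+      final byte[] valueArr = new byte[1024];
+      final int size = db.get("key1".getBytes(), valueArr);
+
+      // GetBenchmarks.preallocatedByteBufferGet: direct ByteBuffers for key
+      // and value let the JNI layer avoid intermediate copies
+      final ByteBuffer keyBuf = ByteBuffer.allocateDirect(16);
+      keyBuf.put("key1".getBytes()).flip();
+      final ByteBuffer valueBuf = ByteBuffer.allocateDirect(1024);
+      final int size2 = db.get(readOptions, keyBuf, valueBuf);
+
+      System.out.println(new String(value) + " " + size + " " + size2);
+    }
+  }
+}
+```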
+
+### Before
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get no_column_family 1000 128 N/A 32768 thrpt 25 43496.578 ± 5743.090 ops/s
+GetBenchmarks.preallocatedByteBufferGet no_column_family 1000 128 N/A 32768 thrpt 25 70765.578 ± 697.548 ops/s
+GetBenchmarks.preallocatedGet no_column_family 1000 128 N/A 32768 thrpt 25 69883.554 ± 944.184 ops/s
+
+### After fixing byte[] (.get and .preallocatedGet)
+
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get no_column_family 1000 128 N/A 32768 thrpt 25 149207.681 ± 2261.671 ops/s
+GetBenchmarks.preallocatedByteBufferGet no_column_family 1000 128 N/A 32768 thrpt 25 68920.489 ± 1574.664 ops/s
+GetBenchmarks.preallocatedGet no_column_family 1000 128 N/A 32768 thrpt 25 177399.022 ± 2107.375 ops/s
+
+### After fixing ByteBuffer (.preallocatedByteBufferGet)
+
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (multiGetSize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get no_column_family 1000 128 N/A 32768 thrpt 25 150389.259 ± 1371.473 ops/s
+GetBenchmarks.preallocatedByteBufferGet no_column_family 1000 128 N/A 32768 thrpt 25 179919.468 ± 1670.714 ops/s
+GetBenchmarks.preallocatedGet no_column_family 1000 128 N/A 32768 thrpt 25 178261.938 ± 2630.571 ops/s
+## Results (Ubuntu, big runs)
+These take 3-4 hours
+```
+java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,50000 -p keySize=128 -p valueSize=1024,16384 -p columnFamilyTestType="1_column_family","20_column_families" GetBenchmarks.get GetBenchmarks.preallocatedByteBufferGet GetBenchmarks.preallocatedGet
+```
+It's clear that all `get()` variants have noticeably improved performance, though not the spectacular gains of the M1.
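+
+A note on mechanics before the tables: the trailing method names on the command line select the `@Benchmark` methods to run, and each `-p name=v1,v2` overrides the matching `@Param` field, with jmh running every combination. A trimmed, hypothetical sketch of the pattern (the field names mirror the real `GetBenchmarks.java` later in this diff):
+```
+import org.openjdk.jmh.annotations.*;
+
+@State(Scope.Benchmark)
+public class ParamSketch {
+  // Overridable from the command line, e.g. -p keyCount=1000,50000
+  @Param({"1000", "50000"}) int keyCount;
+  @Param({"128"}) int keySize;
+  @Param({"1024", "16384"}) int valueSize;
+
+  @Benchmark
+  public int measure() {
+    // Stand-in for the db.get() call being measured
+    return keyCount + keySize + valueSize;
+  }
+}
+```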
+### With fixes for all of the `get()` instances
+
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get 1_column_family 1000 128 1024 thrpt 25 935648.793 ± 22879.910 ops/s
+GetBenchmarks.get 1_column_family 1000 128 16384 thrpt 25 204366.301 ± 1326.570 ops/s
+GetBenchmarks.get 1_column_family 50000 128 1024 thrpt 25 693451.990 ± 19822.720 ops/s
+GetBenchmarks.get 1_column_family 50000 128 16384 thrpt 25 50473.768 ± 497.335 ops/s
+GetBenchmarks.get 20_column_families 1000 128 1024 thrpt 25 550118.874 ± 14289.009 ops/s
+GetBenchmarks.get 20_column_families 1000 128 16384 thrpt 25 120545.549 ± 648.280 ops/s
+GetBenchmarks.get 20_column_families 50000 128 1024 thrpt 25 235671.353 ± 2231.195 ops/s
+GetBenchmarks.get 20_column_families 50000 128 16384 thrpt 25 12463.887 ± 1950.746 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 1000 128 1024 thrpt 25 1196026.040 ± 35435.729 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 1000 128 16384 thrpt 25 403252.655 ± 3287.054 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 50000 128 1024 thrpt 25 829965.448 ± 16945.452 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 50000 128 16384 thrpt 25 63798.042 ± 1292.858 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 1000 128 1024 thrpt 25 724557.253 ± 12710.828 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 1000 128 16384 thrpt 25 176846.615 ± 1121.644 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 50000 128 1024 thrpt 25 263553.764 ± 1304.243 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 50000 128 16384 thrpt 25 14721.693 ± 2574.240 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 1000 128 1024 thrpt 25 1093947.765 ± 42846.276 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 1000 128 16384 thrpt 25 391629.913 ± 4039.965 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 50000 128 1024 thrpt 25 769332.958 ± 24180.749 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 50000 128 16384 thrpt 25 61712.038 ± 423.494 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 1000 128 1024 thrpt 25 694684.465 ± 5484.205 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 1000 128 16384 thrpt 25 172383.593 ± 841.679 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 50000 128 1024 thrpt 25 257447.351 ± 1388.667 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 50000 128 16384 thrpt 25 13418.522 ± 2418.619 ops/s
+
+### Baseline (no fixes)
+
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get 1_column_family 1000 128 1024 thrpt 25 866745.224 ± 8834.629 ops/s
+GetBenchmarks.get 1_column_family 1000 128 16384 thrpt 25 184332.195 ± 2304.217 ops/s
+GetBenchmarks.get 1_column_family 50000 128 1024 thrpt 25 666794.288 ± 16150.684 ops/s
+GetBenchmarks.get 1_column_family 50000 128 16384 thrpt 25 47221.788 ± 433.165 ops/s
+GetBenchmarks.get 20_column_families 1000 128 1024 thrpt 25 551513.636 ± 7763.681 ops/s
+GetBenchmarks.get 20_column_families 1000 128 16384 thrpt 25 113117.720 ± 580.738 ops/s
+GetBenchmarks.get 20_column_families 50000 128 1024 thrpt 25 238675.555 ± 1758.978 ops/s
+GetBenchmarks.get 20_column_families 50000 128 16384 thrpt 25 11639.390 ± 1459.765 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 1000 128 1024 thrpt 25 1153617.917 ± 26350.028 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 1000 128 16384 thrpt 25 401710.334 ± 4324.539 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 50000 128 1024 thrpt 25 809384.073 ± 13833.871 ops/s
+GetBenchmarks.preallocatedByteBufferGet 1_column_family 50000 128 16384 thrpt 25 59279.005 ± 443.207 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 1000 128 1024 thrpt 25 715466.403 ± 6591.375 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 1000 128 16384 thrpt 25 175279.163 ± 910.923 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 50000 128 1024 thrpt 25 263295.180 ± 856.456 ops/s
+GetBenchmarks.preallocatedByteBufferGet 20_column_families 50000 128 16384 thrpt 25 14001.928 ± 2462.067 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 1000 128 1024 thrpt 25 1072866.854 ± 27030.592 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 1000 128 16384 thrpt 25 383950.853 ± 4510.654 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 50000 128 1024 thrpt 25 764395.469 ± 10097.417 ops/s
+GetBenchmarks.preallocatedGet 1_column_family 50000 128 16384 thrpt 25 56851.330 ± 388.029 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 1000 128 1024 thrpt 25 668518.593 ± 9764.117 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 1000 128 16384 thrpt 25 171309.695 ± 875.895 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 50000 128 1024 thrpt 25 256057.801 ± 954.621 ops/s
+GetBenchmarks.preallocatedGet 20_column_families 50000 128 16384 thrpt 25 13319.380 ± 2126.654 ops/s
+
+### Comparison
+
+The improvement looks best when the data is cached, that is, with the smallest number of column families and the fewest keys. In each pair below, the first row is the build with the fixes and the second is the baseline.
+
+GetBenchmarks.get 1_column_family 1000 128 16384 thrpt 25 204366.301 ± 1326.570 ops/s
+GetBenchmarks.get 1_column_family 1000 128 16384 thrpt 25 184332.195 ± 2304.217 ops/s
+
+GetBenchmarks.get 1_column_family 50000 128 16384 thrpt 25 50473.768 ± 497.335 ops/s
+GetBenchmarks.get 1_column_family 50000 128 16384 thrpt 25 47221.788 ± 433.165 ops/s
+
+GetBenchmarks.get 20_column_families 1000 128 16384 thrpt 25 120545.549 ± 648.280 ops/s
+GetBenchmarks.get 20_column_families 1000 128 16384 thrpt 25 113117.720 ± 580.738 ops/s
+
+GetBenchmarks.get 20_column_families 50000 128 16384 thrpt 25 12463.887 ± 1950.746 ops/s
+GetBenchmarks.get 20_column_families 50000 128 16384 thrpt 25 11639.390 ± 1459.765 ops/s
+
+### Baseline
+25 minute run, small number of keys
+```
+java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000 -p keySize=128 -p valueSize=32768 -p columnFamilyTestType="no_column_families" GetBenchmarks.get GetBenchmarks.preallocatedByteBufferGet GetBenchmarks.preallocatedGet
+```
+
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get no_column_families 1000 128 32768 thrpt 25 32344.908 ± 296.651 ops/s
+GetBenchmarks.preallocatedByteBufferGet no_column_families 1000 128 32768 thrpt 25 45266.968 ± 424.514 ops/s
+GetBenchmarks.preallocatedGet no_column_families 1000 128 32768 thrpt 25 43531.088 ± 291.785 ops/s
+
+### Optimized
+
+Benchmark (columnFamilyTestType) (keyCount) (keySize) (valueSize) Mode Cnt Score Error Units
+GetBenchmarks.get no_column_families 1000 128 32768 thrpt 25 37463.716 ± 235.744 ops/s
+GetBenchmarks.preallocatedByteBufferGet no_column_families 1000 128 32768 thrpt 25 48946.105 ± 466.463 ops/s
+GetBenchmarks.preallocatedGet no_column_families 1000 128 32768 thrpt 25 47143.624 ± 576.763 ops/s
+
+## Conclusion
+
+The performance improvement is real: the paired `get()` comparisons above show roughly 7-11% higher throughput on the Ubuntu box, and the small Mac runs improve by roughly 2.5-3.5x once all three paths are fixed.
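+
+As a closing usage note (an illustrative sketch, not code from this PR): applications benefit most from the direct-`ByteBuffer` path when the buffers are allocated once and reused across calls, since the win is precisely the avoided per-call allocation and copy. The class and method names below are hypothetical; the `get(ReadOptions, ByteBuffer, ByteBuffer)` overload is the one benchmarked above.
+```
+import java.nio.ByteBuffer;
+
+import org.rocksdb.*;
+
+public class ReuseSketch {
+  // Reads every key with one reused pair of direct buffers.
+  // Assumes keys fit in 128 bytes and values in 16384 bytes.
+  public static void readAll(final RocksDB db, final ReadOptions readOptions,
+      final byte[][] keys) throws RocksDBException {
+    final ByteBuffer keyBuf = ByteBuffer.allocateDirect(128);
+    final ByteBuffer valueBuf = ByteBuffer.allocateDirect(16384);
+    for (final byte[] key : keys) {
+      keyBuf.clear();
+      keyBuf.put(key).flip();
+      valueBuf.clear();
+      final int size = db.get(readOptions, keyBuf, valueBuf);
+      if (size == RocksDB.NOT_FOUND) {
+        continue; // no such key
+      }
+      // valueBuf now exposes the (possibly truncated) value bytes;
+      // size is the full value length.
+    }
+  }
+}
+```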
+ diff --git a/java/Makefile b/java/Makefile index bc171079ba0..bc7e121c412 100644 --- a/java/Makefile +++ b/java/Makefile @@ -146,6 +146,7 @@ JAVA_TESTS = \ org.rocksdb.MemoryUtilTest\ org.rocksdb.MemTableTest\ org.rocksdb.MergeTest\ + org.rocksdb.MultiColumnRegressionTest \ org.rocksdb.MultiGetManyKeysTest\ org.rocksdb.MultiGetTest\ org.rocksdb.MixedOptionsTest\ diff --git a/java/jmh/pom.xml b/java/jmh/pom.xml index 26615da8617..3016aefa788 100644 --- a/java/jmh/pom.xml +++ b/java/jmh/pom.xml @@ -50,7 +50,7 @@ org.rocksdb rocksdbjni - 6.27.0-SNAPSHOT + 7.9.0-SNAPSHOT diff --git a/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java b/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java index e34005c2f41..1c4329b3a84 100644 --- a/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java +++ b/java/jmh/src/main/java/org/rocksdb/jmh/GetBenchmarks.java @@ -1,23 +1,24 @@ /** * Copyright (c) 2011-present, Facebook, Inc. All rights reserved. - * This source code is licensed under both the GPLv2 (found in the - * COPYING file in the root directory) and Apache 2.0 License - * (found in the LICENSE.Apache file in the root directory). + * This source code is licensed under both the GPLv2 (found in the + * COPYING file in the root directory) and Apache 2.0 License + * (found in the LICENSE.Apache file in the root directory). */ package org.rocksdb.jmh; -import org.openjdk.jmh.annotations.*; -import org.rocksdb.*; -import org.rocksdb.util.FileUtils; +import static org.rocksdb.util.KVUtils.ba; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; - -import static org.rocksdb.util.KVUtils.ba; +import org.openjdk.jmh.annotations.*; +import org.rocksdb.*; +import org.rocksdb.util.FileUtils; @State(Scope.Benchmark) public class GetBenchmarks { @@ -30,16 +31,24 @@ public class GetBenchmarks { }) String columnFamilyTestType; - @Param("100000") - int keyCount; + @Param({"1000", "100000"}) int keyCount; + + @Param({"12", "64", "128"}) int keySize; + + @Param({"64", "1024", "65536"}) int valueSize; Path dbDir; DBOptions options; + ReadOptions readOptions; int cfs = 0; // number of column families private AtomicInteger cfHandlesIdx; ColumnFamilyHandle[] cfHandles; RocksDB db; private final AtomicInteger keyIndex = new AtomicInteger(); + private ByteBuffer keyBuf; + private ByteBuffer valueBuf; + private byte[] keyArr; + private byte[] valueArr; @Setup(Level.Trial) public void setup() throws IOException, RocksDBException { @@ -50,6 +59,7 @@ public void setup() throws IOException, RocksDBException { options = new DBOptions() .setCreateIfMissing(true) .setCreateMissingColumnFamilies(true); + readOptions = new ReadOptions(); final List cfDescriptors = new ArrayList<>(); cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); @@ -74,16 +84,32 @@ public void setup() throws IOException, RocksDBException { cfHandles = cfHandlesList.toArray(new ColumnFamilyHandle[0]); // store initial data for retrieving via get - for (int i = 0; i < cfs; i++) { + keyArr = new byte[keySize]; + valueArr = new byte[valueSize]; + Arrays.fill(keyArr, (byte) 0x30); + Arrays.fill(valueArr, (byte) 0x30); + for (int i = 0; i <= cfs; i++) { for (int j = 0; j < keyCount; j++) { - db.put(cfHandles[i], ba("key" + j), ba("value" + j)); + final byte[] keyPrefix = ba("key" + j); + final byte[] valuePrefix = ba("value" + j); + 
System.arraycopy(keyPrefix, 0, keyArr, 0, keyPrefix.length); + System.arraycopy(valuePrefix, 0, valueArr, 0, valuePrefix.length); + db.put(cfHandles[i], keyArr, valueArr); } } - try (final FlushOptions flushOptions = new FlushOptions() - .setWaitForFlush(true)) { + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { db.flush(flushOptions); } + + keyBuf = ByteBuffer.allocateDirect(keySize); + valueBuf = ByteBuffer.allocateDirect(valueSize); + Arrays.fill(keyArr, (byte) 0x30); + Arrays.fill(valueArr, (byte) 0x30); + keyBuf.put(keyArr); + keyBuf.flip(); + valueBuf.put(valueArr); + valueBuf.flip(); } @TearDown(Level.Trial) @@ -93,13 +119,14 @@ public void cleanup() throws IOException { } db.close(); options.close(); + readOptions.close(); FileUtils.delete(dbDir); } private ColumnFamilyHandle getColumnFamily() { if (cfs == 0) { return cfHandles[0]; - } else if (cfs == 1) { + } else if (cfs == 1) { return cfHandles[1]; } else { int idx = cfHandlesIdx.getAndIncrement(); @@ -131,9 +158,58 @@ private int next() { return idx; } - @Benchmark - public byte[] get() throws RocksDBException { + // String -> byte[] + private byte[] getKeyArr() { + final int MAX_LEN = 9; // key100000 + final int keyIdx = next(); + final byte[] keyPrefix = ba("key" + keyIdx); + System.arraycopy(keyPrefix, 0, keyArr, 0, keyPrefix.length); + Arrays.fill(keyArr, keyPrefix.length, MAX_LEN, (byte) 0x30); + return keyArr; + } + + // String -> ByteBuffer + private ByteBuffer getKeyBuf() { + final int MAX_LEN = 9; // key100000 final int keyIdx = next(); - return db.get(getColumnFamily(), ba("key" + keyIdx)); + final String keyStr = "key" + keyIdx; + for (int i = 0; i < keyStr.length(); ++i) { + keyBuf.put(i, (byte) keyStr.charAt(i)); + } + for (int i = keyStr.length(); i < MAX_LEN; ++i) { + keyBuf.put(i, (byte) 0x30); + } + // Reset position for future reading + keyBuf.position(0); + return keyBuf; + } + + private byte[] getValueArr() { + return valueArr; + } + + private ByteBuffer getValueBuf() { + return valueBuf; + } + + @Benchmark + public void get() throws RocksDBException { + db.get(getColumnFamily(), getKeyArr()); + } + + @Benchmark + public void preallocatedGet() throws RocksDBException { + db.get(getColumnFamily(), getKeyArr(), getValueArr()); + } + + @Benchmark + public void preallocatedByteBufferGet() throws RocksDBException { + int res = db.get(getColumnFamily(), readOptions, getKeyBuf(), getValueBuf()); + // For testing correctness: + // assert res > 0; + // final byte[] ret = new byte[valueSize]; + // valueBuf.get(ret); + // System.out.println(str(ret)); + // valueBuf.flip(); } -} +} \ No newline at end of file diff --git a/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java b/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java index c8c8274443c..d374477160e 100644 --- a/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java +++ b/java/jmh/src/main/java/org/rocksdb/jmh/MultiGetBenchmarks.java @@ -194,24 +194,6 @@ public List multiGet10() throws RocksDBException { return new ArrayList<>(); } - @Benchmark - public List multiGetDirect10() throws RocksDBException { - final int fromKeyIdx = next(multiGetSize, keyCount); - if (fromKeyIdx >= 0) { - final List keys = keys(keyBuffersList, fromKeyIdx, fromKeyIdx + multiGetSize); - final List results = db.multiGetByteBuffers( - keys, valueBuffersList.subList(fromKeyIdx, fromKeyIdx + multiGetSize)); - for (final RocksDB.MultiGetInstance result : results) { - if (result.status.getCode() != Status.Code.Ok) - throw new 
RuntimeException("Test status assumption wrong"); - if (result.valueSize != valueSize) - throw new RuntimeException("Test valueSize assumption wrong"); - } - return results; - } - return new ArrayList<>(); - } - public static void main(final String[] args) throws RunnerException { final org.openjdk.jmh.runner.options.Options opt = new OptionsBuilder() diff --git a/java/pom.xml.template b/java/pom.xml.template index 4abff4768e4..8a1981c66de 100644 --- a/java/pom.xml.template +++ b/java/pom.xml.template @@ -59,8 +59,8 @@ - 1.7 - 1.7 + 1.8 + 1.8 UTF-8 diff --git a/java/rocksjni/compact_range_options.cc b/java/rocksjni/compact_range_options.cc index d0b91b47a00..77fbb8890e2 100644 --- a/java/rocksjni/compact_range_options.cc +++ b/java/rocksjni/compact_range_options.cc @@ -24,7 +24,6 @@ jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( return GET_CPLUSPLUS_POINTER(options); } - /* * Class: org_rocksdb_CompactRangeOptions * Method: exclusiveManualCompaction @@ -43,13 +42,14 @@ jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean exclusive_manual_compaction) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jboolean exclusive_manual_compaction) { auto* options = reinterpret_cast(jhandle); - options->exclusive_manual_compaction = static_cast(exclusive_manual_compaction); + options->exclusive_manual_compaction = + static_cast(exclusive_manual_compaction); } - /* * Class: org_rocksdb_CompactRangeOptions * Method: bottommostLevelCompaction @@ -83,8 +83,9 @@ void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( * Method: changeLevel * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->change_level); @@ -95,8 +96,8 @@ jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel * Method: setChangeLevel * Signature: (JZ)V */ -void Java_org_rocksdb_CompactRangeOptions_setChangeLevel - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { +void Java_org_rocksdb_CompactRangeOptions_setChangeLevel( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { auto* options = reinterpret_cast(jhandle); options->change_level = static_cast(change_level); @@ -107,8 +108,9 @@ void Java_org_rocksdb_CompactRangeOptions_setChangeLevel * Method: targetLevel * Signature: (J)I */ -jint Java_org_rocksdb_CompactRangeOptions_targetLevel - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +jint Java_org_rocksdb_CompactRangeOptions_targetLevel(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->target_level); @@ -119,8 +121,10 @@ jint Java_org_rocksdb_CompactRangeOptions_targetLevel * Method: setTargetLevel * Signature: (JI)V */ -void Java_org_rocksdb_CompactRangeOptions_setTargetLevel - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_level) { +void Java_org_rocksdb_CompactRangeOptions_setTargetLevel(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jint target_level) { auto* options = reinterpret_cast(jhandle); options->target_level = static_cast(target_level); @@ -131,8 +135,9 @@ void 
Java_org_rocksdb_CompactRangeOptions_setTargetLevel * Method: targetPathId * Signature: (J)I */ -jint Java_org_rocksdb_CompactRangeOptions_targetPathId - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +jint Java_org_rocksdb_CompactRangeOptions_targetPathId(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->target_path_id); @@ -143,8 +148,10 @@ jint Java_org_rocksdb_CompactRangeOptions_targetPathId * Method: setTargetPathId * Signature: (JI)V */ -void Java_org_rocksdb_CompactRangeOptions_setTargetPathId - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_path_id) { +void Java_org_rocksdb_CompactRangeOptions_setTargetPathId(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jint target_path_id) { auto* options = reinterpret_cast(jhandle); options->target_path_id = static_cast(target_path_id); @@ -155,8 +162,9 @@ void Java_org_rocksdb_CompactRangeOptions_setTargetPathId * Method: allowWriteStall * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->allow_write_stall); @@ -167,21 +175,22 @@ jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall * Method: setAllowWriteStall * Signature: (JZ)V */ -void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean allow_write_stall) { +void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jboolean allow_write_stall) { auto* options = reinterpret_cast(jhandle); options->allow_write_stall = static_cast(allow_write_stall); } - /* * Class: org_rocksdb_CompactRangeOptions * Method: maxSubcompactions * Signature: (J)I */ -jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->max_subcompactions); @@ -192,8 +201,8 @@ jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions * Method: setMaxSubcompactions * Signature: (JI)V */ -void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions - (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { +void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { auto* options = reinterpret_cast(jhandle); options->max_subcompactions = static_cast(max_subcompactions); @@ -204,8 +213,9 @@ void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactRangeOptions_disposeInternal( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +void Java_org_rocksdb_CompactRangeOptions_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); delete options; diff --git a/java/rocksjni/compaction_filter.cc b/java/rocksjni/compaction_filter.cc index c3a68cdf284..ea04996acd3 100644 --- a/java/rocksjni/compaction_filter.cc +++ b/java/rocksjni/compaction_filter.cc @@ -6,10 +6,11 @@ // This file implements the "bridge" between Java and C++ for // 
ROCKSDB_NAMESPACE::CompactionFilter. +#include "rocksdb/compaction_filter.h" + #include #include "include/org_rocksdb_AbstractCompactionFilter.h" -#include "rocksdb/compaction_filter.h" // diff --git a/java/rocksjni/compaction_filter_factory_jnicallback.cc b/java/rocksjni/compaction_filter_factory_jnicallback.cc index cacbf02c1cc..14285526f9d 100644 --- a/java/rocksjni/compaction_filter_factory_jnicallback.cc +++ b/java/rocksjni/compaction_filter_factory_jnicallback.cc @@ -7,38 +7,40 @@ // ROCKSDB_NAMESPACE::CompactionFilterFactory. #include "rocksjni/compaction_filter_factory_jnicallback.h" + #include "rocksjni/portal.h" namespace ROCKSDB_NAMESPACE { CompactionFilterFactoryJniCallback::CompactionFilterFactoryJniCallback( JNIEnv* env, jobject jcompaction_filter_factory) : JniCallback(env, jcompaction_filter_factory) { - // Note: The name of a CompactionFilterFactory will not change during // it's lifetime, so we cache it in a global var jmethodID jname_method_id = AbstractCompactionFilterFactoryJni::getNameMethodId(env); - if(jname_method_id == nullptr) { + if (jname_method_id == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } jstring jname = (jstring)env->CallObjectMethod(m_jcallback_obj, jname_method_id); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown return; } jboolean has_exception = JNI_FALSE; - m_name = JniUtil::copyString(env, jname, &has_exception); // also releases jname + m_name = + JniUtil::copyString(env, jname, &has_exception); // also releases jname if (has_exception == JNI_TRUE) { // exception thrown return; } m_jcreate_compaction_filter_methodid = - AbstractCompactionFilterFactoryJni::getCreateCompactionFilterMethodId(env); - if(m_jcreate_compaction_filter_methodid == nullptr) { + AbstractCompactionFilterFactoryJni::getCreateCompactionFilterMethodId( + env); + if (m_jcreate_compaction_filter_methodid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } @@ -48,18 +50,19 @@ const char* CompactionFilterFactoryJniCallback::Name() const { return m_name.get(); } -std::unique_ptr CompactionFilterFactoryJniCallback::CreateCompactionFilter( +std::unique_ptr +CompactionFilterFactoryJniCallback::CreateCompactionFilter( const CompactionFilter::Context& context) { jboolean attached_thread = JNI_FALSE; JNIEnv* env = getJniEnv(&attached_thread); assert(env != nullptr); - jlong addr_compaction_filter = env->CallLongMethod(m_jcallback_obj, - m_jcreate_compaction_filter_methodid, - static_cast(context.is_full_compaction), - static_cast(context.is_manual_compaction)); + jlong addr_compaction_filter = + env->CallLongMethod(m_jcallback_obj, m_jcreate_compaction_filter_methodid, + static_cast(context.is_full_compaction), + static_cast(context.is_manual_compaction)); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown from CallLongMethod env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); diff --git a/java/rocksjni/compaction_filter_factory_jnicallback.h b/java/rocksjni/compaction_filter_factory_jnicallback.h index eb2d5111d6a..2f26f8dbe55 100644 --- a/java/rocksjni/compaction_filter_factory_jnicallback.h +++ b/java/rocksjni/compaction_filter_factory_jnicallback.h @@ -10,6 +10,7 @@ #define JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_ #include + #include #include "rocksdb/compaction_filter.h" @@ -17,17 +18,18 @@ namespace ROCKSDB_NAMESPACE { -class CompactionFilterFactoryJniCallback : public JniCallback, public 
CompactionFilterFactory { +class CompactionFilterFactoryJniCallback : public JniCallback, + public CompactionFilterFactory { public: - CompactionFilterFactoryJniCallback( - JNIEnv* env, jobject jcompaction_filter_factory); - virtual std::unique_ptr CreateCompactionFilter( + CompactionFilterFactoryJniCallback(JNIEnv* env, + jobject jcompaction_filter_factory); + virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context); - virtual const char* Name() const; + virtual const char* Name() const; private: - std::unique_ptr m_name; - jmethodID m_jcreate_compaction_filter_methodid; + std::unique_ptr m_name; + jmethodID m_jcreate_compaction_filter_methodid; }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/compaction_job_info.cc b/java/rocksjni/compaction_job_info.cc index de65478b627..fb292f59ce5 100644 --- a/java/rocksjni/compaction_job_info.cc +++ b/java/rocksjni/compaction_job_info.cc @@ -18,8 +18,7 @@ * Method: newCompactionJobInfo * Signature: ()J */ -jlong Java_org_rocksdb_CompactionJobInfo_newCompactionJobInfo( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_CompactionJobInfo_newCompactionJobInfo(JNIEnv*, jclass) { auto* compact_job_info = new ROCKSDB_NAMESPACE::CompactionJobInfo(); return GET_CPLUSPLUS_POINTER(compact_job_info); } @@ -29,8 +28,8 @@ jlong Java_org_rocksdb_CompactionJobInfo_newCompactionJobInfo( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionJobInfo_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_CompactionJobInfo_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); delete compact_job_info; @@ -41,8 +40,9 @@ void Java_org_rocksdb_CompactionJobInfo_disposeInternal( * Method: columnFamilyName * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_CompactionJobInfo_columnFamilyName( - JNIEnv* env, jclass, jlong jhandle) { +jbyteArray Java_org_rocksdb_CompactionJobInfo_columnFamilyName(JNIEnv* env, + jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, compact_job_info->cf_name); @@ -53,8 +53,8 @@ jbyteArray Java_org_rocksdb_CompactionJobInfo_columnFamilyName( * Method: status * Signature: (J)Lorg/rocksdb/Status; */ -jobject Java_org_rocksdb_CompactionJobInfo_status( - JNIEnv* env, jclass, jlong jhandle) { +jobject Java_org_rocksdb_CompactionJobInfo_status(JNIEnv* env, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::StatusJni::construct(env, compact_job_info->status); @@ -65,8 +65,8 @@ jobject Java_org_rocksdb_CompactionJobInfo_status( * Method: threadId * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobInfo_threadId( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobInfo_threadId(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return static_cast(compact_job_info->thread_id); @@ -77,8 +77,7 @@ jlong Java_org_rocksdb_CompactionJobInfo_threadId( * Method: jobId * Signature: (J)I */ -jint Java_org_rocksdb_CompactionJobInfo_jobId( - JNIEnv*, jclass, jlong jhandle) { +jint Java_org_rocksdb_CompactionJobInfo_jobId(JNIEnv*, jclass, jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return static_cast(compact_job_info->job_id); @@ -89,8 +88,8 @@ jint Java_org_rocksdb_CompactionJobInfo_jobId( * Method: baseInputLevel * Signature: (J)I */ -jint Java_org_rocksdb_CompactionJobInfo_baseInputLevel( - JNIEnv*, jclass, jlong 
jhandle) { +jint Java_org_rocksdb_CompactionJobInfo_baseInputLevel(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return static_cast(compact_job_info->base_input_level); @@ -101,8 +100,8 @@ jint Java_org_rocksdb_CompactionJobInfo_baseInputLevel( * Method: outputLevel * Signature: (J)I */ -jint Java_org_rocksdb_CompactionJobInfo_outputLevel( - JNIEnv*, jclass, jlong jhandle) { +jint Java_org_rocksdb_CompactionJobInfo_outputLevel(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return static_cast(compact_job_info->output_level); @@ -113,8 +112,8 @@ jint Java_org_rocksdb_CompactionJobInfo_outputLevel( * Method: inputFiles * Signature: (J)[Ljava/lang/String; */ -jobjectArray Java_org_rocksdb_CompactionJobInfo_inputFiles( - JNIEnv* env, jclass, jlong jhandle) { +jobjectArray Java_org_rocksdb_CompactionJobInfo_inputFiles(JNIEnv* env, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::JniUtil::toJavaStrings( @@ -126,8 +125,8 @@ jobjectArray Java_org_rocksdb_CompactionJobInfo_inputFiles( * Method: outputFiles * Signature: (J)[Ljava/lang/String; */ -jobjectArray Java_org_rocksdb_CompactionJobInfo_outputFiles( - JNIEnv* env, jclass, jlong jhandle) { +jobjectArray Java_org_rocksdb_CompactionJobInfo_outputFiles(JNIEnv* env, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::JniUtil::toJavaStrings( @@ -139,8 +138,8 @@ jobjectArray Java_org_rocksdb_CompactionJobInfo_outputFiles( * Method: tableProperties * Signature: (J)Ljava/util/Map; */ -jobject Java_org_rocksdb_CompactionJobInfo_tableProperties( - JNIEnv* env, jclass, jlong jhandle) { +jobject Java_org_rocksdb_CompactionJobInfo_tableProperties(JNIEnv* env, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); auto* map = &compact_job_info->table_properties; @@ -196,8 +195,8 @@ jobject Java_org_rocksdb_CompactionJobInfo_tableProperties( * Method: compactionReason * Signature: (J)B */ -jbyte Java_org_rocksdb_CompactionJobInfo_compactionReason( - JNIEnv*, jclass, jlong jhandle) { +jbyte Java_org_rocksdb_CompactionJobInfo_compactionReason(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompactionReasonJni::toJavaCompactionReason( @@ -209,8 +208,8 @@ jbyte Java_org_rocksdb_CompactionJobInfo_compactionReason( * Method: compression * Signature: (J)B */ -jbyte Java_org_rocksdb_CompactionJobInfo_compression( - JNIEnv*, jclass, jlong jhandle) { +jbyte Java_org_rocksdb_CompactionJobInfo_compression(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( @@ -222,8 +221,7 @@ jbyte Java_org_rocksdb_CompactionJobInfo_compression( * Method: stats * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobInfo_stats( - JNIEnv *, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobInfo_stats(JNIEnv*, jclass, jlong jhandle) { auto* compact_job_info = reinterpret_cast(jhandle); auto* stats = new ROCKSDB_NAMESPACE::CompactionJobStats(); diff --git a/java/rocksjni/compaction_job_stats.cc b/java/rocksjni/compaction_job_stats.cc index a9bd9499e26..a2599c1321f 100644 --- a/java/rocksjni/compaction_job_stats.cc +++ b/java/rocksjni/compaction_job_stats.cc @@ -19,8 +19,8 @@ * Method: newCompactionJobStats * Signature: ()J */ -jlong Java_org_rocksdb_CompactionJobStats_newCompactionJobStats( - 
JNIEnv*, jclass) { +jlong Java_org_rocksdb_CompactionJobStats_newCompactionJobStats(JNIEnv*, + jclass) { auto* compact_job_stats = new ROCKSDB_NAMESPACE::CompactionJobStats(); return GET_CPLUSPLUS_POINTER(compact_job_stats); } @@ -30,8 +30,8 @@ jlong Java_org_rocksdb_CompactionJobStats_newCompactionJobStats( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionJobStats_disposeInternal( - JNIEnv *, jobject, jlong jhandle) { +void Java_org_rocksdb_CompactionJobStats_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); delete compact_job_stats; @@ -42,8 +42,7 @@ void Java_org_rocksdb_CompactionJobStats_disposeInternal( * Method: reset * Signature: (J)V */ -void Java_org_rocksdb_CompactionJobStats_reset( - JNIEnv*, jclass, jlong jhandle) { +void Java_org_rocksdb_CompactionJobStats_reset(JNIEnv*, jclass, jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); compact_job_stats->Reset(); @@ -54,8 +53,8 @@ void Java_org_rocksdb_CompactionJobStats_reset( * Method: add * Signature: (JJ)V */ -void Java_org_rocksdb_CompactionJobStats_add( - JNIEnv*, jclass, jlong jhandle, jlong jother_handle) { +void Java_org_rocksdb_CompactionJobStats_add(JNIEnv*, jclass, jlong jhandle, + jlong jother_handle) { auto* compact_job_stats = reinterpret_cast(jhandle); auto* other_compact_job_stats = @@ -68,8 +67,8 @@ void Java_org_rocksdb_CompactionJobStats_add( * Method: elapsedMicros * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_elapsedMicros( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_elapsedMicros(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); return static_cast(compact_job_stats->elapsed_micros); @@ -80,8 +79,8 @@ jlong Java_org_rocksdb_CompactionJobStats_elapsedMicros( * Method: numInputRecords * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numInputRecords( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numInputRecords(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); return static_cast(compact_job_stats->num_input_records); @@ -92,8 +91,8 @@ jlong Java_org_rocksdb_CompactionJobStats_numInputRecords( * Method: numInputFiles * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numInputFiles( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numInputFiles(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); return static_cast(compact_job_stats->num_input_files); @@ -108,8 +107,7 @@ jlong Java_org_rocksdb_CompactionJobStats_numInputFilesAtOutputLevel( JNIEnv*, jclass, jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_input_files_at_output_level); + return static_cast(compact_job_stats->num_input_files_at_output_level); } /* @@ -117,12 +115,11 @@ jlong Java_org_rocksdb_CompactionJobStats_numInputFilesAtOutputLevel( * Method: numOutputRecords * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numOutputRecords( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numOutputRecords(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_output_records); + return static_cast(compact_job_stats->num_output_records); } /* @@ -130,12 +127,11 @@ jlong 
Java_org_rocksdb_CompactionJobStats_numOutputRecords( * Method: numOutputFiles * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numOutputFiles( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numOutputFiles(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_output_files); + return static_cast(compact_job_stats->num_output_files); } /* @@ -143,8 +139,8 @@ jlong Java_org_rocksdb_CompactionJobStats_numOutputFiles( * Method: isManualCompaction * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompactionJobStats_isManualCompaction( - JNIEnv*, jclass, jlong jhandle) { +jboolean Java_org_rocksdb_CompactionJobStats_isManualCompaction(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); if (compact_job_stats->is_manual_compaction) { @@ -159,12 +155,11 @@ jboolean Java_org_rocksdb_CompactionJobStats_isManualCompaction( * Method: totalInputBytes * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_totalInputBytes( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_totalInputBytes(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->total_input_bytes); + return static_cast(compact_job_stats->total_input_bytes); } /* @@ -172,12 +167,11 @@ jlong Java_org_rocksdb_CompactionJobStats_totalInputBytes( * Method: totalOutputBytes * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_totalOutputBytes( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_totalOutputBytes(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->total_output_bytes); + return static_cast(compact_job_stats->total_output_bytes); } /* @@ -185,12 +179,11 @@ jlong Java_org_rocksdb_CompactionJobStats_totalOutputBytes( * Method: numRecordsReplaced * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numRecordsReplaced( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numRecordsReplaced(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_records_replaced); + return static_cast(compact_job_stats->num_records_replaced); } /* @@ -198,12 +191,11 @@ jlong Java_org_rocksdb_CompactionJobStats_numRecordsReplaced( * Method: totalInputRawKeyBytes * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_totalInputRawKeyBytes( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_totalInputRawKeyBytes(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->total_input_raw_key_bytes); + return static_cast(compact_job_stats->total_input_raw_key_bytes); } /* @@ -215,8 +207,7 @@ jlong Java_org_rocksdb_CompactionJobStats_totalInputRawValueBytes( JNIEnv*, jclass, jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->total_input_raw_value_bytes); + return static_cast(compact_job_stats->total_input_raw_value_bytes); } /* @@ -228,8 +219,7 @@ jlong Java_org_rocksdb_CompactionJobStats_numInputDeletionRecords( JNIEnv*, jclass, jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_input_deletion_records); + return 
static_cast(compact_job_stats->num_input_deletion_records); } /* @@ -241,8 +231,7 @@ jlong Java_org_rocksdb_CompactionJobStats_numExpiredDeletionRecords( JNIEnv*, jclass, jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_expired_deletion_records); + return static_cast(compact_job_stats->num_expired_deletion_records); } /* @@ -250,12 +239,11 @@ jlong Java_org_rocksdb_CompactionJobStats_numExpiredDeletionRecords( * Method: numCorruptKeys * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numCorruptKeys( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numCorruptKeys(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_corrupt_keys); + return static_cast(compact_job_stats->num_corrupt_keys); } /* @@ -263,12 +251,11 @@ jlong Java_org_rocksdb_CompactionJobStats_numCorruptKeys( * Method: fileWriteNanos * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_fileWriteNanos( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_fileWriteNanos(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->file_write_nanos); + return static_cast(compact_job_stats->file_write_nanos); } /* @@ -276,12 +263,11 @@ jlong Java_org_rocksdb_CompactionJobStats_fileWriteNanos( * Method: fileRangeSyncNanos * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_fileRangeSyncNanos( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_fileRangeSyncNanos(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->file_range_sync_nanos); + return static_cast(compact_job_stats->file_range_sync_nanos); } /* @@ -289,12 +275,11 @@ jlong Java_org_rocksdb_CompactionJobStats_fileRangeSyncNanos( * Method: fileFsyncNanos * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_fileFsyncNanos( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_fileFsyncNanos(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->file_fsync_nanos); + return static_cast(compact_job_stats->file_fsync_nanos); } /* @@ -302,12 +287,11 @@ jlong Java_org_rocksdb_CompactionJobStats_fileFsyncNanos( * Method: filePrepareWriteNanos * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_filePrepareWriteNanos( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_filePrepareWriteNanos(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->file_prepare_write_nanos); + return static_cast(compact_job_stats->file_prepare_write_nanos); } /* @@ -341,12 +325,11 @@ jbyteArray Java_org_rocksdb_CompactionJobStats_largestOutputKeyPrefix( * Method: numSingleDelFallthru * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numSingleDelFallthru( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numSingleDelFallthru(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_single_del_fallthru); + return static_cast(compact_job_stats->num_single_del_fallthru); } /* @@ -354,10 +337,9 @@ jlong 
Java_org_rocksdb_CompactionJobStats_numSingleDelFallthru( * Method: numSingleDelMismatch * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionJobStats_numSingleDelMismatch( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionJobStats_numSingleDelMismatch(JNIEnv*, jclass, + jlong jhandle) { auto* compact_job_stats = reinterpret_cast(jhandle); - return static_cast( - compact_job_stats->num_single_del_mismatch); + return static_cast(compact_job_stats->num_single_del_mismatch); } diff --git a/java/rocksjni/compaction_options.cc b/java/rocksjni/compaction_options.cc index f5ddcd6d466..bbbde0313fa 100644 --- a/java/rocksjni/compaction_options.cc +++ b/java/rocksjni/compaction_options.cc @@ -18,8 +18,7 @@ * Method: newCompactionOptions * Signature: ()J */ -jlong Java_org_rocksdb_CompactionOptions_newCompactionOptions( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_CompactionOptions_newCompactionOptions(JNIEnv*, jclass) { auto* compact_opts = new ROCKSDB_NAMESPACE::CompactionOptions(); return GET_CPLUSPLUS_POINTER(compact_opts); } @@ -29,8 +28,8 @@ jlong Java_org_rocksdb_CompactionOptions_newCompactionOptions( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionOptions_disposeInternal( - JNIEnv *, jobject, jlong jhandle) { +void Java_org_rocksdb_CompactionOptions_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* compact_opts = reinterpret_cast(jhandle); delete compact_opts; @@ -41,8 +40,8 @@ void Java_org_rocksdb_CompactionOptions_disposeInternal( * Method: compression * Signature: (J)B */ -jbyte Java_org_rocksdb_CompactionOptions_compression( - JNIEnv*, jclass, jlong jhandle) { +jbyte Java_org_rocksdb_CompactionOptions_compression(JNIEnv*, jclass, + jlong jhandle) { auto* compact_opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( @@ -68,12 +67,11 @@ void Java_org_rocksdb_CompactionOptions_setCompression( * Method: outputFileSizeLimit * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionOptions_outputFileSizeLimit( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_CompactionOptions_outputFileSizeLimit(JNIEnv*, jclass, + jlong jhandle) { auto* compact_opts = reinterpret_cast(jhandle); - return static_cast( - compact_opts->output_file_size_limit); + return static_cast(compact_opts->output_file_size_limit); } /* @@ -94,12 +92,11 @@ void Java_org_rocksdb_CompactionOptions_setOutputFileSizeLimit( * Method: maxSubcompactions * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptions_maxSubcompactions( - JNIEnv*, jclass, jlong jhandle) { +jint Java_org_rocksdb_CompactionOptions_maxSubcompactions(JNIEnv*, jclass, + jlong jhandle) { auto* compact_opts = reinterpret_cast(jhandle); - return static_cast( - compact_opts->max_subcompactions); + return static_cast(compact_opts->max_subcompactions); } /* @@ -111,6 +108,5 @@ void Java_org_rocksdb_CompactionOptions_setMaxSubcompactions( JNIEnv*, jclass, jlong jhandle, jint jmax_subcompactions) { auto* compact_opts = reinterpret_cast(jhandle); - compact_opts->max_subcompactions = - static_cast(jmax_subcompactions); + compact_opts->max_subcompactions = static_cast(jmax_subcompactions); } diff --git a/java/rocksjni/compaction_options_fifo.cc b/java/rocksjni/compaction_options_fifo.cc index 36f99749bd6..f6a47fec5b9 100644 --- a/java/rocksjni/compaction_options_fifo.cc +++ b/java/rocksjni/compaction_options_fifo.cc @@ -17,8 +17,8 @@ * Method: newCompactionOptionsFIFO * Signature: ()J */ -jlong 
Java_org_rocksdb_CompactionOptionsFIFO_newCompactionOptionsFIFO( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_CompactionOptionsFIFO_newCompactionOptionsFIFO(JNIEnv*, + jclass) { const auto* opt = new ROCKSDB_NAMESPACE::CompactionOptionsFIFO(); return GET_CPLUSPLUS_POINTER(opt); } @@ -40,8 +40,8 @@ void Java_org_rocksdb_CompactionOptionsFIFO_setMaxTableFilesSize( * Method: maxTableFilesSize * Signature: (J)J */ -jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_table_files_size); @@ -64,8 +64,9 @@ void Java_org_rocksdb_CompactionOptionsFIFO_setAllowCompaction( * Method: allowCompaction * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, + jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_compaction); @@ -76,7 +77,7 @@ jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompactionOptionsFIFO_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_CompactionOptionsFIFO_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/compaction_options_universal.cc b/java/rocksjni/compaction_options_universal.cc index c3a1401e60d..9fc6f315828 100644 --- a/java/rocksjni/compaction_options_universal.cc +++ b/java/rocksjni/compaction_options_universal.cc @@ -41,8 +41,8 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setSizeRatio( * Method: sizeRatio * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->size_ratio); @@ -65,8 +65,8 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMinMergeWidth( * Method: minMergeWidth * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->min_merge_width); @@ -89,8 +89,8 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxMergeWidth( * Method: maxMergeWidth * Signature: (J)I */ -jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_merge_width); @@ -127,8 +127,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_maxSizeAmplificationPercent( * Signature: (JI)V */ void Java_org_rocksdb_CompactionOptionsUniversal_setCompressionSizePercent( - JNIEnv*, jobject, jlong jhandle, - jint jcompression_size_percent) { + JNIEnv*, jobject, jlong jhandle, jint jcompression_size_percent) { auto* opt = reinterpret_cast(jhandle); opt->compression_size_percent = @@ -166,8 +165,8 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setStopStyle( * Method: stopStyle * Signature: (J)B */ -jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle( - 
JNIEnv*, jobject, jlong jhandle) {
+jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(JNIEnv*, jobject,
+                                                            jlong jhandle) {
   auto* opt =
       reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsUniversal*>(jhandle);
   return ROCKSDB_NAMESPACE::CompactionStopStyleJni::toJavaCompactionStopStyle(
diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc
index 4bf53a44656..11279c4ce08 100644
--- a/java/rocksjni/comparator.cc
+++ b/java/rocksjni/comparator.cc
@@ -39,8 +39,9 @@ jlong Java_org_rocksdb_AbstractComparator_createNewComparator(
 * Method: usingDirectBuffers
 * Signature: (J)Z
 */
-jboolean Java_org_rocksdb_AbstractComparator_usingDirectBuffers(
-    JNIEnv*, jobject, jlong jhandle) {
+jboolean Java_org_rocksdb_AbstractComparator_usingDirectBuffers(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
   auto* c =
       reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(jhandle);
   return static_cast<jboolean>(c->m_options->direct_buffer);
diff --git a/java/rocksjni/comparatorjnicallback.cc b/java/rocksjni/comparatorjnicallback.cc
index 248b15d3a1f..07ab9fa41cc 100644
--- a/java/rocksjni/comparatorjnicallback.cc
+++ b/java/rocksjni/comparatorjnicallback.cc
@@ -7,18 +7,18 @@
 // ROCKSDB_NAMESPACE::Comparator.
 
 #include "rocksjni/comparatorjnicallback.h"
+
 #include "rocksjni/portal.h"
 
 namespace ROCKSDB_NAMESPACE {
 ComparatorJniCallback::ComparatorJniCallback(
     JNIEnv* env, jobject jcomparator,
     const ComparatorJniCallbackOptions* options)
-    : JniCallback(env, jcomparator),
-      m_options(options) {
-
-  // cache the AbstractComparatorJniBridge class as we will reuse it many times for each callback
-  m_abstract_comparator_jni_bridge_clazz =
-      static_cast<jclass>(env->NewGlobalRef(AbstractComparatorJniBridge::getJClass(env)));
+    : JniCallback(env, jcomparator), m_options(options) {
+  // cache the AbstractComparatorJniBridge class as we will reuse it many times
+  // for each callback
+  m_abstract_comparator_jni_bridge_clazz = static_cast<jclass>(
+      env->NewGlobalRef(AbstractComparatorJniBridge::getJClass(env)));
 
   // Note: The name of a Comparator will not change during its lifetime,
   // so we cache it in a global var
@@ -34,7 +34,7 @@ ComparatorJniCallback::ComparatorJniCallback(
   }
   jboolean has_exception = JNI_FALSE;
   m_name = JniUtil::copyString(env, js_name,
-      &has_exception);  // also releases jsName
+                               &has_exception);  // also releases jsName
   if (has_exception == JNI_TRUE) {
     // exception thrown
     return;
@@ -52,16 +52,16 @@ ComparatorJniCallback::ComparatorJniCallback(
   }
 
   m_jshortest_mid =
-    AbstractComparatorJniBridge::getFindShortestSeparatorInternalMethodId(
-        env, m_abstract_comparator_jni_bridge_clazz);
+      AbstractComparatorJniBridge::getFindShortestSeparatorInternalMethodId(
+          env, m_abstract_comparator_jni_bridge_clazz);
   if (m_jshortest_mid == nullptr) {
     // exception thrown: NoSuchMethodException or OutOfMemoryError
     return;
   }
 
   m_jshort_mid =
-    AbstractComparatorJniBridge::getFindShortSuccessorInternalMethodId(env,
-      m_abstract_comparator_jni_bridge_clazz);
+      AbstractComparatorJniBridge::getFindShortSuccessorInternalMethodId(
+          env, m_abstract_comparator_jni_bridge_clazz);
   if (m_jshort_mid == nullptr) {
     // exception thrown: NoSuchMethodException or OutOfMemoryError
     return;
@@ -69,9 +69,8 @@ ComparatorJniCallback::ComparatorJniCallback(
 
   // do we need reusable buffers?
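[Editorial aside] The constructor above is about to choose how the reused buffers are guarded: per-thread buffers need no lock (each thread owns its own copy, freed by an UnrefHandler at thread exit), while buffers shared across threads are guarded by a port::Mutex that may be adaptive. A minimal standalone sketch of that decision follows; SyncMode and BufferPool are hypothetical names, and std::mutex stands in for RocksDB's port::Mutex only to keep the sketch self-contained.

// Sketch only: illustrates the THREAD_LOCAL vs (adaptive-)mutex choice made
// in ComparatorJniCallback's constructor; not the RocksDB implementation.
#include <memory>
#include <mutex>

enum class SyncMode { kMutex, kAdaptiveMutex, kThreadLocal };

struct BufferPool {
  explicit BufferPool(SyncMode mode) : mode_(mode) {
    if (mode_ != SyncMode::kThreadLocal) {
      // Shared buffers: one lock per call site (compare/shortest/short).
      mtx_ = std::make_unique<std::mutex>();
    }
    // kThreadLocal: every thread owns its buffers, so no lock is created;
    // RocksDB instead registers an UnrefHandler to free them at thread exit.
  }
  void MaybeLock() {
    if (mode_ != SyncMode::kThreadLocal) mtx_->lock();
  }
  void MaybeUnlock() {
    if (mode_ != SyncMode::kThreadLocal) mtx_->unlock();
  }
  SyncMode mode_;
  std::unique_ptr<std::mutex> mtx_;
};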
if (m_options->max_reused_buffer_size > -1) { - - if (m_options->reused_synchronisation_type - == ReusedSynchronisationType::THREAD_LOCAL) { + if (m_options->reused_synchronisation_type == + ReusedSynchronisationType::THREAD_LOCAL) { // buffers reused per thread UnrefHandler unref = [](void* ptr) { ThreadLocalBuf* tlb = reinterpret_cast(ptr); @@ -97,9 +96,9 @@ ComparatorJniCallback::ComparatorJniCallback( m_jshort_buf_key = nullptr; } else { - //buffers reused and shared across threads - const bool adaptive = - m_options->reused_synchronisation_type == ReusedSynchronisationType::ADAPTIVE_MUTEX; + // buffers reused and shared across threads + const bool adaptive = m_options->reused_synchronisation_type == + ReusedSynchronisationType::ADAPTIVE_MUTEX; mtx_compare = std::unique_ptr(new port::Mutex(adaptive)); mtx_shortest = std::unique_ptr(new port::Mutex(adaptive)); mtx_short = std::unique_ptr(new port::Mutex(adaptive)); @@ -220,9 +219,7 @@ ComparatorJniCallback::~ComparatorJniCallback() { releaseJniEnv(attached_thread); } -const char* ComparatorJniCallback::Name() const { - return m_name.get(); -} +const char* ComparatorJniCallback::Name() const { return m_name.get(); } int ComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { jboolean attached_thread = JNI_FALSE; @@ -236,38 +233,38 @@ int ComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { MaybeLockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b); - jobject jcompare_buf_a = GetBuffer(env, a, reuse_jbuf_a, m_tl_buf_a, m_jcompare_buf_a); + jobject jcompare_buf_a = + GetBuffer(env, a, reuse_jbuf_a, m_tl_buf_a, m_jcompare_buf_a); if (jcompare_buf_a == nullptr) { // exception occurred MaybeUnlockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b); - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return 0; } - jobject jcompare_buf_b = GetBuffer(env, b, reuse_jbuf_b, m_tl_buf_b, m_jcompare_buf_b); + jobject jcompare_buf_b = + GetBuffer(env, b, reuse_jbuf_b, m_tl_buf_b, m_jcompare_buf_b); if (jcompare_buf_b == nullptr) { // exception occurred if (!reuse_jbuf_a) { DeleteBuffer(env, jcompare_buf_a); } MaybeUnlockForReuse(mtx_compare, reuse_jbuf_a || reuse_jbuf_b); - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return 0; } - jint result = - env->CallStaticIntMethod( - m_abstract_comparator_jni_bridge_clazz, m_jcompare_mid, - m_jcallback_obj, - jcompare_buf_a, reuse_jbuf_a ? a.size() : -1, - jcompare_buf_b, reuse_jbuf_b ? b.size() : -1); + jint result = env->CallStaticIntMethod( + m_abstract_comparator_jni_bridge_clazz, m_jcompare_mid, m_jcallback_obj, + jcompare_buf_a, reuse_jbuf_a ? a.size() : -1, jcompare_buf_b, + reuse_jbuf_b ? 
b.size() : -1); if (env->ExceptionCheck()) { // exception thrown from CallIntMethod - env->ExceptionDescribe(); // print out exception to stderr - result = 0; // we could not get a result from java callback so use 0 + env->ExceptionDescribe(); // print out exception to stderr + result = 0; // we could not get a result from java callback so use 0 } if (!reuse_jbuf_a) { @@ -284,8 +281,8 @@ int ComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { return result; } -void ComparatorJniCallback::FindShortestSeparator( - std::string* start, const Slice& limit) const { +void ComparatorJniCallback::FindShortestSeparator(std::string* start, + const Slice& limit) const { if (start == nullptr) { return; } @@ -294,88 +291,90 @@ void ComparatorJniCallback::FindShortestSeparator( JNIEnv* env = getJniEnv(&attached_thread); assert(env != nullptr); - const bool reuse_jbuf_start = - static_cast(start->length()) <= m_options->max_reused_buffer_size; + const bool reuse_jbuf_start = static_cast(start->length()) <= + m_options->max_reused_buffer_size; const bool reuse_jbuf_limit = static_cast(limit.size()) <= m_options->max_reused_buffer_size; MaybeLockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit); Slice sstart(start->data(), start->length()); - jobject j_start_buf = GetBuffer(env, sstart, reuse_jbuf_start, m_tl_buf_a, m_jshortest_buf_start); + jobject j_start_buf = GetBuffer(env, sstart, reuse_jbuf_start, m_tl_buf_a, + m_jshortest_buf_start); if (j_start_buf == nullptr) { // exception occurred MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit); - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return; } - jobject j_limit_buf = GetBuffer(env, limit, reuse_jbuf_limit, m_tl_buf_b, m_jshortest_buf_limit); + jobject j_limit_buf = GetBuffer(env, limit, reuse_jbuf_limit, m_tl_buf_b, + m_jshortest_buf_limit); if (j_limit_buf == nullptr) { // exception occurred if (!reuse_jbuf_start) { DeleteBuffer(env, j_start_buf); } MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit); - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return; } jint jstart_len = env->CallStaticIntMethod( - m_abstract_comparator_jni_bridge_clazz, m_jshortest_mid, - m_jcallback_obj, - j_start_buf, reuse_jbuf_start ? start->length() : -1, - j_limit_buf, reuse_jbuf_limit ? limit.size() : -1); + m_abstract_comparator_jni_bridge_clazz, m_jshortest_mid, m_jcallback_obj, + j_start_buf, reuse_jbuf_start ? start->length() : -1, j_limit_buf, + reuse_jbuf_limit ? 
limit.size() : -1); if (env->ExceptionCheck()) { // exception thrown from CallIntMethod - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr } else if (static_cast(jstart_len) != start->length()) { // start buffer has changed in Java, so update `start` with the result bool copy_from_non_direct = false; if (reuse_jbuf_start) { - // reused a buffer - if (m_options->direct_buffer) { - // reused direct buffer - void* start_buf = env->GetDirectBufferAddress(j_start_buf); - if (start_buf == nullptr) { - if (!reuse_jbuf_start) { - DeleteBuffer(env, j_start_buf); - } - if (!reuse_jbuf_limit) { - DeleteBuffer(env, j_limit_buf); - } - MaybeUnlockForReuse(mtx_shortest, reuse_jbuf_start || reuse_jbuf_limit); - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Unable to get Direct Buffer Address"); - env->ExceptionDescribe(); // print out exception to stderr - releaseJniEnv(attached_thread); - return; + // reused a buffer + if (m_options->direct_buffer) { + // reused direct buffer + void* start_buf = env->GetDirectBufferAddress(j_start_buf); + if (start_buf == nullptr) { + if (!reuse_jbuf_start) { + DeleteBuffer(env, j_start_buf); } - start->assign(static_cast(start_buf), jstart_len); - - } else { - - // reused non-direct buffer - copy_from_non_direct = true; + if (!reuse_jbuf_limit) { + DeleteBuffer(env, j_limit_buf); + } + MaybeUnlockForReuse(mtx_shortest, + reuse_jbuf_start || reuse_jbuf_limit); + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Unable to get Direct Buffer Address"); + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return; } + start->assign(static_cast(start_buf), jstart_len); + + } else { + // reused non-direct buffer + copy_from_non_direct = true; + } } else { - // there was a new buffer - if (m_options->direct_buffer) { - // it was direct... don't forget to potentially truncate the `start` string - start->resize(jstart_len); - } else { - // it was non-direct - copy_from_non_direct = true; - } + // there was a new buffer + if (m_options->direct_buffer) { + // it was direct... 
don't forget to potentially truncate the `start` + // string + start->resize(jstart_len); + } else { + // it was non-direct + copy_from_non_direct = true; + } } if (copy_from_non_direct) { - jbyteArray jarray = ByteBufferJni::array(env, j_start_buf, - m_jbytebuffer_clazz); + jbyteArray jarray = + ByteBufferJni::array(env, j_start_buf, m_jbytebuffer_clazz); if (jarray == nullptr) { if (!reuse_jbuf_start) { DeleteBuffer(env, j_start_buf); @@ -389,9 +388,12 @@ void ComparatorJniCallback::FindShortestSeparator( return; } jboolean has_exception = JNI_FALSE; - JniUtil::byteString(env, jarray, [start, jstart_len](const char* data, const size_t) { - return start->assign(data, static_cast(jstart_len)); - }, &has_exception); + JniUtil::byteString( + env, jarray, + [start, jstart_len](const char* data, const size_t) { + return start->assign(data, static_cast(jstart_len)); + }, + &has_exception); env->DeleteLocalRef(jarray); if (has_exception == JNI_TRUE) { if (!reuse_jbuf_start) { @@ -420,8 +422,7 @@ void ComparatorJniCallback::FindShortestSeparator( releaseJniEnv(attached_thread); } -void ComparatorJniCallback::FindShortSuccessor( - std::string* key) const { +void ComparatorJniCallback::FindShortSuccessor(std::string* key) const { if (key == nullptr) { return; } @@ -436,18 +437,18 @@ void ComparatorJniCallback::FindShortSuccessor( MaybeLockForReuse(mtx_short, reuse_jbuf_key); Slice skey(key->data(), key->length()); - jobject j_key_buf = GetBuffer(env, skey, reuse_jbuf_key, m_tl_buf_a, m_jshort_buf_key); + jobject j_key_buf = + GetBuffer(env, skey, reuse_jbuf_key, m_tl_buf_a, m_jshort_buf_key); if (j_key_buf == nullptr) { // exception occurred MaybeUnlockForReuse(mtx_short, reuse_jbuf_key); - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return; } jint jkey_len = env->CallStaticIntMethod( - m_abstract_comparator_jni_bridge_clazz, m_jshort_mid, - m_jcallback_obj, + m_abstract_comparator_jni_bridge_clazz, m_jshort_mid, m_jcallback_obj, j_key_buf, reuse_jbuf_key ? 
key->length() : -1); if (env->ExceptionCheck()) { @@ -459,49 +460,48 @@ void ComparatorJniCallback::FindShortSuccessor( env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return; - } if (static_cast(jkey_len) != key->length()) { // key buffer has changed in Java, so update `key` with the result bool copy_from_non_direct = false; if (reuse_jbuf_key) { - // reused a buffer - if (m_options->direct_buffer) { - // reused direct buffer - void* key_buf = env->GetDirectBufferAddress(j_key_buf); - if (key_buf == nullptr) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Unable to get Direct Buffer Address"); - if (!reuse_jbuf_key) { - DeleteBuffer(env, j_key_buf); - } - MaybeUnlockForReuse(mtx_short, reuse_jbuf_key); - env->ExceptionDescribe(); // print out exception to stderr - releaseJniEnv(attached_thread); - return; + // reused a buffer + if (m_options->direct_buffer) { + // reused direct buffer + void* key_buf = env->GetDirectBufferAddress(j_key_buf); + if (key_buf == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Unable to get Direct Buffer Address"); + if (!reuse_jbuf_key) { + DeleteBuffer(env, j_key_buf); } - key->assign(static_cast(key_buf), jkey_len); - } else { - // reused non-direct buffer - copy_from_non_direct = true; + MaybeUnlockForReuse(mtx_short, reuse_jbuf_key); + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return; } + key->assign(static_cast(key_buf), jkey_len); + } else { + // reused non-direct buffer + copy_from_non_direct = true; + } } else { - // there was a new buffer - if (m_options->direct_buffer) { - // it was direct... don't forget to potentially truncate the `key` string - key->resize(jkey_len); - } else { - // it was non-direct - copy_from_non_direct = true; - } + // there was a new buffer + if (m_options->direct_buffer) { + // it was direct... 
don't forget to potentially truncate the `key` + // string + key->resize(jkey_len); + } else { + // it was non-direct + copy_from_non_direct = true; + } } if (copy_from_non_direct) { - jbyteArray jarray = ByteBufferJni::array(env, j_key_buf, - m_jbytebuffer_clazz); + jbyteArray jarray = + ByteBufferJni::array(env, j_key_buf, m_jbytebuffer_clazz); if (jarray == nullptr) { - if (!reuse_jbuf_key) { DeleteBuffer(env, j_key_buf); } @@ -511,9 +511,12 @@ void ComparatorJniCallback::FindShortSuccessor( return; } jboolean has_exception = JNI_FALSE; - JniUtil::byteString(env, jarray, [key, jkey_len](const char* data, const size_t) { - return key->assign(data, static_cast(jkey_len)); - }, &has_exception); + JniUtil::byteString( + env, jarray, + [key, jkey_len](const char* data, const size_t) { + return key->assign(data, static_cast(jkey_len)); + }, + &has_exception); env->DeleteLocalRef(jarray); if (has_exception == JNI_TRUE) { if (!reuse_jbuf_key) { @@ -539,8 +542,9 @@ void ComparatorJniCallback::FindShortSuccessor( inline void ComparatorJniCallback::MaybeLockForReuse( const std::unique_ptr& mutex, const bool cond) const { // no need to lock if using thread_local - if (m_options->reused_synchronisation_type != ReusedSynchronisationType::THREAD_LOCAL - && cond) { + if (m_options->reused_synchronisation_type != + ReusedSynchronisationType::THREAD_LOCAL && + cond) { mutex.get()->Lock(); } } @@ -548,18 +552,20 @@ inline void ComparatorJniCallback::MaybeLockForReuse( inline void ComparatorJniCallback::MaybeUnlockForReuse( const std::unique_ptr& mutex, const bool cond) const { // no need to unlock if using thread_local - if (m_options->reused_synchronisation_type != ReusedSynchronisationType::THREAD_LOCAL - && cond) { + if (m_options->reused_synchronisation_type != + ReusedSynchronisationType::THREAD_LOCAL && + cond) { mutex.get()->Unlock(); } } jobject ComparatorJniCallback::GetBuffer(JNIEnv* env, const Slice& src, - bool reuse_buffer, ThreadLocalPtr* tl_buf, jobject jreuse_buffer) const { + bool reuse_buffer, + ThreadLocalPtr* tl_buf, + jobject jreuse_buffer) const { if (reuse_buffer) { - if (m_options->reused_synchronisation_type - == ReusedSynchronisationType::THREAD_LOCAL) { - + if (m_options->reused_synchronisation_type == + ReusedSynchronisationType::THREAD_LOCAL) { // reuse thread-local bufffer ThreadLocalBuf* tlb = reinterpret_cast(tl_buf->Get()); if (tlb == nullptr) { @@ -576,25 +582,25 @@ jobject ComparatorJniCallback::GetBuffer(JNIEnv* env, const Slice& src, } return ReuseBuffer(env, src, tlb->jbuf); } else { - // reuse class member buffer return ReuseBuffer(env, src, jreuse_buffer); } } else { - // new buffer return NewBuffer(env, src); } } -jobject ComparatorJniCallback::ReuseBuffer( - JNIEnv* env, const Slice& src, jobject jreuse_buffer) const { +jobject ComparatorJniCallback::ReuseBuffer(JNIEnv* env, const Slice& src, + jobject jreuse_buffer) const { // we can reuse the buffer if (m_options->direct_buffer) { // copy into direct buffer void* buf = env->GetDirectBufferAddress(jreuse_buffer); if (buf == nullptr) { - // either memory region is undefined, given object is not a direct java.nio.Buffer, or JNI access to direct buffers is not supported by this virtual machine. + // either memory region is undefined, given object is not a direct + // java.nio.Buffer, or JNI access to direct buffers is not supported by + // this virtual machine. 
ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, "Unable to get Direct Buffer Address"); return nullptr; @@ -602,13 +608,14 @@ jobject ComparatorJniCallback::ReuseBuffer( memcpy(buf, src.data(), src.size()); } else { // copy into non-direct buffer - const jbyteArray jarray = ByteBufferJni::array(env, jreuse_buffer, - m_jbytebuffer_clazz); + const jbyteArray jarray = + ByteBufferJni::array(env, jreuse_buffer, m_jbytebuffer_clazz); if (jarray == nullptr) { // exception occurred return nullptr; } - env->SetByteArrayRegion(jarray, 0, static_cast(src.size()), + env->SetByteArrayRegion( + jarray, 0, static_cast(src.size()), const_cast(reinterpret_cast(src.data()))); if (env->ExceptionCheck()) { // exception occurred @@ -622,8 +629,9 @@ jobject ComparatorJniCallback::ReuseBuffer( jobject ComparatorJniCallback::NewBuffer(JNIEnv* env, const Slice& src) const { // we need a new buffer - jobject jbuf = ByteBufferJni::constructWith(env, m_options->direct_buffer, - src.data(), src.size(), m_jbytebuffer_clazz); + jobject jbuf = + ByteBufferJni::constructWith(env, m_options->direct_buffer, src.data(), + src.size(), m_jbytebuffer_clazz); if (jbuf == nullptr) { // exception occurred return nullptr; diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h index 2e27de00889..a983ce4b595 100644 --- a/java/rocksjni/comparatorjnicallback.h +++ b/java/rocksjni/comparatorjnicallback.h @@ -10,12 +10,14 @@ #define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ #include + #include #include -#include "rocksjni/jnicallback.h" + +#include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/slice.h" -#include "port/port.h" +#include "rocksjni/jnicallback.h" #include "util/thread_local.h" namespace ROCKSDB_NAMESPACE { @@ -41,7 +43,6 @@ enum ReusedSynchronisationType { }; struct ComparatorJniCallbackOptions { - // Set the synchronisation type used to guard the reused buffers. // Only used if max_reused_buffer_size > 0. 
// Default: ADAPTIVE_MUTEX @@ -83,54 +84,57 @@ struct ComparatorJniCallbackOptions { */ class ComparatorJniCallback : public JniCallback, public Comparator { public: - ComparatorJniCallback( - JNIEnv* env, jobject jcomparator, - const ComparatorJniCallbackOptions* options); - ~ComparatorJniCallback(); - virtual const char* Name() const; - virtual int Compare(const Slice& a, const Slice& b) const; - virtual void FindShortestSeparator( - std::string* start, const Slice& limit) const; - virtual void FindShortSuccessor(std::string* key) const; - const ComparatorJniCallbackOptions* m_options; + ComparatorJniCallback(JNIEnv* env, jobject jcomparator, + const ComparatorJniCallbackOptions* options); + ~ComparatorJniCallback(); + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + const ComparatorJniCallbackOptions* m_options; private: - struct ThreadLocalBuf { - ThreadLocalBuf(JavaVM* _jvm, bool _direct_buffer, jobject _jbuf) : - jvm(_jvm), direct_buffer(_direct_buffer), jbuf(_jbuf) {} - JavaVM* jvm; - bool direct_buffer; - jobject jbuf; - }; - inline void MaybeLockForReuse(const std::unique_ptr& mutex, - const bool cond) const; - inline void MaybeUnlockForReuse(const std::unique_ptr& mutex, - const bool cond) const; - jobject GetBuffer(JNIEnv* env, const Slice& src, bool reuse_buffer, - ThreadLocalPtr* tl_buf, jobject jreuse_buffer) const; - jobject ReuseBuffer(JNIEnv* env, const Slice& src, - jobject jreuse_buffer) const; - jobject NewBuffer(JNIEnv* env, const Slice& src) const; - void DeleteBuffer(JNIEnv* env, jobject jbuffer) const; - // used for synchronisation in compare method - std::unique_ptr mtx_compare; - // used for synchronisation in findShortestSeparator method - std::unique_ptr mtx_shortest; - // used for synchronisation in findShortSuccessor method - std::unique_ptr mtx_short; - std::unique_ptr m_name; - jclass m_abstract_comparator_jni_bridge_clazz; // TODO(AR) could we make this static somehow? - jclass m_jbytebuffer_clazz; // TODO(AR) we could cache this globally for the entire VM if we switch more APIs to use ByteBuffer // TODO(AR) could we make this static somehow? - jmethodID m_jcompare_mid; // TODO(AR) could we make this static somehow? - jmethodID m_jshortest_mid; // TODO(AR) could we make this static somehow? - jmethodID m_jshort_mid; // TODO(AR) could we make this static somehow? 
- jobject m_jcompare_buf_a; - jobject m_jcompare_buf_b; - jobject m_jshortest_buf_start; - jobject m_jshortest_buf_limit; - jobject m_jshort_buf_key; - ThreadLocalPtr* m_tl_buf_a; - ThreadLocalPtr* m_tl_buf_b; + struct ThreadLocalBuf { + ThreadLocalBuf(JavaVM* _jvm, bool _direct_buffer, jobject _jbuf) + : jvm(_jvm), direct_buffer(_direct_buffer), jbuf(_jbuf) {} + JavaVM* jvm; + bool direct_buffer; + jobject jbuf; + }; + inline void MaybeLockForReuse(const std::unique_ptr& mutex, + const bool cond) const; + inline void MaybeUnlockForReuse(const std::unique_ptr& mutex, + const bool cond) const; + jobject GetBuffer(JNIEnv* env, const Slice& src, bool reuse_buffer, + ThreadLocalPtr* tl_buf, jobject jreuse_buffer) const; + jobject ReuseBuffer(JNIEnv* env, const Slice& src, + jobject jreuse_buffer) const; + jobject NewBuffer(JNIEnv* env, const Slice& src) const; + void DeleteBuffer(JNIEnv* env, jobject jbuffer) const; + // used for synchronisation in compare method + std::unique_ptr mtx_compare; + // used for synchronisation in findShortestSeparator method + std::unique_ptr mtx_shortest; + // used for synchronisation in findShortSuccessor method + std::unique_ptr mtx_short; + std::unique_ptr m_name; + jclass m_abstract_comparator_jni_bridge_clazz; // TODO(AR) could we make this + // static somehow? + jclass m_jbytebuffer_clazz; // TODO(AR) we could cache this globally for the + // entire VM if we switch more APIs to use + // ByteBuffer // TODO(AR) could we make this + // static somehow? + jmethodID m_jcompare_mid; // TODO(AR) could we make this static somehow? + jmethodID m_jshortest_mid; // TODO(AR) could we make this static somehow? + jmethodID m_jshort_mid; // TODO(AR) could we make this static somehow? + jobject m_jcompare_buf_a; + jobject m_jcompare_buf_b; + jobject m_jshortest_buf_start; + jobject m_jshortest_buf_limit; + jobject m_jshort_buf_key; + ThreadLocalPtr* m_tl_buf_a; + ThreadLocalPtr* m_tl_buf_b; }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/compression_options.cc b/java/rocksjni/compression_options.cc index 78ea2a5592f..53f2405601e 100644 --- a/java/rocksjni/compression_options.cc +++ b/java/rocksjni/compression_options.cc @@ -17,8 +17,8 @@ * Method: newCompressionOptions * Signature: ()J */ -jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions(JNIEnv*, + jclass) { const auto* opt = new ROCKSDB_NAMESPACE::CompressionOptions(); return GET_CPLUSPLUS_POINTER(opt); } @@ -28,8 +28,9 @@ jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions( * Method: setWindowBits * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setWindowBits( - JNIEnv*, jobject, jlong jhandle, jint jwindow_bits) { +void Java_org_rocksdb_CompressionOptions_setWindowBits(JNIEnv*, jobject, + jlong jhandle, + jint jwindow_bits) { auto* opt = reinterpret_cast(jhandle); opt->window_bits = static_cast(jwindow_bits); } @@ -39,8 +40,8 @@ void Java_org_rocksdb_CompressionOptions_setWindowBits( * Method: windowBits * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_windowBits( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompressionOptions_windowBits(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->window_bits); } @@ -50,8 +51,8 @@ jint Java_org_rocksdb_CompressionOptions_windowBits( * Method: setLevel * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setLevel( - JNIEnv*, jobject, jlong jhandle, jint 
jlevel) { +void Java_org_rocksdb_CompressionOptions_setLevel(JNIEnv*, jobject, + jlong jhandle, jint jlevel) { auto* opt = reinterpret_cast(jhandle); opt->level = static_cast(jlevel); } @@ -61,8 +62,8 @@ void Java_org_rocksdb_CompressionOptions_setLevel( * Method: level * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_level( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompressionOptions_level(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->level); } @@ -72,8 +73,9 @@ jint Java_org_rocksdb_CompressionOptions_level( * Method: setStrategy * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setStrategy( - JNIEnv*, jobject, jlong jhandle, jint jstrategy) { +void Java_org_rocksdb_CompressionOptions_setStrategy(JNIEnv*, jobject, + jlong jhandle, + jint jstrategy) { auto* opt = reinterpret_cast(jhandle); opt->strategy = static_cast(jstrategy); } @@ -83,8 +85,8 @@ void Java_org_rocksdb_CompressionOptions_setStrategy( * Method: strategy * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_strategy( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompressionOptions_strategy(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->strategy); } @@ -94,8 +96,9 @@ jint Java_org_rocksdb_CompressionOptions_strategy( * Method: setMaxDictBytes * Signature: (JI)V */ -void Java_org_rocksdb_CompressionOptions_setMaxDictBytes( - JNIEnv*, jobject, jlong jhandle, jint jmax_dict_bytes) { +void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(JNIEnv*, jobject, + jlong jhandle, + jint jmax_dict_bytes) { auto* opt = reinterpret_cast(jhandle); opt->max_dict_bytes = static_cast(jmax_dict_bytes); } @@ -105,8 +108,8 @@ void Java_org_rocksdb_CompressionOptions_setMaxDictBytes( * Method: maxDictBytes * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_maxDictBytes( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompressionOptions_maxDictBytes(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_dict_bytes); } @@ -127,8 +130,8 @@ void Java_org_rocksdb_CompressionOptions_setZstdMaxTrainBytes( * Method: zstdMaxTrainBytes * Signature: (J)I */ -jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes( - JNIEnv *, jobject, jlong jhandle) { +jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->zstd_max_train_bytes); } @@ -183,8 +186,9 @@ jboolean Java_org_rocksdb_CompressionOptions_useZstdDictTrainer(JNIEnv*, * Method: setEnabled * Signature: (JZ)V */ -void Java_org_rocksdb_CompressionOptions_setEnabled( - JNIEnv*, jobject, jlong jhandle, jboolean jenabled) { +void Java_org_rocksdb_CompressionOptions_setEnabled(JNIEnv*, jobject, + jlong jhandle, + jboolean jenabled) { auto* opt = reinterpret_cast(jhandle); opt->enabled = jenabled == JNI_TRUE; } @@ -194,8 +198,8 @@ void Java_org_rocksdb_CompressionOptions_setEnabled( * Method: enabled * Signature: (J)Z */ -jboolean Java_org_rocksdb_CompressionOptions_enabled( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_CompressionOptions_enabled(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enabled); } @@ -204,7 +208,7 @@ jboolean Java_org_rocksdb_CompressionOptions_enabled( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_CompressionOptions_disposeInternal( - 
JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_CompressionOptions_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/java/rocksjni/config_options.cc b/java/rocksjni/config_options.cc index 3cda2524ef4..e6211132367 100644 --- a/java/rocksjni/config_options.cc +++ b/java/rocksjni/config_options.cc @@ -85,5 +85,6 @@ void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass, void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass, jlong handle, jbyte level) { auto *cfg_opt = reinterpret_cast(handle); - cfg_opt->sanity_level = ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level); + cfg_opt->sanity_level = + ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level); } diff --git a/java/rocksjni/env.cc b/java/rocksjni/env.cc index b40a9b1d574..bb739fe2b42 100644 --- a/java/rocksjni/env.cc +++ b/java/rocksjni/env.cc @@ -24,8 +24,7 @@ * Method: getDefaultEnvInternal * Signature: ()J */ -jlong Java_org_rocksdb_Env_getDefaultEnvInternal( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_Env_getDefaultEnvInternal(JNIEnv*, jclass) { return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::Env::Default()); } @@ -34,8 +33,8 @@ jlong Java_org_rocksdb_Env_getDefaultEnvInternal( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksEnv_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_RocksEnv_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* e = reinterpret_cast(jhandle); assert(e != nullptr); delete e; @@ -46,8 +45,9 @@ void Java_org_rocksdb_RocksEnv_disposeInternal( * Method: setBackgroundThreads * Signature: (JIB)V */ -void Java_org_rocksdb_Env_setBackgroundThreads( - JNIEnv*, jobject, jlong jhandle, jint jnum, jbyte jpriority_value) { +void Java_org_rocksdb_Env_setBackgroundThreads(JNIEnv*, jobject, jlong jhandle, + jint jnum, + jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); rocks_env->SetBackgroundThreads( static_cast(jnum), @@ -59,8 +59,8 @@ void Java_org_rocksdb_Env_setBackgroundThreads( * Method: getBackgroundThreads * Signature: (JB)I */ -jint Java_org_rocksdb_Env_getBackgroundThreads( - JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) { +jint Java_org_rocksdb_Env_getBackgroundThreads(JNIEnv*, jobject, jlong jhandle, + jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); const int num = rocks_env->GetBackgroundThreads( ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value)); @@ -72,8 +72,8 @@ jint Java_org_rocksdb_Env_getBackgroundThreads( * Method: getThreadPoolQueueLen * Signature: (JB)I */ -jint Java_org_rocksdb_Env_getThreadPoolQueueLen( - JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) { +jint Java_org_rocksdb_Env_getThreadPoolQueueLen(JNIEnv*, jobject, jlong jhandle, + jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); const int queue_len = rocks_env->GetThreadPoolQueueLen( ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value)); @@ -85,8 +85,9 @@ jint Java_org_rocksdb_Env_getThreadPoolQueueLen( * Method: incBackgroundThreadsIfNeeded * Signature: (JIB)V */ -void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded( - JNIEnv*, jobject, jlong jhandle, jint jnum, jbyte jpriority_value) { +void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded(JNIEnv*, jobject, + jlong jhandle, jint jnum, + jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); rocks_env->IncBackgroundThreadsIfNeeded( static_cast(jnum), @@ -98,8 +99,9 @@ void 
Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded( * Method: lowerThreadPoolIOPriority * Signature: (JB)V */ -void Java_org_rocksdb_Env_lowerThreadPoolIOPriority( - JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) { +void Java_org_rocksdb_Env_lowerThreadPoolIOPriority(JNIEnv*, jobject, + jlong jhandle, + jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); rocks_env->LowerThreadPoolIOPriority( ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value)); @@ -110,8 +112,9 @@ void Java_org_rocksdb_Env_lowerThreadPoolIOPriority( * Method: lowerThreadPoolCPUPriority * Signature: (JB)V */ -void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority( - JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) { +void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority(JNIEnv*, jobject, + jlong jhandle, + jbyte jpriority_value) { auto* rocks_env = reinterpret_cast(jhandle); rocks_env->LowerThreadPoolCPUPriority( ROCKSDB_NAMESPACE::PriorityJni::toCppPriority(jpriority_value)); @@ -122,8 +125,8 @@ void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority( * Method: getThreadList * Signature: (J)[Lorg/rocksdb/ThreadStatus; */ -jobjectArray Java_org_rocksdb_Env_getThreadList( - JNIEnv* env, jobject, jlong jhandle) { +jobjectArray Java_org_rocksdb_Env_getThreadList(JNIEnv* env, jobject, + jlong jhandle) { auto* rocks_env = reinterpret_cast(jhandle); std::vector thread_status; ROCKSDB_NAMESPACE::Status s = rocks_env->GetThreadList(&thread_status); @@ -160,8 +163,8 @@ jobjectArray Java_org_rocksdb_Env_getThreadList( * Method: createMemEnv * Signature: (J)J */ -jlong Java_org_rocksdb_RocksMemEnv_createMemEnv( - JNIEnv*, jclass, jlong jbase_env_handle) { +jlong Java_org_rocksdb_RocksMemEnv_createMemEnv(JNIEnv*, jclass, + jlong jbase_env_handle) { auto* base_env = reinterpret_cast(jbase_env_handle); return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::NewMemEnv(base_env)); } @@ -171,8 +174,8 @@ jlong Java_org_rocksdb_RocksMemEnv_createMemEnv( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksMemEnv_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_RocksMemEnv_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* e = reinterpret_cast(jhandle); assert(e != nullptr); delete e; @@ -183,8 +186,8 @@ void Java_org_rocksdb_RocksMemEnv_disposeInternal( * Method: createTimedEnv * Signature: (J)J */ -jlong Java_org_rocksdb_TimedEnv_createTimedEnv( - JNIEnv*, jclass, jlong jbase_env_handle) { +jlong Java_org_rocksdb_TimedEnv_createTimedEnv(JNIEnv*, jclass, + jlong jbase_env_handle) { auto* base_env = reinterpret_cast(jbase_env_handle); return GET_CPLUSPLUS_POINTER(ROCKSDB_NAMESPACE::NewTimedEnv(base_env)); } @@ -194,10 +197,9 @@ jlong Java_org_rocksdb_TimedEnv_createTimedEnv( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TimedEnv_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_TimedEnv_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* e = reinterpret_cast(jhandle); assert(e != nullptr); delete e; } - diff --git a/java/rocksjni/env_options.cc b/java/rocksjni/env_options.cc index 64c94833266..3237e277543 100644 --- a/java/rocksjni/env_options.cc +++ b/java/rocksjni/env_options.cc @@ -33,8 +33,7 @@ * Method: newEnvOptions * Signature: ()J */ -jlong Java_org_rocksdb_EnvOptions_newEnvOptions__( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) { auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(); return GET_CPLUSPLUS_POINTER(env_opt); } @@ -44,8 +43,8 @@ jlong 
Java_org_rocksdb_EnvOptions_newEnvOptions__( * Method: newEnvOptions * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J( - JNIEnv*, jclass, jlong jdboptions_handle) { +jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass, + jlong jdboptions_handle) { auto *db_options = reinterpret_cast(jdboptions_handle); auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options); @@ -57,8 +56,8 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_EnvOptions_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *, jobject, + jlong jhandle) { auto *eo = reinterpret_cast(jhandle); assert(eo != nullptr); delete eo; @@ -69,8 +68,9 @@ void Java_org_rocksdb_EnvOptions_disposeInternal( * Method: setUseMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseMmapReads( - JNIEnv*, jobject, jlong jhandle, jboolean use_mmap_reads) { +void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jobject, + jlong jhandle, + jboolean use_mmap_reads) { ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads); } @@ -79,8 +79,8 @@ void Java_org_rocksdb_EnvOptions_setUseMmapReads( * Method: useMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useMmapReads( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_mmap_reads); } @@ -89,8 +89,9 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapReads( * Method: setUseMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseMmapWrites( - JNIEnv*, jobject, jlong jhandle, jboolean use_mmap_writes) { +void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jobject, + jlong jhandle, + jboolean use_mmap_writes) { ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes); } @@ -99,8 +100,8 @@ void Java_org_rocksdb_EnvOptions_setUseMmapWrites( * Method: useMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useMmapWrites( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_mmap_writes); } @@ -109,8 +110,9 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites( * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setUseDirectReads( - JNIEnv*, jobject, jlong jhandle, jboolean use_direct_reads) { +void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jobject, + jlong jhandle, + jboolean use_direct_reads) { ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads); } @@ -119,8 +121,8 @@ void Java_org_rocksdb_EnvOptions_setUseDirectReads( * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useDirectReads( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_direct_reads); } @@ -130,7 +132,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectReads( * Signature: (JZ)V */ void Java_org_rocksdb_EnvOptions_setUseDirectWrites( - JNIEnv*, jobject, jlong jhandle, jboolean use_direct_writes) { + JNIEnv *, jobject, jlong jhandle, jboolean use_direct_writes) { ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes); } @@ -139,8 +141,8 @@ void Java_org_rocksdb_EnvOptions_setUseDirectWrites( * Method: useDirectWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_useDirectWrites( - JNIEnv*, jobject, jlong 
jhandle) { +jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, use_direct_writes); } @@ -149,8 +151,9 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectWrites( * Method: setAllowFallocate * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setAllowFallocate( - JNIEnv*, jobject, jlong jhandle, jboolean allow_fallocate) { +void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jobject, + jlong jhandle, + jboolean allow_fallocate) { ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate); } @@ -159,8 +162,8 @@ void Java_org_rocksdb_EnvOptions_setAllowFallocate( * Method: allowFallocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_allowFallocate( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, allow_fallocate); } @@ -169,8 +172,9 @@ jboolean Java_org_rocksdb_EnvOptions_allowFallocate( * Method: setSetFdCloexec * Signature: (JZ)V */ -void Java_org_rocksdb_EnvOptions_setSetFdCloexec( - JNIEnv*, jobject, jlong jhandle, jboolean set_fd_cloexec) { +void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jobject, + jlong jhandle, + jboolean set_fd_cloexec) { ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec); } @@ -179,8 +183,8 @@ void Java_org_rocksdb_EnvOptions_setSetFdCloexec( * Method: setFdCloexec * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_setFdCloexec( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, set_fd_cloexec); } @@ -189,8 +193,9 @@ jboolean Java_org_rocksdb_EnvOptions_setFdCloexec( * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_EnvOptions_setBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jlong bytes_per_sync) { +void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jobject, + jlong jhandle, + jlong bytes_per_sync) { ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync); } @@ -199,8 +204,8 @@ void Java_org_rocksdb_EnvOptions_setBytesPerSync( * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_bytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, bytes_per_sync); } @@ -210,7 +215,7 @@ jlong Java_org_rocksdb_EnvOptions_bytesPerSync( * Signature: (JZ)V */ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize( - JNIEnv*, jobject, jlong jhandle, jboolean fallocate_with_keep_size) { + JNIEnv *, jobject, jlong jhandle, jboolean fallocate_with_keep_size) { ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size); } @@ -219,8 +224,8 @@ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize( * Method: fallocateWithKeepSize * Signature: (J)Z */ -jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size); } @@ -230,7 +235,7 @@ jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize( * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle, jlong compaction_readahead_size) { + JNIEnv *, jobject, jlong jhandle, jlong compaction_readahead_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size); } @@ -239,8 +244,8 @@ void 
Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, compaction_readahead_size); } @@ -250,7 +255,7 @@ jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize( * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong random_access_max_buffer_size) { + JNIEnv *, jobject, jlong jhandle, jlong random_access_max_buffer_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, random_access_max_buffer_size); } @@ -259,8 +264,8 @@ void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize( * Method: randomAccessMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, random_access_max_buffer_size); } @@ -270,7 +275,7 @@ jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize( * Signature: (JJ)V */ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, jlong writable_file_max_buffer_size) { + JNIEnv *, jobject, jlong jhandle, jlong writable_file_max_buffer_size) { ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size); } @@ -279,8 +284,8 @@ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jobject, + jlong jhandle) { return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size); } @@ -289,8 +294,9 @@ jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize( * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_EnvOptions_setRateLimiter( - JNIEnv*, jobject, jlong jhandle, jlong rl_handle) { +void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jobject, + jlong jhandle, + jlong rl_handle) { auto *sptr_rate_limiter = reinterpret_cast *>( rl_handle); diff --git a/java/rocksjni/ingest_external_file_options.cc b/java/rocksjni/ingest_external_file_options.cc index 4460c804082..052cf33256e 100644 --- a/java/rocksjni/ingest_external_file_options.cc +++ b/java/rocksjni/ingest_external_file_options.cc @@ -29,9 +29,8 @@ jlong Java_org_rocksdb_IngestExternalFileOptions_newIngestExternalFileOptions__( * Signature: (ZZZZ)J */ jlong Java_org_rocksdb_IngestExternalFileOptions_newIngestExternalFileOptions__ZZZZ( - JNIEnv*, jclass, jboolean jmove_files, - jboolean jsnapshot_consistency, jboolean jallow_global_seqno, - jboolean jallow_blocking_flush) { + JNIEnv*, jclass, jboolean jmove_files, jboolean jsnapshot_consistency, + jboolean jallow_global_seqno, jboolean jallow_blocking_flush) { auto* options = new ROCKSDB_NAMESPACE::IngestExternalFileOptions(); options->move_files = static_cast(jmove_files); options->snapshot_consistency = static_cast(jsnapshot_consistency); @@ -45,8 +44,8 @@ jlong Java_org_rocksdb_IngestExternalFileOptions_newIngestExternalFileOptions__Z * Method: moveFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_IngestExternalFileOptions_moveFiles( - JNIEnv*, jobject, jlong jhandle) { +jboolean 
Java_org_rocksdb_IngestExternalFileOptions_moveFiles(JNIEnv*, jobject, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->move_files); @@ -165,8 +164,9 @@ void Java_org_rocksdb_IngestExternalFileOptions_setIngestBehind( * Method: writeGlobalSeqno * Signature: (J)Z */ -JNIEXPORT jboolean JNICALL Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno( - JNIEnv*, jobject, jlong jhandle) { +JNIEXPORT jboolean JNICALL +Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno(JNIEnv*, jobject, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return options->write_global_seqno == JNI_TRUE; @@ -177,7 +177,8 @@ JNIEXPORT jboolean JNICALL Java_org_rocksdb_IngestExternalFileOptions_writeGloba * Method: setWriteGlobalSeqno * Signature: (JZ)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobalSeqno( +JNIEXPORT void JNICALL +Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobalSeqno( JNIEnv*, jobject, jlong jhandle, jboolean jwrite_global_seqno) { auto* options = reinterpret_cast(jhandle); @@ -189,8 +190,9 @@ JNIEXPORT void JNICALL Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobal * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_IngestExternalFileOptions_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_IngestExternalFileOptions_disposeInternal(JNIEnv*, + jobject, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); delete options; diff --git a/java/rocksjni/jnicallback.cc b/java/rocksjni/jnicallback.cc index 28d84f277a2..f2742cd88ed 100644 --- a/java/rocksjni/jnicallback.cc +++ b/java/rocksjni/jnicallback.cc @@ -6,8 +6,10 @@ // This file implements the callback "bridge" between Java and C++ for // JNI Callbacks from C++ to sub-classes or org.rocksdb.RocksCallbackObject -#include #include "rocksjni/jnicallback.h" + +#include + #include "rocksjni/portal.h" namespace ROCKSDB_NAMESPACE { @@ -15,7 +17,7 @@ JniCallback::JniCallback(JNIEnv* env, jobject jcallback_obj) { // Note: jcallback_obj may be accessed by multiple threads, // so we ref the jvm not the env const jint rs = env->GetJavaVM(&m_jvm); - if(rs != JNI_OK) { + if (rs != JNI_OK) { // exception thrown return; } @@ -24,7 +26,7 @@ JniCallback::JniCallback(JNIEnv* env, jobject jcallback_obj) { // across multiple method calls, so we create a global ref assert(jcallback_obj != nullptr); m_jcallback_obj = env->NewGlobalRef(jcallback_obj); - if(jcallback_obj == nullptr) { + if (jcallback_obj == nullptr) { // exception thrown: OutOfMemoryError return; } diff --git a/java/rocksjni/jnicallback.h b/java/rocksjni/jnicallback.h index 5baa8973c14..a03a041282a 100644 --- a/java/rocksjni/jnicallback.h +++ b/java/rocksjni/jnicallback.h @@ -26,7 +26,7 @@ class JniCallback { jobject m_jcallback_obj; JNIEnv* getJniEnv(jboolean* attached) const; void releaseJniEnv(jboolean& attached) const; - }; - } // namespace ROCKSDB_NAMESPACE +}; +} // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_JNICALLBACK_H_ diff --git a/java/rocksjni/loggerjnicallback.h b/java/rocksjni/loggerjnicallback.h index 7bcba82eecc..57774988c5b 100644 --- a/java/rocksjni/loggerjnicallback.h +++ b/java/rocksjni/loggerjnicallback.h @@ -10,11 +10,13 @@ #define JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_ #include + #include #include -#include "rocksjni/jnicallback.h" + #include "port/port.h" #include "rocksdb/env.h" +#include "rocksjni/jnicallback.h" namespace ROCKSDB_NAMESPACE { @@ -23,8 +25,8 @@ class LoggerJniCallback : public JniCallback, public 
diff --git a/java/rocksjni/loggerjnicallback.h b/java/rocksjni/loggerjnicallback.h
index 7bcba82eecc..57774988c5b 100644
--- a/java/rocksjni/loggerjnicallback.h
+++ b/java/rocksjni/loggerjnicallback.h
@@ -10,11 +10,13 @@
 #define JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
 
 #include <jni.h>
+
 #include <memory>
 #include <string>
-#include "rocksjni/jnicallback.h"
+
 #include "port/port.h"
 #include "rocksdb/env.h"
+#include "rocksjni/jnicallback.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -23,8 +25,8 @@ class LoggerJniCallback : public JniCallback, public Logger {
   LoggerJniCallback(JNIEnv* env, jobject jLogger);
   ~LoggerJniCallback();
 
-  using Logger::SetInfoLogLevel;
   using Logger::GetInfoLogLevel;
+  using Logger::SetInfoLogLevel;
   // Write an entry to the log file with the specified format.
   virtual void Logv(const char* format, va_list ap);
   // Write an entry to the log file with the specified log level
@@ -43,7 +45,7 @@ class LoggerJniCallback : public JniCallback, public Logger {
   jobject m_jfatal_level;
   jobject m_jheader_level;
   std::unique_ptr<char[]> format_str(const char* format, va_list ap) const;
- };
- }  // namespace ROCKSDB_NAMESPACE
+};
+}  // namespace ROCKSDB_NAMESPACE
 
 #endif  // JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc
index 07284d434fe..c87c4f403bb 100644
--- a/java/rocksjni/memory_util.cc
+++ b/java/rocksjni/memory_util.cc
@@ -3,19 +3,18 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 
+#include "rocksdb/utilities/memory_util.h"
+
 #include <jni.h>
+
 #include <map>
 #include <string>
 #include <unordered_set>
 #include <vector>
 
 #include "include/org_rocksdb_MemoryUtil.h"
-
 #include "rocksjni/portal.h"
-#include "rocksdb/utilities/memory_util.h"
-
-
 /*
  * Class: org_rocksdb_MemoryUtil
  * Method: getApproximateMemoryUsageByType
@@ -34,8 +33,9 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
   std::unordered_set<const ROCKSDB_NAMESPACE::Cache*> cache_set;
   jsize cache_handle_count = env->GetArrayLength(jcache_handles);
-  if(cache_handle_count > 0) {
-    jlong *ptr_jcache_handles = env->GetLongArrayElements(jcache_handles, nullptr);
+  if (cache_handle_count > 0) {
+    jlong *ptr_jcache_handles =
+        env->GetLongArrayElements(jcache_handles, nullptr);
     if (ptr_jcache_handles == nullptr) {
       // exception thrown: OutOfMemoryError
       return nullptr;
@@ -46,7 +46,8 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
           ptr_jcache_handles[i]);
       cache_set.insert(cache_ptr->get());
     }
-    env->ReleaseLongArrayElements(jcache_handles, ptr_jcache_handles, JNI_ABORT);
+    env->ReleaseLongArrayElements(jcache_handles, ptr_jcache_handles,
+                                  JNI_ABORT);
   }
 
   std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
@@ -85,8 +86,7 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
     }
     // Construct and return pointer to pair of jobjects
     return std::unique_ptr<std::pair<jobject, jobject>>(
-        new std::pair<jobject, jobject>(jusage_type,
-                                        jusage_value));
+        new std::pair<jobject, jobject>(jusage_type, jusage_value));
   };
 
   if (!ROCKSDB_NAMESPACE::HashMapJni::putAll(env, jusage_by_type,
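// Background, not part of the patch: memory_util.cc above and
// optimistic_transaction_db.cc below both lean on the JNI array idiom that
// these hunks reflow. GetLongArrayElements may hand back a copy of the Java
// array, so every successful call must be paired with
// ReleaseLongArrayElements; the JNI_ABORT mode frees that copy without
// writing anything back, which is the right choice for read-only access.
// A minimal sketch (the function name is hypothetical, the JNI calls are
// standard):

#include <jni.h>

jlong SumHandles(JNIEnv* env, jlongArray jhandles) {
  const jsize n = env->GetArrayLength(jhandles);
  jlong* elems = env->GetLongArrayElements(jhandles, nullptr);
  if (elems == nullptr) {
    return 0;  // exception thrown: OutOfMemoryError
  }
  jlong sum = 0;
  for (jsize i = 0; i < n; ++i) {
    sum += elems[i];
  }
  // Read-only access: JNI_ABORT releases without copying back to the array.
  env->ReleaseLongArrayElements(jhandles, elems, JNI_ABORT);
  return sum;
}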
diff --git a/java/rocksjni/optimistic_transaction_db.cc b/java/rocksjni/optimistic_transaction_db.cc
index dd507caa756..238224f588d 100644
--- a/java/rocksjni/optimistic_transaction_db.cc
+++ b/java/rocksjni/optimistic_transaction_db.cc
@@ -63,12 +63,6 @@ Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J(
   std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
   const jsize len_cols = env->GetArrayLength(jcolumn_names);
   if (len_cols > 0) {
-    if (env->EnsureLocalCapacity(len_cols) != 0) {
-      // out of memory
-      env->ReleaseStringUTFChars(jdb_path, db_path);
-      return nullptr;
-    }
-
     jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr);
     if (jco == nullptr) {
       // exception thrown: OutOfMemoryError
@@ -87,14 +81,6 @@ Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J(
       const jbyteArray jcn_ba = reinterpret_cast<jbyteArray>(jcn);
       const jsize jcf_name_len = env->GetArrayLength(jcn_ba);
-      if (env->EnsureLocalCapacity(jcf_name_len) != 0) {
-        // out of memory
-        env->DeleteLocalRef(jcn);
-        env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
-        env->ReleaseStringUTFChars(jdb_path, db_path);
-        return nullptr;
-      }
-
       jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr);
       if (jcf_name == nullptr) {
         // exception thrown: OutOfMemoryError
@@ -159,8 +145,8 @@ Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J(
 * Method: disposeInternal
 * Signature: (J)V
 */
-void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(
-    JNIEnv *, jobject, jlong jhandle) {
+void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(JNIEnv*, jobject,
+                                                              jlong jhandle) {
   auto* optimistic_txn_db =
       reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
   assert(optimistic_txn_db != nullptr);
@@ -172,8 +158,8 @@ void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(
 * Method: closeDatabase
 * Signature: (J)V
 */
-void Java_org_rocksdb_OptimisticTransactionDB_closeDatabase(
-    JNIEnv* env, jclass, jlong jhandle) {
+void Java_org_rocksdb_OptimisticTransactionDB_closeDatabase(JNIEnv* env, jclass,
+                                                            jlong jhandle) {
   auto* optimistic_txn_db =
       reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
   assert(optimistic_txn_db != nullptr);
@@ -276,8 +262,8 @@ jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJJ(
 * Method: getBaseDB
 * Signature: (J)J
 */
-jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB(JNIEnv*, jobject,
+                                                         jlong jhandle) {
   auto* optimistic_txn_db =
       reinterpret_cast<ROCKSDB_NAMESPACE::OptimisticTransactionDB*>(jhandle);
   return GET_CPLUSPLUS_POINTER(optimistic_txn_db->GetBaseDB());
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index 34eb900b32f..b848ea9cffd 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -44,8 +44,7 @@
 * Method: newOptions
 * Signature: ()J
 */
-jlong Java_org_rocksdb_Options_newOptions__(
-    JNIEnv*, jclass) {
+jlong Java_org_rocksdb_Options_newOptions__(JNIEnv*, jclass) {
   auto* op = new ROCKSDB_NAMESPACE::Options();
   return GET_CPLUSPLUS_POINTER(op);
 }
@@ -55,8 +54,8 @@
 * Method: newOptions
 * Signature: (JJ)J
 */
-jlong Java_org_rocksdb_Options_newOptions__JJ(
-    JNIEnv*, jclass, jlong jdboptions, jlong jcfoptions) {
+jlong Java_org_rocksdb_Options_newOptions__JJ(JNIEnv*, jclass, jlong jdboptions,
+                                              jlong jcfoptions) {
   auto* dbOpt =
       reinterpret_cast<const ROCKSDB_NAMESPACE::DBOptions*>(jdboptions);
   auto* cfOpt = reinterpret_cast<const ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(
      jcfoptions);
@@ -70,8 +69,7 @@
 * Method: copyOptions
 * Signature: (J)J
 */
-jlong Java_org_rocksdb_Options_copyOptions(
-    JNIEnv*, jclass, jlong jhandle) {
+jlong Java_org_rocksdb_Options_copyOptions(JNIEnv*, jclass, jlong jhandle) {
   auto new_opt = new ROCKSDB_NAMESPACE::Options(
       *(reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)));
   return GET_CPLUSPLUS_POINTER(new_opt);
@@ -82,8 +80,7 @@
 * Method: disposeInternal
 * Signature: (J)V
 */
-void Java_org_rocksdb_Options_disposeInternal(
-    JNIEnv*, jobject, jlong handle) {
+void Java_org_rocksdb_Options_disposeInternal(JNIEnv*, jobject, jlong handle) {
   auto* op = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(handle);
   assert(op != nullptr);
   delete op;
@@ -94,8 +91,9 @@
 * Method: setIncreaseParallelism
 * Signature: (JI)V
 */
-void Java_org_rocksdb_Options_setIncreaseParallelism(
-    JNIEnv*, jobject, jlong jhandle, jint totalThreads) {
+void Java_org_rocksdb_Options_setIncreaseParallelism(JNIEnv*, jobject,
+                                                     jlong jhandle,
+                                                     jint totalThreads) {
   reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)->IncreaseParallelism(
       static_cast<int>(totalThreads));
 }
@@ -105,8 +103,8 @@
* Method: setCreateIfMissing * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setCreateIfMissing( - JNIEnv*, jobject, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_Options_setCreateIfMissing(JNIEnv*, jobject, + jlong jhandle, jboolean flag) { reinterpret_cast(jhandle)->create_if_missing = flag; } @@ -116,8 +114,8 @@ void Java_org_rocksdb_Options_setCreateIfMissing( * Method: createIfMissing * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_createIfMissing( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_createIfMissing(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->create_if_missing; } @@ -127,8 +125,9 @@ jboolean Java_org_rocksdb_Options_createIfMissing( * Method: setCreateMissingColumnFamilies * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setCreateMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(JNIEnv*, jobject, + jlong jhandle, + jboolean flag) { reinterpret_cast(jhandle) ->create_missing_column_families = flag; } @@ -138,8 +137,8 @@ void Java_org_rocksdb_Options_setCreateMissingColumnFamilies( * Method: createMissingColumnFamilies * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_createMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->create_missing_column_families; } @@ -149,8 +148,9 @@ jboolean Java_org_rocksdb_Options_createMissingColumnFamilies( * Method: setComparatorHandle * Signature: (JI)V */ -void Java_org_rocksdb_Options_setComparatorHandle__JI( - JNIEnv*, jobject, jlong jhandle, jint builtinComparator) { +void Java_org_rocksdb_Options_setComparatorHandle__JI(JNIEnv*, jobject, + jlong jhandle, + jint builtinComparator) { switch (builtinComparator) { case 1: reinterpret_cast(jhandle)->comparator = @@ -168,9 +168,10 @@ void Java_org_rocksdb_Options_setComparatorHandle__JI( * Method: setComparatorHandle * Signature: (JJB)V */ -void Java_org_rocksdb_Options_setComparatorHandle__JJB( - JNIEnv*, jobject, jlong jopt_handle, jlong jcomparator_handle, - jbyte jcomparator_type) { +void Java_org_rocksdb_Options_setComparatorHandle__JJB(JNIEnv*, jobject, + jlong jopt_handle, + jlong jcomparator_handle, + jbyte jcomparator_type) { ROCKSDB_NAMESPACE::Comparator* comparator = nullptr; switch (jcomparator_type) { // JAVA_COMPARATOR @@ -194,8 +195,9 @@ void Java_org_rocksdb_Options_setComparatorHandle__JJB( * Method: setMergeOperatorName * Signature: (JJjava/lang/String)V */ -void Java_org_rocksdb_Options_setMergeOperatorName( - JNIEnv* env, jobject, jlong jhandle, jstring jop_name) { +void Java_org_rocksdb_Options_setMergeOperatorName(JNIEnv* env, jobject, + jlong jhandle, + jstring jop_name) { const char* op_name = env->GetStringUTFChars(jop_name, nullptr); if (op_name == nullptr) { // exception thrown: OutOfMemoryError @@ -214,8 +216,8 @@ void Java_org_rocksdb_Options_setMergeOperatorName( * Method: setMergeOperator * Signature: (JJjava/lang/String)V */ -void Java_org_rocksdb_Options_setMergeOperator( - JNIEnv*, jobject, jlong jhandle, jlong mergeOperatorHandle) { +void Java_org_rocksdb_Options_setMergeOperator(JNIEnv*, jobject, jlong jhandle, + jlong mergeOperatorHandle) { reinterpret_cast(jhandle)->merge_operator = *(reinterpret_cast*>( mergeOperatorHandle)); @@ -227,8 +229,7 @@ void Java_org_rocksdb_Options_setMergeOperator( * Signature: (JJ)V */ void 
Java_org_rocksdb_Options_setCompactionFilterHandle( - JNIEnv*, jobject, jlong jopt_handle, - jlong jcompactionfilter_handle) { + JNIEnv*, jobject, jlong jopt_handle, jlong jcompactionfilter_handle) { reinterpret_cast(jopt_handle) ->compaction_filter = reinterpret_cast( @@ -255,8 +256,9 @@ void JNICALL Java_org_rocksdb_Options_setCompactionFilterFactoryHandle( * Method: setWriteBufferSize * Signature: (JJ)I */ -void Java_org_rocksdb_Options_setWriteBufferSize( - JNIEnv* env, jobject, jlong jhandle, jlong jwrite_buffer_size) { +void Java_org_rocksdb_Options_setWriteBufferSize(JNIEnv* env, jobject, + jlong jhandle, + jlong jwrite_buffer_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( jwrite_buffer_size); if (s.ok()) { @@ -287,8 +289,8 @@ void Java_org_rocksdb_Options_setWriteBufferManager( * Method: writeBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writeBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_writeBufferSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->write_buffer_size; } @@ -299,8 +301,7 @@ jlong Java_org_rocksdb_Options_writeBufferSize( * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxWriteBufferNumber( - JNIEnv*, jobject, jlong jhandle, - jint jmax_write_buffer_number) { + JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number) { reinterpret_cast(jhandle) ->max_write_buffer_number = jmax_write_buffer_number; } @@ -310,8 +311,8 @@ void Java_org_rocksdb_Options_setMaxWriteBufferNumber( * Method: setStatistics * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setStatistics( - JNIEnv*, jobject, jlong jhandle, jlong jstatistics_handle) { +void Java_org_rocksdb_Options_setStatistics(JNIEnv*, jobject, jlong jhandle, + jlong jstatistics_handle) { auto* opt = reinterpret_cast(jhandle); auto* pSptr = reinterpret_cast*>( @@ -324,8 +325,7 @@ void Java_org_rocksdb_Options_setStatistics( * Method: statistics * Signature: (J)J */ -jlong Java_org_rocksdb_Options_statistics( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_statistics(JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); std::shared_ptr sptr = opt->statistics; if (sptr == nullptr) { @@ -342,8 +342,8 @@ jlong Java_org_rocksdb_Options_statistics( * Method: maxWriteBufferNumber * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxWriteBufferNumber( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxWriteBufferNumber(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number; } @@ -353,8 +353,8 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumber( * Method: errorIfExists * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_errorIfExists( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_errorIfExists(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->error_if_exists; } @@ -364,8 +364,8 @@ jboolean Java_org_rocksdb_Options_errorIfExists( * Method: setErrorIfExists * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setErrorIfExists( - JNIEnv*, jobject, jlong jhandle, jboolean error_if_exists) { +void Java_org_rocksdb_Options_setErrorIfExists(JNIEnv*, jobject, jlong jhandle, + jboolean error_if_exists) { reinterpret_cast(jhandle)->error_if_exists = static_cast(error_if_exists); } @@ -375,8 +375,8 @@ void Java_org_rocksdb_Options_setErrorIfExists( * Method: paranoidChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_paranoidChecks( - JNIEnv*, jobject, jlong 
jhandle) { +jboolean Java_org_rocksdb_Options_paranoidChecks(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_checks; } @@ -386,8 +386,8 @@ jboolean Java_org_rocksdb_Options_paranoidChecks( * Method: setParanoidChecks * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setParanoidChecks( - JNIEnv*, jobject, jlong jhandle, jboolean paranoid_checks) { +void Java_org_rocksdb_Options_setParanoidChecks(JNIEnv*, jobject, jlong jhandle, + jboolean paranoid_checks) { reinterpret_cast(jhandle)->paranoid_checks = static_cast(paranoid_checks); } @@ -397,8 +397,8 @@ void Java_org_rocksdb_Options_setParanoidChecks( * Method: setEnv * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setEnv( - JNIEnv*, jobject, jlong jhandle, jlong jenv) { +void Java_org_rocksdb_Options_setEnv(JNIEnv*, jobject, jlong jhandle, + jlong jenv) { reinterpret_cast(jhandle)->env = reinterpret_cast(jenv); } @@ -408,8 +408,9 @@ void Java_org_rocksdb_Options_setEnv( * Method: setMaxTotalWalSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMaxTotalWalSize( - JNIEnv*, jobject, jlong jhandle, jlong jmax_total_wal_size) { +void Java_org_rocksdb_Options_setMaxTotalWalSize(JNIEnv*, jobject, + jlong jhandle, + jlong jmax_total_wal_size) { reinterpret_cast(jhandle)->max_total_wal_size = static_cast(jmax_total_wal_size); } @@ -419,8 +420,8 @@ void Java_org_rocksdb_Options_setMaxTotalWalSize( * Method: maxTotalWalSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxTotalWalSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxTotalWalSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_total_wal_size; } @@ -430,8 +431,7 @@ jlong Java_org_rocksdb_Options_maxTotalWalSize( * Method: maxOpenFiles * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxOpenFiles( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->max_open_files; } @@ -440,8 +440,8 @@ jint Java_org_rocksdb_Options_maxOpenFiles( * Method: setMaxOpenFiles * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxOpenFiles( - JNIEnv*, jobject, jlong jhandle, jint max_open_files) { +void Java_org_rocksdb_Options_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle, + jint max_open_files) { reinterpret_cast(jhandle)->max_open_files = static_cast(max_open_files); } @@ -462,8 +462,8 @@ void Java_org_rocksdb_Options_setMaxFileOpeningThreads( * Method: maxFileOpeningThreads * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxFileOpeningThreads( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxFileOpeningThreads(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_file_opening_threads); } @@ -473,8 +473,7 @@ jint Java_org_rocksdb_Options_maxFileOpeningThreads( * Method: useFsync * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useFsync( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_useFsync(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->use_fsync; } @@ -483,8 +482,8 @@ jboolean Java_org_rocksdb_Options_useFsync( * Method: setUseFsync * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseFsync( - JNIEnv*, jobject, jlong jhandle, jboolean use_fsync) { +void Java_org_rocksdb_Options_setUseFsync(JNIEnv*, jobject, jlong jhandle, + jboolean use_fsync) { reinterpret_cast(jhandle)->use_fsync = static_cast(use_fsync); } @@ -494,9 +493,9 @@ void 
Java_org_rocksdb_Options_setUseFsync( * Method: setDbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_Options_setDbPaths( - JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, - jlongArray jtarget_sizes) { +void Java_org_rocksdb_Options_setDbPaths(JNIEnv* env, jobject, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { std::vector db_paths; jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); if (ptr_jtarget_size == nullptr) { @@ -540,8 +539,7 @@ void Java_org_rocksdb_Options_setDbPaths( * Method: dbPathsLen * Signature: (J)J */ -jlong Java_org_rocksdb_Options_dbPathsLen( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_dbPathsLen(JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_paths.size()); } @@ -551,9 +549,9 @@ jlong Java_org_rocksdb_Options_dbPathsLen( * Method: dbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_Options_dbPaths( - JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, - jlongArray jtarget_sizes) { +void Java_org_rocksdb_Options_dbPaths(JNIEnv* env, jobject, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { jboolean is_copy; jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { @@ -592,8 +590,7 @@ void Java_org_rocksdb_Options_dbPaths( * Method: dbLogDir * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_dbLogDir( - JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Options_dbLogDir(JNIEnv* env, jobject, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle) ->db_log_dir.c_str()); @@ -604,8 +601,8 @@ jstring Java_org_rocksdb_Options_dbLogDir( * Method: setDbLogDir * Signature: (JLjava/lang/String)V */ -void Java_org_rocksdb_Options_setDbLogDir( - JNIEnv* env, jobject, jlong jhandle, jstring jdb_log_dir) { +void Java_org_rocksdb_Options_setDbLogDir(JNIEnv* env, jobject, jlong jhandle, + jstring jdb_log_dir) { const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr); if (log_dir == nullptr) { // exception thrown: OutOfMemoryError @@ -621,8 +618,7 @@ void Java_org_rocksdb_Options_setDbLogDir( * Method: walDir * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_walDir( - JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Options_walDir(JNIEnv* env, jobject, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle)->wal_dir.c_str()); } @@ -632,8 +628,8 @@ jstring Java_org_rocksdb_Options_walDir( * Method: setWalDir * Signature: (JLjava/lang/String)V */ -void Java_org_rocksdb_Options_setWalDir( - JNIEnv* env, jobject, jlong jhandle, jstring jwal_dir) { +void Java_org_rocksdb_Options_setWalDir(JNIEnv* env, jobject, jlong jhandle, + jstring jwal_dir) { const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr); if (wal_dir == nullptr) { // exception thrown: OutOfMemoryError @@ -649,8 +645,8 @@ void Java_org_rocksdb_Options_setWalDir( * Method: deleteObsoleteFilesPeriodMicros * Signature: (J)J */ -jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->delete_obsolete_files_period_micros; } @@ -660,8 +656,10 @@ jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros( * Method: setDeleteObsoleteFilesPeriodMicros * Signature: (JJ)V */ -void 
Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros( - JNIEnv*, jobject, jlong jhandle, jlong micros) { +void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(JNIEnv*, + jobject, + jlong jhandle, + jlong micros) { reinterpret_cast(jhandle) ->delete_obsolete_files_period_micros = static_cast(micros); } @@ -671,8 +669,8 @@ void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros( * Method: maxBackgroundCompactions * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxBackgroundCompactions( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxBackgroundCompactions(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_compactions; } @@ -682,8 +680,9 @@ jint Java_org_rocksdb_Options_maxBackgroundCompactions( * Method: setMaxBackgroundCompactions * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxBackgroundCompactions( - JNIEnv*, jobject, jlong jhandle, jint max) { +void Java_org_rocksdb_Options_setMaxBackgroundCompactions(JNIEnv*, jobject, + jlong jhandle, + jint max) { reinterpret_cast(jhandle) ->max_background_compactions = static_cast(max); } @@ -693,8 +692,8 @@ void Java_org_rocksdb_Options_setMaxBackgroundCompactions( * Method: setMaxSubcompactions * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxSubcompactions( - JNIEnv*, jobject, jlong jhandle, jint max) { +void Java_org_rocksdb_Options_setMaxSubcompactions(JNIEnv*, jobject, + jlong jhandle, jint max) { reinterpret_cast(jhandle)->max_subcompactions = static_cast(max); } @@ -704,8 +703,8 @@ void Java_org_rocksdb_Options_setMaxSubcompactions( * Method: maxSubcompactions * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxSubcompactions( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxSubcompactions(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_subcompactions; } @@ -715,8 +714,8 @@ jint Java_org_rocksdb_Options_maxSubcompactions( * Method: maxBackgroundFlushes * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxBackgroundFlushes( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxBackgroundFlushes(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_flushes; } @@ -737,8 +736,8 @@ void Java_org_rocksdb_Options_setMaxBackgroundFlushes( * Method: maxBackgroundJobs * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxBackgroundJobs( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxBackgroundJobs(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_jobs; } @@ -748,8 +747,9 @@ jint Java_org_rocksdb_Options_maxBackgroundJobs( * Method: setMaxBackgroundJobs * Signature: (JI)V */ -void Java_org_rocksdb_Options_setMaxBackgroundJobs( - JNIEnv*, jobject, jlong jhandle, jint max_background_jobs) { +void Java_org_rocksdb_Options_setMaxBackgroundJobs(JNIEnv*, jobject, + jlong jhandle, + jint max_background_jobs) { reinterpret_cast(jhandle)->max_background_jobs = static_cast(max_background_jobs); } @@ -759,8 +759,7 @@ void Java_org_rocksdb_Options_setMaxBackgroundJobs( * Method: maxLogFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxLogFileSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxLogFileSize(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->max_log_file_size; } @@ -770,8 +769,9 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( * Method: setMaxLogFileSize * Signature: (JJ)V */ -void 
Java_org_rocksdb_Options_setMaxLogFileSize( - JNIEnv* env, jobject, jlong jhandle, jlong max_log_file_size) { +void Java_org_rocksdb_Options_setMaxLogFileSize(JNIEnv* env, jobject, + jlong jhandle, + jlong max_log_file_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(max_log_file_size); if (s.ok()) { @@ -787,8 +787,8 @@ void Java_org_rocksdb_Options_setMaxLogFileSize( * Method: logFileTimeToRoll * Signature: (J)J */ -jlong Java_org_rocksdb_Options_logFileTimeToRoll( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_logFileTimeToRoll(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->log_file_time_to_roll; } @@ -815,8 +815,7 @@ void Java_org_rocksdb_Options_setLogFileTimeToRoll( * Method: keepLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_Options_keepLogFileNum( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_keepLogFileNum(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->keep_log_file_num; } @@ -826,8 +825,9 @@ jlong Java_org_rocksdb_Options_keepLogFileNum( * Method: setKeepLogFileNum * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setKeepLogFileNum( - JNIEnv* env, jobject, jlong jhandle, jlong keep_log_file_num) { +void Java_org_rocksdb_Options_setKeepLogFileNum(JNIEnv* env, jobject, + jlong jhandle, + jlong keep_log_file_num) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(keep_log_file_num); if (s.ok()) { @@ -843,8 +843,8 @@ void Java_org_rocksdb_Options_setKeepLogFileNum( * Method: recycleLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_Options_recycleLogFileNum( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->recycle_log_file_num; } @@ -854,8 +854,9 @@ jlong Java_org_rocksdb_Options_recycleLogFileNum( * Method: setRecycleLogFileNum * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setRecycleLogFileNum( - JNIEnv* env, jobject, jlong jhandle, jlong recycle_log_file_num) { +void Java_org_rocksdb_Options_setRecycleLogFileNum(JNIEnv* env, jobject, + jlong jhandle, + jlong recycle_log_file_num) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t( recycle_log_file_num); if (s.ok()) { @@ -871,8 +872,8 @@ void Java_org_rocksdb_Options_setRecycleLogFileNum( * Method: maxManifestFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxManifestFileSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxManifestFileSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_manifest_file_size; } @@ -881,8 +882,8 @@ jlong Java_org_rocksdb_Options_maxManifestFileSize( * Method: memTableFactoryName * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_memTableFactoryName( - JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Options_memTableFactoryName(JNIEnv* env, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::MemTableRepFactory* tf = opt->memtable_factory.get(); @@ -1042,8 +1043,9 @@ void Java_org_rocksdb_Options_setMaxManifestFileSize( * Method: setMemTableFactory * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setMemTableFactory( - JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) { +void Java_org_rocksdb_Options_setMemTableFactory(JNIEnv*, jobject, + jlong jhandle, + jlong jfactory_handle) { reinterpret_cast(jhandle) ->memtable_factory.reset( reinterpret_cast( @@ -1055,8 +1057,8 @@ void 
Java_org_rocksdb_Options_setMemTableFactory( * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setRateLimiter( - JNIEnv*, jobject, jlong jhandle, jlong jrate_limiter_handle) { +void Java_org_rocksdb_Options_setRateLimiter(JNIEnv*, jobject, jlong jhandle, + jlong jrate_limiter_handle) { std::shared_ptr* pRateLimiter = reinterpret_cast*>( jrate_limiter_handle); @@ -1083,8 +1085,8 @@ void Java_org_rocksdb_Options_setSstFileManager( * Method: setLogger * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setLogger( - JNIEnv*, jobject, jlong jhandle, jlong jlogger_handle) { +void Java_org_rocksdb_Options_setLogger(JNIEnv*, jobject, jlong jhandle, + jlong jlogger_handle) { std::shared_ptr* pLogger = reinterpret_cast*>( jlogger_handle); @@ -1096,8 +1098,8 @@ void Java_org_rocksdb_Options_setLogger( * Method: setInfoLogLevel * Signature: (JB)V */ -void Java_org_rocksdb_Options_setInfoLogLevel( - JNIEnv*, jobject, jlong jhandle, jbyte jlog_level) { +void Java_org_rocksdb_Options_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle, + jbyte jlog_level) { reinterpret_cast(jhandle)->info_log_level = static_cast(jlog_level); } @@ -1107,8 +1109,7 @@ void Java_org_rocksdb_Options_setInfoLogLevel( * Method: infoLogLevel * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_infoLogLevel( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_infoLogLevel(JNIEnv*, jobject, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->info_log_level); } @@ -1118,8 +1119,8 @@ jbyte Java_org_rocksdb_Options_infoLogLevel( * Method: tableCacheNumshardbits * Signature: (J)I */ -jint Java_org_rocksdb_Options_tableCacheNumshardbits( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_tableCacheNumshardbits(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->table_cache_numshardbits; } @@ -1150,8 +1151,9 @@ void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( * Method: useCappedPrefixExtractor * Signature: (JI)V */ -void Java_org_rocksdb_Options_useCappedPrefixExtractor( - JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { +void Java_org_rocksdb_Options_useCappedPrefixExtractor(JNIEnv*, jobject, + jlong jhandle, + jint jprefix_length) { reinterpret_cast(jhandle) ->prefix_extractor.reset(ROCKSDB_NAMESPACE::NewCappedPrefixTransform( static_cast(jprefix_length))); @@ -1162,8 +1164,7 @@ void Java_org_rocksdb_Options_useCappedPrefixExtractor( * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_Options_walTtlSeconds( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_walTtlSeconds(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_ttl_seconds; } @@ -1173,8 +1174,8 @@ jlong Java_org_rocksdb_Options_walTtlSeconds( * Method: setWalTtlSeconds * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalTtlSeconds( - JNIEnv*, jobject, jlong jhandle, jlong WAL_ttl_seconds) { +void Java_org_rocksdb_Options_setWalTtlSeconds(JNIEnv*, jobject, jlong jhandle, + jlong WAL_ttl_seconds) { reinterpret_cast(jhandle)->WAL_ttl_seconds = static_cast(WAL_ttl_seconds); } @@ -1184,8 +1185,7 @@ void Java_org_rocksdb_Options_setWalTtlSeconds( * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_Options_walSizeLimitMB( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_walSizeLimitMB(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_size_limit_MB; } @@ -1195,8 +1195,8 @@ jlong Java_org_rocksdb_Options_walSizeLimitMB( * Method: setWalSizeLimitMB * 
Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalSizeLimitMB( - JNIEnv*, jobject, jlong jhandle, jlong WAL_size_limit_MB) { +void Java_org_rocksdb_Options_setWalSizeLimitMB(JNIEnv*, jobject, jlong jhandle, + jlong WAL_size_limit_MB) { reinterpret_cast(jhandle)->WAL_size_limit_MB = static_cast(WAL_size_limit_MB); } @@ -1229,8 +1229,8 @@ jlong Java_org_rocksdb_Options_maxWriteBatchGroupSizeBytes(JNIEnv*, jclass, * Method: manifestPreallocationSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_manifestPreallocationSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_manifestPreallocationSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->manifest_preallocation_size; } @@ -1256,8 +1256,8 @@ void Java_org_rocksdb_Options_setManifestPreallocationSize( * Method: setTableFactory * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setTableFactory( - JNIEnv*, jobject, jlong jhandle, jlong jtable_factory_handle) { +void Java_org_rocksdb_Options_setTableFactory(JNIEnv*, jobject, jlong jhandle, + jlong jtable_factory_handle) { auto* options = reinterpret_cast(jhandle); auto* table_factory = reinterpret_cast(jtable_factory_handle); @@ -1297,8 +1297,8 @@ void Java_org_rocksdb_Options_setCompactionThreadLimiter( * Method: allowMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowMmapReads( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allowMmapReads(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_reads; } @@ -1308,8 +1308,8 @@ jboolean Java_org_rocksdb_Options_allowMmapReads( * Method: setAllowMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowMmapReads( - JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_reads) { +void Java_org_rocksdb_Options_setAllowMmapReads(JNIEnv*, jobject, jlong jhandle, + jboolean allow_mmap_reads) { reinterpret_cast(jhandle)->allow_mmap_reads = static_cast(allow_mmap_reads); } @@ -1319,8 +1319,8 @@ void Java_org_rocksdb_Options_setAllowMmapReads( * Method: allowMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowMmapWrites( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allowMmapWrites(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_writes; } @@ -1330,8 +1330,9 @@ jboolean Java_org_rocksdb_Options_allowMmapWrites( * Method: setAllowMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowMmapWrites( - JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_writes) { +void Java_org_rocksdb_Options_setAllowMmapWrites(JNIEnv*, jobject, + jlong jhandle, + jboolean allow_mmap_writes) { reinterpret_cast(jhandle)->allow_mmap_writes = static_cast(allow_mmap_writes); } @@ -1341,8 +1342,8 @@ void Java_org_rocksdb_Options_setAllowMmapWrites( * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useDirectReads( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_reads; } @@ -1352,8 +1353,8 @@ jboolean Java_org_rocksdb_Options_useDirectReads( * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseDirectReads( - JNIEnv*, jobject, jlong jhandle, jboolean use_direct_reads) { +void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv*, jobject, jlong jhandle, + jboolean use_direct_reads) { reinterpret_cast(jhandle)->use_direct_reads = static_cast(use_direct_reads); } @@ 
-1387,8 +1388,8 @@ void Java_org_rocksdb_Options_setUseDirectIoForFlushAndCompaction( * Method: setAllowFAllocate * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowFAllocate( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_fallocate) { +void Java_org_rocksdb_Options_setAllowFAllocate(JNIEnv*, jobject, jlong jhandle, + jboolean jallow_fallocate) { reinterpret_cast(jhandle)->allow_fallocate = static_cast(jallow_fallocate); } @@ -1398,8 +1399,8 @@ void Java_org_rocksdb_Options_setAllowFAllocate( * Method: allowFAllocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowFAllocate( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allowFAllocate(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_fallocate); } @@ -1409,8 +1410,8 @@ jboolean Java_org_rocksdb_Options_allowFAllocate( * Method: isFdCloseOnExec * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_isFdCloseOnExec( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_isFdCloseOnExec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->is_fd_close_on_exec; } @@ -1420,8 +1421,9 @@ jboolean Java_org_rocksdb_Options_isFdCloseOnExec( * Method: setIsFdCloseOnExec * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setIsFdCloseOnExec( - JNIEnv*, jobject, jlong jhandle, jboolean is_fd_close_on_exec) { +void Java_org_rocksdb_Options_setIsFdCloseOnExec(JNIEnv*, jobject, + jlong jhandle, + jboolean is_fd_close_on_exec) { reinterpret_cast(jhandle)->is_fd_close_on_exec = static_cast(is_fd_close_on_exec); } @@ -1431,8 +1433,8 @@ void Java_org_rocksdb_Options_setIsFdCloseOnExec( * Method: statsDumpPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_Options_statsDumpPeriodSec( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_statsDumpPeriodSec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->stats_dump_period_sec; } @@ -1443,8 +1445,7 @@ jint Java_org_rocksdb_Options_statsDumpPeriodSec( * Signature: (JI)V */ void Java_org_rocksdb_Options_setStatsDumpPeriodSec( - JNIEnv*, jobject, jlong jhandle, - jint jstats_dump_period_sec) { + JNIEnv*, jobject, jlong jhandle, jint jstats_dump_period_sec) { reinterpret_cast(jhandle) ->stats_dump_period_sec = static_cast(jstats_dump_period_sec); @@ -1455,8 +1456,8 @@ void Java_org_rocksdb_Options_setStatsDumpPeriodSec( * Method: statsPersistPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_Options_statsPersistPeriodSec( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_statsPersistPeriodSec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->stats_persist_period_sec; } @@ -1478,8 +1479,8 @@ void Java_org_rocksdb_Options_setStatsPersistPeriodSec( * Method: statsHistoryBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_statsHistoryBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_statsHistoryBufferSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->stats_history_buffer_size; } @@ -1501,8 +1502,8 @@ void Java_org_rocksdb_Options_setStatsHistoryBufferSize( * Method: adviseRandomOnOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_adviseRandomOnOpen( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->advise_random_on_open; } @@ -1513,8 +1514,7 @@ jboolean Java_org_rocksdb_Options_adviseRandomOnOpen( * 
Signature: (JZ)V */ void Java_org_rocksdb_Options_setAdviseRandomOnOpen( - JNIEnv*, jobject, jlong jhandle, - jboolean advise_random_on_open) { + JNIEnv*, jobject, jlong jhandle, jboolean advise_random_on_open) { reinterpret_cast(jhandle) ->advise_random_on_open = static_cast(advise_random_on_open); } @@ -1525,8 +1525,7 @@ void Java_org_rocksdb_Options_setAdviseRandomOnOpen( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setDbWriteBufferSize( - JNIEnv*, jobject, jlong jhandle, - jlong jdb_write_buffer_size) { + JNIEnv*, jobject, jlong jhandle, jlong jdb_write_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->db_write_buffer_size = static_cast(jdb_write_buffer_size); } @@ -1536,8 +1535,8 @@ void Java_org_rocksdb_Options_setDbWriteBufferSize( * Method: dbWriteBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_dbWriteBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_dbWriteBufferSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_write_buffer_size); } @@ -1548,8 +1547,7 @@ jlong Java_org_rocksdb_Options_dbWriteBufferSize( * Signature: (JB)V */ void Java_org_rocksdb_Options_setAccessHintOnCompactionStart( - JNIEnv*, jobject, jlong jhandle, - jbyte jaccess_hint_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jaccess_hint_value) { auto* opt = reinterpret_cast(jhandle); opt->access_hint_on_compaction_start = ROCKSDB_NAMESPACE::AccessHintJni::toCppAccessHint(jaccess_hint_value); @@ -1560,8 +1558,8 @@ void Java_org_rocksdb_Options_setAccessHintOnCompactionStart( * Method: accessHintOnCompactionStart * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::AccessHintJni::toJavaAccessHint( opt->access_hint_on_compaction_start); @@ -1573,8 +1571,7 @@ jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle, - jlong jcompaction_readahead_size) { + JNIEnv*, jobject, jlong jhandle, jlong jcompaction_readahead_size) { auto* opt = reinterpret_cast(jhandle); opt->compaction_readahead_size = static_cast(jcompaction_readahead_size); @@ -1585,8 +1582,8 @@ void Java_org_rocksdb_Options_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_compactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_compactionReadaheadSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->compaction_readahead_size); } @@ -1608,8 +1605,8 @@ void Java_org_rocksdb_Options_setRandomAccessMaxBufferSize( * Method: randomAccessMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->random_access_max_buffer_size); } @@ -1620,8 +1617,7 @@ jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWritableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle, - jlong jwritable_file_max_buffer_size) { + JNIEnv*, jobject, jlong jhandle, jlong 
jwritable_file_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->writable_file_max_buffer_size = static_cast(jwritable_file_max_buffer_size); @@ -1632,8 +1628,8 @@ void Java_org_rocksdb_Options_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_writableFileMaxBufferSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->writable_file_max_buffer_size); } @@ -1643,8 +1639,8 @@ jlong Java_org_rocksdb_Options_writableFileMaxBufferSize( * Method: useAdaptiveMutex * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useAdaptiveMutex( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_useAdaptiveMutex(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->use_adaptive_mutex; } @@ -1654,8 +1650,9 @@ jboolean Java_org_rocksdb_Options_useAdaptiveMutex( * Method: setUseAdaptiveMutex * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseAdaptiveMutex( - JNIEnv*, jobject, jlong jhandle, jboolean use_adaptive_mutex) { +void Java_org_rocksdb_Options_setUseAdaptiveMutex(JNIEnv*, jobject, + jlong jhandle, + jboolean use_adaptive_mutex) { reinterpret_cast(jhandle)->use_adaptive_mutex = static_cast(use_adaptive_mutex); } @@ -1665,8 +1662,7 @@ void Java_org_rocksdb_Options_setUseAdaptiveMutex( * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_Options_bytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_bytesPerSync(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->bytes_per_sync; } @@ -1675,8 +1671,8 @@ jlong Java_org_rocksdb_Options_bytesPerSync( * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jlong bytes_per_sync) { +void Java_org_rocksdb_Options_setBytesPerSync(JNIEnv*, jobject, jlong jhandle, + jlong bytes_per_sync) { reinterpret_cast(jhandle)->bytes_per_sync = static_cast(bytes_per_sync); } @@ -1686,8 +1682,9 @@ void Java_org_rocksdb_Options_setBytesPerSync( * Method: setWalBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jlong jwal_bytes_per_sync) { +void Java_org_rocksdb_Options_setWalBytesPerSync(JNIEnv*, jobject, + jlong jhandle, + jlong jwal_bytes_per_sync) { reinterpret_cast(jhandle)->wal_bytes_per_sync = static_cast(jwal_bytes_per_sync); } @@ -1697,8 +1694,8 @@ void Java_org_rocksdb_Options_setWalBytesPerSync( * Method: walBytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_Options_walBytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_walBytesPerSync(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->wal_bytes_per_sync); } @@ -1719,8 +1716,8 @@ void Java_org_rocksdb_Options_setStrictBytesPerSync( * Method: strictBytesPerSync * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_strictBytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_strictBytesPerSync(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->strict_bytes_per_sync); } @@ -1811,8 +1808,8 @@ void Java_org_rocksdb_Options_setEnableThreadTracking( * Method: enableThreadTracking * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_enableThreadTracking( - JNIEnv*, jobject, jlong 
jhandle) { +jboolean Java_org_rocksdb_Options_enableThreadTracking(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_thread_tracking); } @@ -1822,8 +1819,9 @@ jboolean Java_org_rocksdb_Options_enableThreadTracking( * Method: setDelayedWriteRate * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setDelayedWriteRate( - JNIEnv*, jobject, jlong jhandle, jlong jdelayed_write_rate) { +void Java_org_rocksdb_Options_setDelayedWriteRate(JNIEnv*, jobject, + jlong jhandle, + jlong jdelayed_write_rate) { auto* opt = reinterpret_cast(jhandle); opt->delayed_write_rate = static_cast(jdelayed_write_rate); } @@ -1833,8 +1831,8 @@ void Java_org_rocksdb_Options_setDelayedWriteRate( * Method: delayedWriteRate * Signature: (J)J */ -jlong Java_org_rocksdb_Options_delayedWriteRate( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_delayedWriteRate(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->delayed_write_rate); } @@ -1855,8 +1853,8 @@ void Java_org_rocksdb_Options_setEnablePipelinedWrite( * Method: enablePipelinedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_enablePipelinedWrite( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_enablePipelinedWrite(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_pipelined_write); } @@ -1866,8 +1864,8 @@ jboolean Java_org_rocksdb_Options_enablePipelinedWrite( * Method: setUnorderedWrite * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUnorderedWrite( - JNIEnv*, jobject, jlong jhandle, jboolean unordered_write) { +void Java_org_rocksdb_Options_setUnorderedWrite(JNIEnv*, jobject, jlong jhandle, + jboolean unordered_write) { reinterpret_cast(jhandle)->unordered_write = static_cast(unordered_write); } @@ -1877,8 +1875,8 @@ void Java_org_rocksdb_Options_setUnorderedWrite( * Method: unorderedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_unorderedWrite( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_unorderedWrite(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->unordered_write; } @@ -1888,8 +1886,9 @@ jboolean Java_org_rocksdb_Options_unorderedWrite( * Method: setAllowConcurrentMemtableWrite * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite( - JNIEnv*, jobject, jlong jhandle, jboolean allow) { +void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite(JNIEnv*, jobject, + jlong jhandle, + jboolean allow) { reinterpret_cast(jhandle) ->allow_concurrent_memtable_write = static_cast(allow); } @@ -1899,8 +1898,8 @@ void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite( * Method: allowConcurrentMemtableWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->allow_concurrent_memtable_write; } @@ -1932,8 +1931,9 @@ jboolean Java_org_rocksdb_Options_enableWriteThreadAdaptiveYield( * Method: setWriteThreadMaxYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec( - JNIEnv*, jobject, jlong jhandle, jlong max) { +void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec(JNIEnv*, jobject, + jlong jhandle, + jlong max) { reinterpret_cast(jhandle) ->write_thread_max_yield_usec = static_cast(max); } @@ -1943,8 +1943,8 @@ 
void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec( * Method: writeThreadMaxYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_max_yield_usec; } @@ -1954,8 +1954,9 @@ jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec( * Method: setWriteThreadSlowYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec( - JNIEnv*, jobject, jlong jhandle, jlong slow) { +void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec(JNIEnv*, jobject, + jlong jhandle, + jlong slow) { reinterpret_cast(jhandle) ->write_thread_slow_yield_usec = static_cast(slow); } @@ -1965,8 +1966,8 @@ void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec( * Method: writeThreadSlowYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_slow_yield_usec; } @@ -1977,8 +1978,7 @@ jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen( - JNIEnv*, jobject, jlong jhandle, - jboolean jskip_stats_update_on_db_open) { + JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_stats_update_on_db_open = static_cast(jskip_stats_update_on_db_open); @@ -1989,8 +1989,8 @@ void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen( * Method: skipStatsUpdateOnDbOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_stats_update_on_db_open); } @@ -2025,8 +2025,7 @@ jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen( * Signature: (JB)V */ void Java_org_rocksdb_Options_setWalRecoveryMode( - JNIEnv*, jobject, jlong jhandle, - jbyte jwal_recovery_mode_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jwal_recovery_mode_value) { auto* opt = reinterpret_cast(jhandle); opt->wal_recovery_mode = ROCKSDB_NAMESPACE::WALRecoveryModeJni::toCppWALRecoveryMode( @@ -2038,8 +2037,8 @@ void Java_org_rocksdb_Options_setWalRecoveryMode( * Method: walRecoveryMode * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_walRecoveryMode( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_walRecoveryMode(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::WALRecoveryModeJni::toJavaWALRecoveryMode( opt->wal_recovery_mode); @@ -2050,8 +2049,8 @@ jbyte Java_org_rocksdb_Options_walRecoveryMode( * Method: setAllow2pc * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAllow2pc( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_2pc) { +void Java_org_rocksdb_Options_setAllow2pc(JNIEnv*, jobject, jlong jhandle, + jboolean jallow_2pc) { auto* opt = reinterpret_cast(jhandle); opt->allow_2pc = static_cast(jallow_2pc); } @@ -2061,8 +2060,7 @@ void Java_org_rocksdb_Options_setAllow2pc( * Method: allow2pc * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allow2pc( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv*, jobject, 
jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_2pc); } @@ -2072,8 +2070,8 @@ jboolean Java_org_rocksdb_Options_allow2pc( * Method: setRowCache * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setRowCache( - JNIEnv*, jobject, jlong jhandle, jlong jrow_cache_handle) { +void Java_org_rocksdb_Options_setRowCache(JNIEnv*, jobject, jlong jhandle, + jlong jrow_cache_handle) { auto* opt = reinterpret_cast(jhandle); auto* row_cache = reinterpret_cast*>( @@ -2081,14 +2079,13 @@ void Java_org_rocksdb_Options_setRowCache( opt->row_cache = *row_cache; } - /* * Class: org_rocksdb_Options * Method: setWalFilter * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setWalFilter( - JNIEnv*, jobject, jlong jhandle, jlong jwal_filter_handle) { +void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jobject, jlong jhandle, + jlong jwal_filter_handle) { auto* opt = reinterpret_cast(jhandle); auto* wal_filter = reinterpret_cast( jwal_filter_handle); @@ -2112,8 +2109,8 @@ void Java_org_rocksdb_Options_setFailIfOptionsFileError( * Method: failIfOptionsFileError * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_failIfOptionsFileError( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->fail_if_options_file_error); } @@ -2123,8 +2120,9 @@ jboolean Java_org_rocksdb_Options_failIfOptionsFileError( * Method: setDumpMallocStats * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setDumpMallocStats( - JNIEnv*, jobject, jlong jhandle, jboolean jdump_malloc_stats) { +void Java_org_rocksdb_Options_setDumpMallocStats(JNIEnv*, jobject, + jlong jhandle, + jboolean jdump_malloc_stats) { auto* opt = reinterpret_cast(jhandle); opt->dump_malloc_stats = static_cast(jdump_malloc_stats); } @@ -2134,8 +2132,8 @@ void Java_org_rocksdb_Options_setDumpMallocStats( * Method: dumpMallocStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_dumpMallocStats( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_dumpMallocStats(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->dump_malloc_stats); } @@ -2157,8 +2155,8 @@ void Java_org_rocksdb_Options_setAvoidFlushDuringRecovery( * Method: avoidFlushDuringRecovery * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_recovery); } @@ -2336,8 +2334,8 @@ void Java_org_rocksdb_Options_setAvoidFlushDuringShutdown( * Method: avoidFlushDuringShutdown * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_shutdown); } @@ -2358,8 +2356,8 @@ void Java_org_rocksdb_Options_setAllowIngestBehind( * Method: allowIngestBehind * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allowIngestBehind( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allowIngestBehind(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_ingest_behind); } @@ -2369,8 +2367,8 @@ jboolean 
Java_org_rocksdb_Options_allowIngestBehind( * Method: setTwoWriteQueues * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setTwoWriteQueues( - JNIEnv*, jobject, jlong jhandle, jboolean jtwo_write_queues) { +void Java_org_rocksdb_Options_setTwoWriteQueues(JNIEnv*, jobject, jlong jhandle, + jboolean jtwo_write_queues) { auto* opt = reinterpret_cast(jhandle); opt->two_write_queues = jtwo_write_queues == JNI_TRUE; } @@ -2380,8 +2378,8 @@ void Java_org_rocksdb_Options_setTwoWriteQueues( * Method: twoWriteQueues * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_twoWriteQueues( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_twoWriteQueues(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->two_write_queues); } @@ -2391,8 +2389,8 @@ jboolean Java_org_rocksdb_Options_twoWriteQueues( * Method: setManualWalFlush * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setManualWalFlush( - JNIEnv*, jobject, jlong jhandle, jboolean jmanual_wal_flush) { +void Java_org_rocksdb_Options_setManualWalFlush(JNIEnv*, jobject, jlong jhandle, + jboolean jmanual_wal_flush) { auto* opt = reinterpret_cast(jhandle); opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE; } @@ -2402,8 +2400,8 @@ void Java_org_rocksdb_Options_setManualWalFlush( * Method: manualWalFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_manualWalFlush( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_manualWalFlush(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->manual_wal_flush); } @@ -2413,8 +2411,8 @@ jboolean Java_org_rocksdb_Options_manualWalFlush( * Method: setAtomicFlush * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setAtomicFlush( - JNIEnv*, jobject, jlong jhandle, jboolean jatomic_flush) { +void Java_org_rocksdb_Options_setAtomicFlush(JNIEnv*, jobject, jlong jhandle, + jboolean jatomic_flush) { auto* opt = reinterpret_cast(jhandle); opt->atomic_flush = jatomic_flush == JNI_TRUE; } @@ -2424,8 +2422,7 @@ void Java_org_rocksdb_Options_setAtomicFlush( * Method: atomicFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_atomicFlush( - JNIEnv *, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_atomicFlush(JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->atomic_flush); } @@ -2434,8 +2431,8 @@ jboolean Java_org_rocksdb_Options_atomicFlush( * Method: tableFactoryName * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_Options_tableFactoryName( - JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Options_tableFactoryName(JNIEnv* env, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::TableFactory* tf = opt->table_factory.get(); @@ -2451,8 +2448,8 @@ jstring Java_org_rocksdb_Options_tableFactoryName( * Method: minWriteBufferNumberToMerge * Signature: (J)I */ -jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->min_write_buffer_number_to_merge; } @@ -2473,8 +2470,8 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( * Method: maxWriteBufferNumberToMaintain * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jobject, + jlong 
jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain; } @@ -2510,8 +2507,8 @@ void Java_org_rocksdb_Options_setCompressionType( * Method: compressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_compressionType( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_compressionType(JNIEnv*, jobject, + jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( opts->compression); @@ -2618,8 +2615,8 @@ void Java_org_rocksdb_Options_setCompressionPerLevel( * Method: compressionPerLevel * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_Options_compressionPerLevel( - JNIEnv* env, jobject, jlong jhandle) { +jbyteArray Java_org_rocksdb_Options_compressionPerLevel(JNIEnv* env, jobject, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return rocksdb_compression_list_helper(env, options->compression_per_level); } @@ -2642,8 +2639,8 @@ void Java_org_rocksdb_Options_setBottommostCompressionType( * Method: bottommostCompressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_bottommostCompressionType( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_bottommostCompressionType(JNIEnv*, jobject, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( options->bottommost_compression); @@ -2683,8 +2680,9 @@ void Java_org_rocksdb_Options_setCompressionOptions( * Method: setCompactionStyle * Signature: (JB)V */ -void Java_org_rocksdb_Options_setCompactionStyle( - JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_style) { +void Java_org_rocksdb_Options_setCompactionStyle(JNIEnv*, jobject, + jlong jhandle, + jbyte jcompaction_style) { auto* options = reinterpret_cast(jhandle); options->compaction_style = ROCKSDB_NAMESPACE::CompactionStyleJni::toCppCompactionStyle( @@ -2696,8 +2694,8 @@ void Java_org_rocksdb_Options_setCompactionStyle( * Method: compactionStyle * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_compactionStyle( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_compactionStyle(JNIEnv*, jobject, + jlong jhandle) { auto* options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompactionStyleJni::toJavaCompactionStyle( options->compaction_style); @@ -2720,8 +2718,8 @@ void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( * Method: maxTableFilesSizeFIFO * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->compaction_options_fifo.max_table_files_size; } @@ -2731,8 +2729,7 @@ jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO( * Method: numLevels * Signature: (J)I */ -jint Java_org_rocksdb_Options_numLevels( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_numLevels(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->num_levels; } @@ -2741,8 +2738,8 @@ jint Java_org_rocksdb_Options_numLevels( * Method: setNumLevels * Signature: (JI)V */ -void Java_org_rocksdb_Options_setNumLevels( - JNIEnv*, jobject, jlong jhandle, jint jnum_levels) { +void Java_org_rocksdb_Options_setNumLevels(JNIEnv*, jobject, jlong jhandle, + jint jnum_levels) { reinterpret_cast(jhandle)->num_levels = static_cast(jnum_levels); } @@ -2752,8 +2749,9 @@ void Java_org_rocksdb_Options_setNumLevels( * Method: levelZeroFileNumCompactionTrigger * 
Signature: (J)I */ -jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(JNIEnv*, + jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger; } @@ -2764,8 +2762,7 @@ jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle, - jint jlevel0_file_num_compaction_trigger) { + JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger = static_cast(jlevel0_file_num_compaction_trigger); @@ -2776,8 +2773,8 @@ void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( * Method: levelZeroSlowdownWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger; } @@ -2799,8 +2796,8 @@ void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger( * Method: levelZeroStopWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->level0_stop_writes_trigger; } @@ -2822,8 +2819,8 @@ void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger( * Method: targetFileSizeBase * Signature: (J)J */ -jlong Java_org_rocksdb_Options_targetFileSizeBase( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_targetFileSizeBase(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->target_file_size_base; } @@ -2844,8 +2841,8 @@ void Java_org_rocksdb_Options_setTargetFileSizeBase( * Method: targetFileSizeMultiplier * Signature: (J)I */ -jint Java_org_rocksdb_Options_targetFileSizeMultiplier( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_targetFileSizeMultiplier(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->target_file_size_multiplier; } @@ -2867,8 +2864,8 @@ void Java_org_rocksdb_Options_setTargetFileSizeMultiplier( * Method: maxBytesForLevelBase * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxBytesForLevelBase( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxBytesForLevelBase(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_bytes_for_level_base; } @@ -2912,8 +2909,8 @@ void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes( * Method: maxBytesForLevelMultiplier * Signature: (J)D */ -jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier( - JNIEnv*, jobject, jlong jhandle) { +jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_bytes_for_level_multiplier; } @@ -2935,8 +2932,8 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( * Method: maxCompactionBytes * Signature: (J)I */ -jlong Java_org_rocksdb_Options_maxCompactionBytes( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv*, jobject, + jlong jhandle) { return static_cast( reinterpret_cast(jhandle) ->max_compaction_bytes); @@ -2958,8 +2955,7 @@ void 
Java_org_rocksdb_Options_setMaxCompactionBytes( * Method: arenaBlockSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_arenaBlockSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_arenaBlockSize(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->arena_block_size; } @@ -2969,8 +2965,9 @@ jlong Java_org_rocksdb_Options_arenaBlockSize( * Method: setArenaBlockSize * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setArenaBlockSize( - JNIEnv* env, jobject, jlong jhandle, jlong jarena_block_size) { +void Java_org_rocksdb_Options_setArenaBlockSize(JNIEnv* env, jobject, + jlong jhandle, + jlong jarena_block_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(jarena_block_size); if (s.ok()) { @@ -2986,8 +2983,8 @@ void Java_org_rocksdb_Options_setArenaBlockSize( * Method: disableAutoCompactions * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_disableAutoCompactions( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_disableAutoCompactions(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->disable_auto_compactions; } @@ -3008,8 +3005,8 @@ void Java_org_rocksdb_Options_setDisableAutoCompactions( * Method: maxSequentialSkipInIterations * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_sequential_skip_in_iterations; } @@ -3020,8 +3017,7 @@ jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( - JNIEnv*, jobject, jlong jhandle, - jlong jmax_sequential_skip_in_iterations) { + JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { reinterpret_cast(jhandle) ->max_sequential_skip_in_iterations = static_cast(jmax_sequential_skip_in_iterations); @@ -3032,8 +3028,8 @@ void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( * Method: inplaceUpdateSupport * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_inplaceUpdateSupport( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->inplace_update_support; } @@ -3054,8 +3050,8 @@ void Java_org_rocksdb_Options_setInplaceUpdateSupport( * Method: inplaceUpdateNumLocks * Signature: (J)J */ -jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->inplace_update_num_locks; } @@ -3082,8 +3078,8 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( * Method: memtablePrefixBloomSizeRatio * Signature: (J)I */ -jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio( - JNIEnv*, jobject, jlong jhandle) { +jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_prefix_bloom_size_ratio; } @@ -3152,8 +3148,7 @@ void Java_org_rocksdb_Options_setMemtableWholeKeyFiltering( * Method: bloomLocality * Signature: (J)I */ -jint Java_org_rocksdb_Options_bloomLocality( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_bloomLocality(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->bloom_locality; } @@ -3162,8 +3157,8 @@ jint 
Java_org_rocksdb_Options_bloomLocality( * Method: setBloomLocality * Signature: (JI)V */ -void Java_org_rocksdb_Options_setBloomLocality( - JNIEnv*, jobject, jlong jhandle, jint jbloom_locality) { +void Java_org_rocksdb_Options_setBloomLocality(JNIEnv*, jobject, jlong jhandle, + jint jbloom_locality) { reinterpret_cast(jhandle)->bloom_locality = static_cast(jbloom_locality); } @@ -3173,8 +3168,8 @@ void Java_org_rocksdb_Options_setBloomLocality( * Method: maxSuccessiveMerges * Signature: (J)J */ -jlong Java_org_rocksdb_Options_maxSuccessiveMerges( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_maxSuccessiveMerges(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_successive_merges; } @@ -3201,8 +3196,8 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( * Method: optimizeFiltersForHits * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_optimizeFiltersForHits( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_optimizeFiltersForHits(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->optimize_filters_for_hits; } @@ -3296,8 +3291,8 @@ void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( * Method: prepareForBulkLoad * Signature: (J)V */ -void Java_org_rocksdb_Options_prepareForBulkLoad( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_Options_prepareForBulkLoad(JNIEnv*, jobject, + jlong jhandle) { reinterpret_cast(jhandle)->PrepareForBulkLoad(); } @@ -3306,8 +3301,8 @@ void Java_org_rocksdb_Options_prepareForBulkLoad( * Method: memtableHugePageSize * Signature: (J)J */ -jlong Java_org_rocksdb_Options_memtableHugePageSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_memtableHugePageSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->memtable_huge_page_size; } @@ -3334,8 +3329,8 @@ void Java_org_rocksdb_Options_setMemtableHugePageSize( * Method: softPendingCompactionBytesLimit * Signature: (J)J */ -jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->soft_pending_compaction_bytes_limit; } @@ -3358,8 +3353,8 @@ void Java_org_rocksdb_Options_setSoftPendingCompactionBytesLimit( * Method: softHardCompactionBytesLimit * Signature: (J)J */ -jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->hard_pending_compaction_bytes_limit; } @@ -3382,8 +3377,8 @@ void Java_org_rocksdb_Options_setHardPendingCompactionBytesLimit( * Method: level0FileNumCompactionTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger; } @@ -3394,8 +3389,7 @@ jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger( - JNIEnv*, jobject, jlong jhandle, - jint jlevel0_file_num_compaction_trigger) { + JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { reinterpret_cast(jhandle) ->level0_file_num_compaction_trigger = 
static_cast(jlevel0_file_num_compaction_trigger); @@ -3406,8 +3400,8 @@ void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger( * Method: level0SlowdownWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->level0_slowdown_writes_trigger; } @@ -3429,8 +3423,8 @@ void Java_org_rocksdb_Options_setLevel0SlowdownWritesTrigger( * Method: level0StopWritesTrigger * Signature: (J)I */ -jint Java_org_rocksdb_Options_level0StopWritesTrigger( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_Options_level0StopWritesTrigger(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->level0_stop_writes_trigger; } @@ -3517,8 +3511,8 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional( * Method: paranoidFileChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_paranoidFileChecks( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_paranoidFileChecks(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_file_checks; } @@ -3552,8 +3546,8 @@ void Java_org_rocksdb_Options_setCompactionPriority( * Method: compactionPriority * Signature: (J)B */ -jbyte Java_org_rocksdb_Options_compactionPriority( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Options_compactionPriority(JNIEnv*, jobject, + jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompactionPriorityJni::toJavaCompactionPriority( opts->compaction_pri); @@ -3564,8 +3558,9 @@ jbyte Java_org_rocksdb_Options_compactionPriority( * Method: setReportBgIoStats * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setReportBgIoStats( - JNIEnv*, jobject, jlong jhandle, jboolean jreport_bg_io_stats) { +void Java_org_rocksdb_Options_setReportBgIoStats(JNIEnv*, jobject, + jlong jhandle, + jboolean jreport_bg_io_stats) { auto* opts = reinterpret_cast(jhandle); opts->report_bg_io_stats = static_cast(jreport_bg_io_stats); } @@ -3575,8 +3570,8 @@ void Java_org_rocksdb_Options_setReportBgIoStats( * Method: reportBgIoStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_reportBgIoStats( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_reportBgIoStats(JNIEnv*, jobject, + jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->report_bg_io_stats); } @@ -3586,8 +3581,8 @@ jboolean Java_org_rocksdb_Options_reportBgIoStats( * Method: setTtl * Signature: (JJ)V */ -void Java_org_rocksdb_Options_setTtl( - JNIEnv*, jobject, jlong jhandle, jlong jttl) { +void Java_org_rocksdb_Options_setTtl(JNIEnv*, jobject, jlong jhandle, + jlong jttl) { auto* opts = reinterpret_cast(jhandle); opts->ttl = static_cast(jttl); } @@ -3597,8 +3592,7 @@ void Java_org_rocksdb_Options_setTtl( * Method: ttl * Signature: (J)J */ -jlong Java_org_rocksdb_Options_ttl( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_Options_ttl(JNIEnv*, jobject, jlong jhandle) { auto* opts = reinterpret_cast(jhandle); return static_cast(opts->ttl); } @@ -3670,8 +3664,8 @@ void Java_org_rocksdb_Options_setForceConsistencyChecks( * Method: forceConsistencyChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_forceConsistencyChecks( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_Options_forceConsistencyChecks(JNIEnv*, jobject, + jlong jhandle) { auto* opts = 
reinterpret_cast(jhandle); return static_cast(opts->force_consistency_checks); } @@ -3918,8 +3912,8 @@ jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jobject, * Method: newColumnFamilyOptions * Signature: ()J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions(JNIEnv*, + jclass) { auto* op = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(); return GET_CPLUSPLUS_POINTER(op); } @@ -4019,8 +4013,8 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__Ljav * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal( - JNIEnv*, jobject, jlong handle) { +void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal(JNIEnv*, jobject, + jlong handle) { auto* cfo = reinterpret_cast(handle); assert(cfo != nullptr); delete cfo; @@ -4227,8 +4221,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( * Method: writeBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->write_buffer_size; } @@ -4249,8 +4243,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( * Method: maxWriteBufferNumber * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number; } @@ -4356,8 +4350,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionThreadLimiter( * Method: tableFactoryName * Signature: (J)Ljava/lang/String */ -jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName( - JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName(JNIEnv* env, + jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); ROCKSDB_NAMESPACE::TableFactory* tf = opt->table_factory.get(); @@ -4481,8 +4476,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( * Method: compressionType * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType(JNIEnv*, jobject, + jlong jhandle) { auto* cf_opts = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompressionTypeJni::toJavaCompressionType( @@ -4596,8 +4591,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( * Method: compactionStyle * Signature: (J)B */ -jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle(JNIEnv*, jobject, + jlong jhandle) { auto* cf_options = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::CompactionStyleJni::toJavaCompactionStyle( @@ -4632,8 +4627,8 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO( * Method: numLevels * Signature: (J)I */ -jint Java_org_rocksdb_ColumnFamilyOptions_numLevels( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_ColumnFamilyOptions_numLevels(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->num_levels; } @@ -4643,8 +4638,9 @@ jint Java_org_rocksdb_ColumnFamilyOptions_numLevels( * Method: setNumLevels * Signature: (JI)V */ -void 
Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(
-    JNIEnv*, jobject, jlong jhandle, jint jnum_levels) {
+void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(JNIEnv*, jobject,
+                                                       jlong jhandle,
+                                                       jint jnum_levels) {
   reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->num_levels = static_cast<int>(jnum_levels);
 }
@@ -4666,8 +4662,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger(
-    JNIEnv*, jobject, jlong jhandle,
-    jint jlevel0_file_num_compaction_trigger) {
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) {
   reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->level0_file_num_compaction_trigger =
           static_cast<int>(jlevel0_file_num_compaction_trigger);
@@ -4724,8 +4719,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger(
  * Method:    targetFileSizeBase
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(JNIEnv*, jobject,
+                                                              jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->target_file_size_base;
 }
@@ -4769,8 +4764,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier(
  * Method:    maxBytesForLevelBase
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->max_bytes_for_level_base;
 }
@@ -4837,8 +4833,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier(
  * Method:    maxCompactionBytes
  * Signature: (J)I
  */
-jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv*, jobject,
+                                                              jlong jhandle) {
   return static_cast<jlong>(
       reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
           ->max_compaction_bytes);
@@ -4860,8 +4856,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxCompactionBytes(
  * Method:    arenaBlockSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(JNIEnv*, jobject,
+                                                          jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->arena_block_size;
 }
@@ -4922,8 +4918,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations(
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations(
-    JNIEnv*, jobject, jlong jhandle,
-    jlong jmax_sequential_skip_in_iterations) {
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) {
   reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->max_sequential_skip_in_iterations =
           static_cast<int64_t>(jmax_sequential_skip_in_iterations);
@@ -5054,8 +5049,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableWholeKeyFiltering(
  * Method:    bloomLocality
  * Signature: (J)I
  */
-jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(
-    JNIEnv*, jobject, jlong jhandle) {
+jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(JNIEnv*, jobject,
+                                                        jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->bloom_locality;
 }
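Every wrapper reformatted in this file follows the same handle-passing convention: the Java options object stores the address of a heap-allocated native struct in a plain long field, and each native method casts that value back before reading or writing a field. A minimal self-contained sketch of that convention follows; NativeOptions and the function names are illustrative stand-ins, not RocksDB symbols (the real code boxes pointers through GET_CPLUSPLUS_POINTER and the ROCKSDB_NAMESPACE option classes).

#include <cstdint>

// Stand-ins for the jni.h typedefs, so the sketch compiles on its own.
using jint = int32_t;
using jlong = int64_t;

// Illustrative stand-in for ROCKSDB_NAMESPACE::ColumnFamilyOptions.
struct NativeOptions {
  int num_levels = 7;
};

// Java calls this once and keeps the returned value in a long field.
jlong NewOptionsHandle() {
  return reinterpret_cast<jlong>(new NativeOptions());
}

// Setter pattern: unbox the handle, then narrow the JNI value into the field.
void SetNumLevels(jlong jhandle, jint jnum_levels) {
  reinterpret_cast<NativeOptions*>(jhandle)->num_levels =
      static_cast<int>(jnum_levels);
}

// Getter pattern: unbox the handle, then widen the field back to a JNI type.
jint GetNumLevels(jlong jhandle) {
  return static_cast<jint>(
      reinterpret_cast<NativeOptions*>(jhandle)->num_levels);
}

// disposeInternal pattern: the Java side must eventually hand the handle
// back so the native object can be deleted.
void DisposeOptionsHandle(jlong jhandle) {
  delete reinterpret_cast<NativeOptions*>(jhandle);
}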
@@ -5076,8 +5071,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality(
  * Method:    maxSuccessiveMerges
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(JNIEnv*, jobject,
+                                                               jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->max_successive_merges;
 }
@@ -5127,8 +5122,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits(
  * Method:    memtableHugePageSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize(JNIEnv*,
+                                                                jobject,
+                                                                jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->memtable_huge_page_size;
 }
@@ -5215,8 +5211,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0FileNumCompactionTrigger(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_ColumnFamilyOptions_setLevel0FileNumCompactionTrigger(
-    JNIEnv*, jobject, jlong jhandle,
-    jint jlevel0_file_num_compaction_trigger) {
+    JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) {
   reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
       ->level0_file_num_compaction_trigger =
           static_cast<int>(jlevel0_file_num_compaction_trigger);
@@ -5273,7 +5268,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0StopWritesTrigger(
  * Method:    maxBytesForLevelMultiplierAdditional
  * Signature: (J)[I
  */
-jintArray Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditional(
+jintArray
+Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditional(
     JNIEnv* env, jobject, jlong jhandle) {
   auto mbflma =
       reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
           ->max_bytes_for_level_multiplier_additional;
@@ -5375,8 +5371,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionPriority(
  * Method:    compactionPriority
  * Signature: (J)B
  */
-jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority(
-    JNIEnv*, jobject, jlong jhandle) {
+jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority(JNIEnv*, jobject,
+                                                              jlong jhandle) {
   auto* cf_opts =
       reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
   return ROCKSDB_NAMESPACE::CompactionPriorityJni::toJavaCompactionPriority(
@@ -5400,8 +5396,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setReportBgIoStats(
  * Method:    reportBgIoStats
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(
-    JNIEnv*, jobject, jlong jhandle) {
+jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(JNIEnv*, jobject,
+                                                              jlong jhandle) {
   auto* cf_opts =
       reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
   return static_cast<bool>(cf_opts->report_bg_io_stats);
@@ -5412,8 +5408,8 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats(
  * Method:    setTtl
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_ColumnFamilyOptions_setTtl(
-    JNIEnv*, jobject, jlong jhandle, jlong jttl) {
+void Java_org_rocksdb_ColumnFamilyOptions_setTtl(JNIEnv*, jobject,
+                                                 jlong jhandle, jlong jttl) {
   auto* cf_opts =
       reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
   cf_opts->ttl = static_cast<uint64_t>(jttl);
@@ -5424,8 +5420,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTtl(
  * Method:    ttl
  * Signature: (J)J
  */
-JNIEXPORT jlong JNICALL Java_org_rocksdb_ColumnFamilyOptions_ttl(
-    JNIEnv*, jobject, jlong jhandle) {
+JNIEXPORT jlong JNICALL
+Java_org_rocksdb_ColumnFamilyOptions_ttl(JNIEnv*, jobject, jlong jhandle) {
   auto* cf_opts =
       reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
   return static_cast<jlong>(cf_opts->ttl);
@@ -5778,8 +5774,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*,
  * Method:    newDBOptions
  * Signature: ()J
  */
-jlong Java_org_rocksdb_DBOptions_newDBOptions(
-    JNIEnv*, jclass) {
+jlong Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv*, jclass) {
   auto* dbop = new ROCKSDB_NAMESPACE::DBOptions();
   return GET_CPLUSPLUS_POINTER(dbop);
 }
@@ -5789,8 +5784,7 @@ jlong Java_org_rocksdb_DBOptions_newDBOptions(
  * Method:    copyDBOptions
  * Signature: (J)J
  */
-jlong
Java_org_rocksdb_DBOptions_copyDBOptions( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_copyDBOptions(JNIEnv*, jclass, jlong jhandle) { auto new_opt = new ROCKSDB_NAMESPACE::DBOptions( *(reinterpret_cast(jhandle))); return GET_CPLUSPLUS_POINTER(new_opt); @@ -5877,8 +5871,8 @@ jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__Ljava_lang_String_2( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_DBOptions_disposeInternal( - JNIEnv*, jobject, jlong handle) { +void Java_org_rocksdb_DBOptions_disposeInternal(JNIEnv*, jobject, + jlong handle) { auto* dbo = reinterpret_cast(handle); assert(dbo != nullptr); delete dbo; @@ -5889,8 +5883,8 @@ void Java_org_rocksdb_DBOptions_disposeInternal( * Method: optimizeForSmallDb * Signature: (J)V */ -void Java_org_rocksdb_DBOptions_optimizeForSmallDb( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_DBOptions_optimizeForSmallDb(JNIEnv*, jobject, + jlong jhandle) { reinterpret_cast(jhandle) ->OptimizeForSmallDb(); } @@ -5900,8 +5894,8 @@ void Java_org_rocksdb_DBOptions_optimizeForSmallDb( * Method: setEnv * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setEnv( - JNIEnv*, jobject, jlong jhandle, jlong jenv_handle) { +void Java_org_rocksdb_DBOptions_setEnv(JNIEnv*, jobject, jlong jhandle, + jlong jenv_handle) { reinterpret_cast(jhandle)->env = reinterpret_cast(jenv_handle); } @@ -5911,8 +5905,9 @@ void Java_org_rocksdb_DBOptions_setEnv( * Method: setIncreaseParallelism * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setIncreaseParallelism( - JNIEnv*, jobject, jlong jhandle, jint totalThreads) { +void Java_org_rocksdb_DBOptions_setIncreaseParallelism(JNIEnv*, jobject, + jlong jhandle, + jint totalThreads) { reinterpret_cast(jhandle)->IncreaseParallelism( static_cast(totalThreads)); } @@ -5922,8 +5917,9 @@ void Java_org_rocksdb_DBOptions_setIncreaseParallelism( * Method: setCreateIfMissing * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setCreateIfMissing( - JNIEnv*, jobject, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_DBOptions_setCreateIfMissing(JNIEnv*, jobject, + jlong jhandle, + jboolean flag) { reinterpret_cast(jhandle)->create_if_missing = flag; } @@ -5933,8 +5929,8 @@ void Java_org_rocksdb_DBOptions_setCreateIfMissing( * Method: createIfMissing * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_createIfMissing( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_createIfMissing(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->create_if_missing; } @@ -5944,8 +5940,9 @@ jboolean Java_org_rocksdb_DBOptions_createIfMissing( * Method: setCreateMissingColumnFamilies * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies(JNIEnv*, jobject, + jlong jhandle, + jboolean flag) { reinterpret_cast(jhandle) ->create_missing_column_families = flag; } @@ -5955,8 +5952,9 @@ void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies( * Method: createMissingColumnFamilies * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies(JNIEnv*, + jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->create_missing_column_families; } @@ -5966,8 +5964,9 @@ jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies( * Method: setErrorIfExists * 
Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setErrorIfExists( - JNIEnv*, jobject, jlong jhandle, jboolean error_if_exists) { +void Java_org_rocksdb_DBOptions_setErrorIfExists(JNIEnv*, jobject, + jlong jhandle, + jboolean error_if_exists) { reinterpret_cast(jhandle)->error_if_exists = static_cast(error_if_exists); } @@ -5977,8 +5976,8 @@ void Java_org_rocksdb_DBOptions_setErrorIfExists( * Method: errorIfExists * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_errorIfExists( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_errorIfExists(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->error_if_exists; } @@ -5988,8 +5987,9 @@ jboolean Java_org_rocksdb_DBOptions_errorIfExists( * Method: setParanoidChecks * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setParanoidChecks( - JNIEnv*, jobject, jlong jhandle, jboolean paranoid_checks) { +void Java_org_rocksdb_DBOptions_setParanoidChecks(JNIEnv*, jobject, + jlong jhandle, + jboolean paranoid_checks) { reinterpret_cast(jhandle)->paranoid_checks = static_cast(paranoid_checks); } @@ -5999,8 +5999,8 @@ void Java_org_rocksdb_DBOptions_setParanoidChecks( * Method: paranoidChecks * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_paranoidChecks( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_paranoidChecks(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->paranoid_checks; } @@ -6010,8 +6010,8 @@ jboolean Java_org_rocksdb_DBOptions_paranoidChecks( * Method: setRateLimiter * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setRateLimiter( - JNIEnv*, jobject, jlong jhandle, jlong jrate_limiter_handle) { +void Java_org_rocksdb_DBOptions_setRateLimiter(JNIEnv*, jobject, jlong jhandle, + jlong jrate_limiter_handle) { std::shared_ptr* pRateLimiter = reinterpret_cast*>( jrate_limiter_handle); @@ -6038,8 +6038,8 @@ void Java_org_rocksdb_DBOptions_setSstFileManager( * Method: setLogger * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setLogger( - JNIEnv*, jobject, jlong jhandle, jlong jlogger_handle) { +void Java_org_rocksdb_DBOptions_setLogger(JNIEnv*, jobject, jlong jhandle, + jlong jlogger_handle) { std::shared_ptr* pLogger = reinterpret_cast*>( jlogger_handle); @@ -6051,8 +6051,8 @@ void Java_org_rocksdb_DBOptions_setLogger( * Method: setInfoLogLevel * Signature: (JB)V */ -void Java_org_rocksdb_DBOptions_setInfoLogLevel( - JNIEnv*, jobject, jlong jhandle, jbyte jlog_level) { +void Java_org_rocksdb_DBOptions_setInfoLogLevel(JNIEnv*, jobject, jlong jhandle, + jbyte jlog_level) { reinterpret_cast(jhandle)->info_log_level = static_cast(jlog_level); } @@ -6062,8 +6062,7 @@ void Java_org_rocksdb_DBOptions_setInfoLogLevel( * Method: infoLogLevel * Signature: (J)B */ -jbyte Java_org_rocksdb_DBOptions_infoLogLevel( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_DBOptions_infoLogLevel(JNIEnv*, jobject, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->info_log_level); } @@ -6073,8 +6072,9 @@ jbyte Java_org_rocksdb_DBOptions_infoLogLevel( * Method: setMaxTotalWalSize * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setMaxTotalWalSize( - JNIEnv*, jobject, jlong jhandle, jlong jmax_total_wal_size) { +void Java_org_rocksdb_DBOptions_setMaxTotalWalSize(JNIEnv*, jobject, + jlong jhandle, + jlong jmax_total_wal_size) { reinterpret_cast(jhandle)->max_total_wal_size = static_cast(jmax_total_wal_size); } @@ -6084,8 +6084,8 @@ void Java_org_rocksdb_DBOptions_setMaxTotalWalSize( * Method: maxTotalWalSize * 
Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_maxTotalWalSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_maxTotalWalSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_total_wal_size; } @@ -6095,8 +6095,8 @@ jlong Java_org_rocksdb_DBOptions_maxTotalWalSize( * Method: setMaxOpenFiles * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setMaxOpenFiles( - JNIEnv*, jobject, jlong jhandle, jint max_open_files) { +void Java_org_rocksdb_DBOptions_setMaxOpenFiles(JNIEnv*, jobject, jlong jhandle, + jint max_open_files) { reinterpret_cast(jhandle)->max_open_files = static_cast(max_open_files); } @@ -6106,8 +6106,7 @@ void Java_org_rocksdb_DBOptions_setMaxOpenFiles( * Method: maxOpenFiles * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxOpenFiles( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_maxOpenFiles(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->max_open_files; } @@ -6128,8 +6127,8 @@ void Java_org_rocksdb_DBOptions_setMaxFileOpeningThreads( * Method: maxFileOpeningThreads * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_file_opening_threads); } @@ -6139,8 +6138,8 @@ jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads( * Method: setStatistics * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setStatistics( - JNIEnv*, jobject, jlong jhandle, jlong jstatistics_handle) { +void Java_org_rocksdb_DBOptions_setStatistics(JNIEnv*, jobject, jlong jhandle, + jlong jstatistics_handle) { auto* opt = reinterpret_cast(jhandle); auto* pSptr = reinterpret_cast*>( @@ -6153,8 +6152,7 @@ void Java_org_rocksdb_DBOptions_setStatistics( * Method: statistics * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_statistics( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_statistics(JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); std::shared_ptr sptr = opt->statistics; if (sptr == nullptr) { @@ -6171,8 +6169,8 @@ jlong Java_org_rocksdb_DBOptions_statistics( * Method: setUseFsync * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setUseFsync( - JNIEnv*, jobject, jlong jhandle, jboolean use_fsync) { +void Java_org_rocksdb_DBOptions_setUseFsync(JNIEnv*, jobject, jlong jhandle, + jboolean use_fsync) { reinterpret_cast(jhandle)->use_fsync = static_cast(use_fsync); } @@ -6182,8 +6180,7 @@ void Java_org_rocksdb_DBOptions_setUseFsync( * Method: useFsync * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useFsync( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_useFsync(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->use_fsync; } @@ -6192,9 +6189,9 @@ jboolean Java_org_rocksdb_DBOptions_useFsync( * Method: setDbPaths * Signature: (J[Ljava/lang/String;[J)V */ -void Java_org_rocksdb_DBOptions_setDbPaths( - JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, - jlongArray jtarget_sizes) { +void Java_org_rocksdb_DBOptions_setDbPaths(JNIEnv* env, jobject, jlong jhandle, + jobjectArray jpaths, + jlongArray jtarget_sizes) { std::vector db_paths; jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); if (ptr_jtarget_size == nullptr) { @@ -6238,8 +6235,7 @@ void Java_org_rocksdb_DBOptions_setDbPaths( * Method: dbPathsLen * Signature: (J)J */ -jlong 
Java_org_rocksdb_DBOptions_dbPathsLen(
-    JNIEnv*, jobject, jlong jhandle) {
+jlong Java_org_rocksdb_DBOptions_dbPathsLen(JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
   return static_cast<jlong>(opt->db_paths.size());
 }
@@ -6249,9 +6245,9 @@ jlong Java_org_rocksdb_DBOptions_dbPathsLen(
  * Method:    dbPaths
  * Signature: (J[Ljava/lang/String;[J)V
  */
-void Java_org_rocksdb_DBOptions_dbPaths(
-    JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths,
-    jlongArray jtarget_sizes) {
+void Java_org_rocksdb_DBOptions_dbPaths(JNIEnv* env, jobject, jlong jhandle,
+                                        jobjectArray jpaths,
+                                        jlongArray jtarget_sizes) {
   jboolean is_copy;
   jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy);
   if (ptr_jtarget_size == nullptr) {
@@ -6290,8 +6286,8 @@ void Java_org_rocksdb_DBOptions_dbPaths(
  * Method:    setDbLogDir
  * Signature: (JLjava/lang/String)V
  */
-void Java_org_rocksdb_DBOptions_setDbLogDir(
-    JNIEnv* env, jobject, jlong jhandle, jstring jdb_log_dir) {
+void Java_org_rocksdb_DBOptions_setDbLogDir(JNIEnv* env, jobject, jlong jhandle,
+                                            jstring jdb_log_dir) {
   const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr);
   if (log_dir == nullptr) {
     // exception thrown: OutOfMemoryError
@@ -6308,8 +6304,8 @@ void Java_org_rocksdb_DBOptions_setDbLogDir(
  * Method:    dbLogDir
  * Signature: (J)Ljava/lang/String
  */
-jstring Java_org_rocksdb_DBOptions_dbLogDir(
-    JNIEnv* env, jobject, jlong jhandle) {
+jstring Java_org_rocksdb_DBOptions_dbLogDir(JNIEnv* env, jobject,
+                                            jlong jhandle) {
   return env->NewStringUTF(
       reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
           ->db_log_dir.c_str());
@@ -6320,8 +6316,8 @@ jstring Java_org_rocksdb_DBOptions_dbLogDir(
  * Method:    setWalDir
  * Signature: (JLjava/lang/String)V
  */
-void Java_org_rocksdb_DBOptions_setWalDir(
-    JNIEnv* env, jobject, jlong jhandle, jstring jwal_dir) {
+void Java_org_rocksdb_DBOptions_setWalDir(JNIEnv* env, jobject, jlong jhandle,
+                                          jstring jwal_dir) {
   const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
   reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)->wal_dir.assign(
       wal_dir);
@@ -6333,8 +6329,7 @@ void Java_org_rocksdb_DBOptions_setWalDir(
  * Method:    walDir
  * Signature: (J)Ljava/lang/String
  */
-jstring Java_org_rocksdb_DBOptions_walDir(
-    JNIEnv* env, jobject, jlong jhandle) {
+jstring Java_org_rocksdb_DBOptions_walDir(JNIEnv* env, jobject, jlong jhandle) {
   return env->NewStringUTF(
       reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
          ->wal_dir.c_str());
@@ -6367,8 +6362,9 @@ jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros(
  * Method:    setMaxBackgroundCompactions
  * Signature: (JI)V
  */
-void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(
-    JNIEnv*, jobject, jlong jhandle, jint max) {
+void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(JNIEnv*, jobject,
+                                                            jlong jhandle,
+                                                            jint max) {
   reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
       ->max_background_compactions = static_cast<int>(max);
 }
@@ -6378,8 +6374,8 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(
  * Method:    maxBackgroundCompactions
  * Signature: (J)I
  */
-jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(
-    JNIEnv*, jobject, jlong jhandle) {
+jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(JNIEnv*, jobject,
+                                                         jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle)
       ->max_background_compactions;
 }
@@ -6389,8 +6385,8 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(
  * Method:    setMaxSubcompactions
  * Signature: (JI)V
  */
-void Java_org_rocksdb_DBOptions_setMaxSubcompactions(
-    JNIEnv*, jobject, jlong jhandle, jint max) {
+void Java_org_rocksdb_DBOptions_setMaxSubcompactions(JNIEnv*, jobject,
+                                                     jlong
jhandle, jint max) { reinterpret_cast(jhandle)->max_subcompactions = static_cast(max); } @@ -6400,8 +6396,8 @@ void Java_org_rocksdb_DBOptions_setMaxSubcompactions( * Method: maxSubcompactions * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxSubcompactions( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_maxSubcompactions(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_subcompactions; } @@ -6422,8 +6418,8 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( * Method: maxBackgroundFlushes * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_flushes; } @@ -6433,8 +6429,9 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes( * Method: setMaxBackgroundJobs * Signature: (JI)V */ -void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs( - JNIEnv*, jobject, jlong jhandle, jint max_background_jobs) { +void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs(JNIEnv*, jobject, + jlong jhandle, + jint max_background_jobs) { reinterpret_cast(jhandle) ->max_background_jobs = static_cast(max_background_jobs); } @@ -6444,8 +6441,8 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs( * Method: maxBackgroundJobs * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_maxBackgroundJobs( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_maxBackgroundJobs(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_background_jobs; } @@ -6455,8 +6452,9 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundJobs( * Method: setMaxLogFileSize * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setMaxLogFileSize( - JNIEnv* env, jobject, jlong jhandle, jlong max_log_file_size) { +void Java_org_rocksdb_DBOptions_setMaxLogFileSize(JNIEnv* env, jobject, + jlong jhandle, + jlong max_log_file_size) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(max_log_file_size); if (s.ok()) { @@ -6472,8 +6470,8 @@ void Java_org_rocksdb_DBOptions_setMaxLogFileSize( * Method: maxLogFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_maxLogFileSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_maxLogFileSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_log_file_size; } @@ -6500,8 +6498,8 @@ void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( * Method: logFileTimeToRoll * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->log_file_time_to_roll; } @@ -6511,8 +6509,9 @@ jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll( * Method: setKeepLogFileNum * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setKeepLogFileNum( - JNIEnv* env, jobject, jlong jhandle, jlong keep_log_file_num) { +void Java_org_rocksdb_DBOptions_setKeepLogFileNum(JNIEnv* env, jobject, + jlong jhandle, + jlong keep_log_file_num) { auto s = ROCKSDB_NAMESPACE::JniUtil::check_if_jlong_fits_size_t(keep_log_file_num); if (s.ok()) { @@ -6528,8 +6527,8 @@ void Java_org_rocksdb_DBOptions_setKeepLogFileNum( * Method: keepLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_keepLogFileNum( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_keepLogFileNum(JNIEnv*, jobject, + jlong jhandle) 
{ return reinterpret_cast(jhandle) ->keep_log_file_num; } @@ -6556,8 +6555,8 @@ void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( * Method: recycleLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_recycleLogFileNum( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->recycle_log_file_num; } @@ -6578,8 +6577,8 @@ void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( * Method: maxManifestFileSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_maxManifestFileSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_maxManifestFileSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->max_manifest_file_size; } @@ -6600,8 +6599,8 @@ void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( * Method: tableCacheNumshardbits * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->table_cache_numshardbits; } @@ -6611,8 +6610,9 @@ jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits( * Method: setWalTtlSeconds * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalTtlSeconds( - JNIEnv*, jobject, jlong jhandle, jlong WAL_ttl_seconds) { +void Java_org_rocksdb_DBOptions_setWalTtlSeconds(JNIEnv*, jobject, + jlong jhandle, + jlong WAL_ttl_seconds) { reinterpret_cast(jhandle)->WAL_ttl_seconds = static_cast(WAL_ttl_seconds); } @@ -6622,8 +6622,8 @@ void Java_org_rocksdb_DBOptions_setWalTtlSeconds( * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_walTtlSeconds( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_walTtlSeconds(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_ttl_seconds; } @@ -6633,8 +6633,9 @@ jlong Java_org_rocksdb_DBOptions_walTtlSeconds( * Method: setWalSizeLimitMB * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalSizeLimitMB( - JNIEnv*, jobject, jlong jhandle, jlong WAL_size_limit_MB) { +void Java_org_rocksdb_DBOptions_setWalSizeLimitMB(JNIEnv*, jobject, + jlong jhandle, + jlong WAL_size_limit_MB) { reinterpret_cast(jhandle)->WAL_size_limit_MB = static_cast(WAL_size_limit_MB); } @@ -6644,8 +6645,8 @@ void Java_org_rocksdb_DBOptions_setWalSizeLimitMB( * Method: walTtlSeconds * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_walSizeLimitMB( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_walSizeLimitMB(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->WAL_size_limit_MB; } @@ -6695,8 +6696,8 @@ void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( * Method: manifestPreallocationSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->manifest_preallocation_size; } @@ -6706,8 +6707,8 @@ jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize( * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useDirectReads( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_reads; } @@ -6717,8 +6718,9 @@ jboolean 
Java_org_rocksdb_DBOptions_useDirectReads( * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setUseDirectReads( - JNIEnv*, jobject, jlong jhandle, jboolean use_direct_reads) { +void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv*, jobject, + jlong jhandle, + jboolean use_direct_reads) { reinterpret_cast(jhandle)->use_direct_reads = static_cast(use_direct_reads); } @@ -6752,8 +6754,9 @@ void Java_org_rocksdb_DBOptions_setUseDirectIoForFlushAndCompaction( * Method: setAllowFAllocate * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllowFAllocate( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_fallocate) { +void Java_org_rocksdb_DBOptions_setAllowFAllocate(JNIEnv*, jobject, + jlong jhandle, + jboolean jallow_fallocate) { reinterpret_cast(jhandle)->allow_fallocate = static_cast(jallow_fallocate); } @@ -6763,8 +6766,8 @@ void Java_org_rocksdb_DBOptions_setAllowFAllocate( * Method: allowFAllocate * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowFAllocate( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allowFAllocate(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_fallocate); } @@ -6774,8 +6777,9 @@ jboolean Java_org_rocksdb_DBOptions_allowFAllocate( * Method: setAllowMmapReads * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllowMmapReads( - JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_reads) { +void Java_org_rocksdb_DBOptions_setAllowMmapReads(JNIEnv*, jobject, + jlong jhandle, + jboolean allow_mmap_reads) { reinterpret_cast(jhandle)->allow_mmap_reads = static_cast(allow_mmap_reads); } @@ -6785,8 +6789,8 @@ void Java_org_rocksdb_DBOptions_setAllowMmapReads( * Method: allowMmapReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowMmapReads( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allowMmapReads(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_reads; } @@ -6796,8 +6800,9 @@ jboolean Java_org_rocksdb_DBOptions_allowMmapReads( * Method: setAllowMmapWrites * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllowMmapWrites( - JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_writes) { +void Java_org_rocksdb_DBOptions_setAllowMmapWrites(JNIEnv*, jobject, + jlong jhandle, + jboolean allow_mmap_writes) { reinterpret_cast(jhandle)->allow_mmap_writes = static_cast(allow_mmap_writes); } @@ -6807,8 +6812,8 @@ void Java_org_rocksdb_DBOptions_setAllowMmapWrites( * Method: allowMmapWrites * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowMmapWrites( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allowMmapWrites(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->allow_mmap_writes; } @@ -6829,8 +6834,8 @@ void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( * Method: isFdCloseOnExec * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->is_fd_close_on_exec; } @@ -6852,8 +6857,8 @@ void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( * Method: statsDumpPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->stats_dump_period_sec; } 
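Several DBOptions setters above (setMaxLogFileSize and setKeepLogFileNum, like setArenaBlockSize earlier) route the incoming value through JniUtil::check_if_jlong_fits_size_t and only assign when the returned Status is ok, because a Java long is a signed 64-bit value while the native field is a size_t. A rough self-contained sketch of the range check such a helper has to perform; checked_jlong_to_size_t is an illustrative name, and throwing here is a simplification of the real helper, which reports failure through its Status return so the wrapper can surface it to Java.

#include <cstddef>
#include <cstdint>
#include <limits>
#include <stdexcept>

using jlong = int64_t;  // stand-in for the jni.h typedef

// Reject values a size_t cannot represent: negatives always, and anything
// above SIZE_MAX on targets where size_t is narrower than 64 bits.
size_t checked_jlong_to_size_t(jlong jvalue) {
  if (jvalue < 0 ||
      static_cast<uint64_t>(jvalue) >
          static_cast<uint64_t>(std::numeric_limits<size_t>::max())) {
    throw std::invalid_argument("jlong value does not fit in size_t");
  }
  return static_cast<size_t>(jvalue);
}

Silently truncating instead would turn an out-of-range Java argument into a small, valid-looking native limit, which is why these wrappers check first and assign second.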
@@ -6875,8 +6880,8 @@ void Java_org_rocksdb_DBOptions_setStatsPersistPeriodSec( * Method: statsPersistPeriodSec * Signature: (J)I */ -jint Java_org_rocksdb_DBOptions_statsPersistPeriodSec( - JNIEnv*, jobject, jlong jhandle) { +jint Java_org_rocksdb_DBOptions_statsPersistPeriodSec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->stats_persist_period_sec; } @@ -6898,8 +6903,8 @@ void Java_org_rocksdb_DBOptions_setStatsHistoryBufferSize( * Method: statsHistoryBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_statsHistoryBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_statsHistoryBufferSize(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->stats_history_buffer_size; } @@ -6920,8 +6925,8 @@ void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( * Method: adviseRandomOnOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->advise_random_on_open; } @@ -6957,8 +6962,8 @@ void Java_org_rocksdb_DBOptions_setWriteBufferManager( * Method: dbWriteBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_write_buffer_size); } @@ -6980,8 +6985,8 @@ void Java_org_rocksdb_DBOptions_setAccessHintOnCompactionStart( * Method: accessHintOnCompactionStart * Signature: (J)B */ -jbyte Java_org_rocksdb_DBOptions_accessHintOnCompactionStart( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_DBOptions_accessHintOnCompactionStart(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::AccessHintJni::toJavaAccessHint( opt->access_hint_on_compaction_start); @@ -7004,8 +7009,8 @@ void Java_org_rocksdb_DBOptions_setCompactionReadaheadSize( * Method: compactionReadaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->compaction_readahead_size); } @@ -7027,8 +7032,8 @@ void Java_org_rocksdb_DBOptions_setRandomAccessMaxBufferSize( * Method: randomAccessMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->random_access_max_buffer_size); } @@ -7050,8 +7055,8 @@ void Java_org_rocksdb_DBOptions_setWritableFileMaxBufferSize( * Method: writableFileMaxBufferSize * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->writable_file_max_buffer_size); } @@ -7072,8 +7077,8 @@ void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( * Method: useAdaptiveMutex * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex( - JNIEnv*, jobject, jlong jhandle) { +jboolean 
Java_org_rocksdb_DBOptions_useAdaptiveMutex(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->use_adaptive_mutex; } @@ -7083,8 +7088,8 @@ jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex( * Method: setBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jlong bytes_per_sync) { +void Java_org_rocksdb_DBOptions_setBytesPerSync(JNIEnv*, jobject, jlong jhandle, + jlong bytes_per_sync) { reinterpret_cast(jhandle)->bytes_per_sync = static_cast(bytes_per_sync); } @@ -7094,8 +7099,7 @@ void Java_org_rocksdb_DBOptions_setBytesPerSync( * Method: bytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_bytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_bytesPerSync(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->bytes_per_sync; } @@ -7105,8 +7109,9 @@ jlong Java_org_rocksdb_DBOptions_bytesPerSync( * Method: setWalBytesPerSync * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalBytesPerSync( - JNIEnv*, jobject, jlong jhandle, jlong jwal_bytes_per_sync) { +void Java_org_rocksdb_DBOptions_setWalBytesPerSync(JNIEnv*, jobject, + jlong jhandle, + jlong jwal_bytes_per_sync) { reinterpret_cast(jhandle)->wal_bytes_per_sync = static_cast(jwal_bytes_per_sync); } @@ -7116,8 +7121,8 @@ void Java_org_rocksdb_DBOptions_setWalBytesPerSync( * Method: walBytesPerSync * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_walBytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_walBytesPerSync(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->wal_bytes_per_sync); } @@ -7138,8 +7143,8 @@ void Java_org_rocksdb_DBOptions_setStrictBytesPerSync( * Method: strictBytesPerSync * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync(JNIEnv*, jobject, + jlong jhandle) { return static_cast( reinterpret_cast(jhandle) ->strict_bytes_per_sync); @@ -7173,8 +7178,9 @@ jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass, * Method: setDelayedWriteRate * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setDelayedWriteRate( - JNIEnv*, jobject, jlong jhandle, jlong jdelayed_write_rate) { +void Java_org_rocksdb_DBOptions_setDelayedWriteRate(JNIEnv*, jobject, + jlong jhandle, + jlong jdelayed_write_rate) { auto* opt = reinterpret_cast(jhandle); opt->delayed_write_rate = static_cast(jdelayed_write_rate); } @@ -7184,8 +7190,8 @@ void Java_org_rocksdb_DBOptions_setDelayedWriteRate( * Method: delayedWriteRate * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_delayedWriteRate( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_delayedWriteRate(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->delayed_write_rate); } @@ -7206,8 +7212,8 @@ void Java_org_rocksdb_DBOptions_setEnablePipelinedWrite( * Method: enablePipelinedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_pipelined_write); } @@ -7217,8 +7223,9 @@ jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite( * Method: setUnorderedWrite * Signature: (JZ)V */ -void 
Java_org_rocksdb_DBOptions_setUnorderedWrite( - JNIEnv*, jobject, jlong jhandle, jboolean junordered_write) { +void Java_org_rocksdb_DBOptions_setUnorderedWrite(JNIEnv*, jobject, + jlong jhandle, + jboolean junordered_write) { auto* opt = reinterpret_cast(jhandle); opt->unordered_write = junordered_write == JNI_TRUE; } @@ -7228,13 +7235,12 @@ void Java_org_rocksdb_DBOptions_setUnorderedWrite( * Method: unorderedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_unorderedWrite( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_unorderedWrite(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->unordered_write); } - /* * Class: org_rocksdb_DBOptions * Method: setEnableThreadTracking @@ -7251,8 +7257,8 @@ void Java_org_rocksdb_DBOptions_setEnableThreadTracking( * Method: enableThreadTracking * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_enableThreadTracking( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_enableThreadTracking(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_thread_tracking); } @@ -7306,8 +7312,9 @@ jboolean Java_org_rocksdb_DBOptions_enableWriteThreadAdaptiveYield( * Method: setWriteThreadMaxYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec( - JNIEnv*, jobject, jlong jhandle, jlong max) { +void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec(JNIEnv*, jobject, + jlong jhandle, + jlong max) { reinterpret_cast(jhandle) ->write_thread_max_yield_usec = static_cast(max); } @@ -7317,8 +7324,8 @@ void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec( * Method: writeThreadMaxYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_max_yield_usec; } @@ -7328,8 +7335,9 @@ jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec( * Method: setWriteThreadSlowYieldUsec * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec( - JNIEnv*, jobject, jlong jhandle, jlong slow) { +void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec(JNIEnv*, jobject, + jlong jhandle, + jlong slow) { reinterpret_cast(jhandle) ->write_thread_slow_yield_usec = static_cast(slow); } @@ -7339,8 +7347,8 @@ void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec( * Method: writeThreadSlowYieldUsec * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->write_thread_slow_yield_usec; } @@ -7362,8 +7370,8 @@ void Java_org_rocksdb_DBOptions_setSkipStatsUpdateOnDbOpen( * Method: skipStatsUpdateOnDbOpen * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_stats_update_on_db_open); } @@ -7410,8 +7418,8 @@ void Java_org_rocksdb_DBOptions_setWalRecoveryMode( * Method: walRecoveryMode * Signature: (J)B */ -jbyte Java_org_rocksdb_DBOptions_walRecoveryMode( - JNIEnv*, jobject, jlong jhandle) { +jbyte 
Java_org_rocksdb_DBOptions_walRecoveryMode(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::WALRecoveryModeJni::toJavaWALRecoveryMode( opt->wal_recovery_mode); @@ -7422,8 +7430,8 @@ jbyte Java_org_rocksdb_DBOptions_walRecoveryMode( * Method: setAllow2pc * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAllow2pc( - JNIEnv*, jobject, jlong jhandle, jboolean jallow_2pc) { +void Java_org_rocksdb_DBOptions_setAllow2pc(JNIEnv*, jobject, jlong jhandle, + jboolean jallow_2pc) { auto* opt = reinterpret_cast(jhandle); opt->allow_2pc = static_cast(jallow_2pc); } @@ -7433,8 +7441,7 @@ void Java_org_rocksdb_DBOptions_setAllow2pc( * Method: allow2pc * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allow2pc( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_2pc); } @@ -7444,8 +7451,8 @@ jboolean Java_org_rocksdb_DBOptions_allow2pc( * Method: setRowCache * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setRowCache( - JNIEnv*, jobject, jlong jhandle, jlong jrow_cache_handle) { +void Java_org_rocksdb_DBOptions_setRowCache(JNIEnv*, jobject, jlong jhandle, + jlong jrow_cache_handle) { auto* opt = reinterpret_cast(jhandle); auto* row_cache = reinterpret_cast*>( @@ -7458,8 +7465,8 @@ void Java_org_rocksdb_DBOptions_setRowCache( * Method: setWalFilter * Signature: (JJ)V */ -void Java_org_rocksdb_DBOptions_setWalFilter( - JNIEnv*, jobject, jlong jhandle, jlong jwal_filter_handle) { +void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jobject, jlong jhandle, + jlong jwal_filter_handle) { auto* opt = reinterpret_cast(jhandle); auto* wal_filter = reinterpret_cast( jwal_filter_handle); @@ -7483,8 +7490,8 @@ void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError( * Method: failIfOptionsFileError * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->fail_if_options_file_error); } @@ -7505,8 +7512,8 @@ void Java_org_rocksdb_DBOptions_setDumpMallocStats( * Method: dumpMallocStats * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_dumpMallocStats( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_dumpMallocStats(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->dump_malloc_stats); } @@ -7528,8 +7535,8 @@ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringRecovery( * Method: avoidFlushDuringRecovery * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_recovery); } @@ -7550,8 +7557,8 @@ void Java_org_rocksdb_DBOptions_setAllowIngestBehind( * Method: allowIngestBehind * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allowIngestBehind( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allowIngestBehind(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_ingest_behind); } @@ -7561,8 +7568,9 @@ jboolean Java_org_rocksdb_DBOptions_allowIngestBehind( * Method: setTwoWriteQueues * 
Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setTwoWriteQueues( - JNIEnv*, jobject, jlong jhandle, jboolean jtwo_write_queues) { +void Java_org_rocksdb_DBOptions_setTwoWriteQueues(JNIEnv*, jobject, + jlong jhandle, + jboolean jtwo_write_queues) { auto* opt = reinterpret_cast(jhandle); opt->two_write_queues = jtwo_write_queues == JNI_TRUE; } @@ -7572,8 +7580,8 @@ void Java_org_rocksdb_DBOptions_setTwoWriteQueues( * Method: twoWriteQueues * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_twoWriteQueues( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_twoWriteQueues(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->two_write_queues); } @@ -7583,8 +7591,9 @@ jboolean Java_org_rocksdb_DBOptions_twoWriteQueues( * Method: setManualWalFlush * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setManualWalFlush( - JNIEnv*, jobject, jlong jhandle, jboolean jmanual_wal_flush) { +void Java_org_rocksdb_DBOptions_setManualWalFlush(JNIEnv*, jobject, + jlong jhandle, + jboolean jmanual_wal_flush) { auto* opt = reinterpret_cast(jhandle); opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE; } @@ -7594,8 +7603,8 @@ void Java_org_rocksdb_DBOptions_setManualWalFlush( * Method: manualWalFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_manualWalFlush( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_manualWalFlush(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->manual_wal_flush); } @@ -7605,8 +7614,8 @@ jboolean Java_org_rocksdb_DBOptions_manualWalFlush( * Method: setAtomicFlush * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setAtomicFlush( - JNIEnv*, jobject, jlong jhandle, jboolean jatomic_flush) { +void Java_org_rocksdb_DBOptions_setAtomicFlush(JNIEnv*, jobject, jlong jhandle, + jboolean jatomic_flush) { auto* opt = reinterpret_cast(jhandle); opt->atomic_flush = jatomic_flush == JNI_TRUE; } @@ -7616,8 +7625,8 @@ void Java_org_rocksdb_DBOptions_setAtomicFlush( * Method: atomicFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_atomicFlush( - JNIEnv *, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_atomicFlush(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->atomic_flush); } @@ -7639,8 +7648,8 @@ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringShutdown( * Method: avoidFlushDuringShutdown * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_shutdown); } @@ -7809,8 +7818,7 @@ jlong Java_org_rocksdb_DBOptions_bgerrorResumeRetryInterval(JNIEnv*, jclass, * Method: newWriteOptions * Signature: ()J */ -jlong Java_org_rocksdb_WriteOptions_newWriteOptions( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_WriteOptions_newWriteOptions(JNIEnv*, jclass) { auto* op = new ROCKSDB_NAMESPACE::WriteOptions(); return GET_CPLUSPLUS_POINTER(op); } @@ -7820,8 +7828,8 @@ jlong Java_org_rocksdb_WriteOptions_newWriteOptions( * Method: copyWriteOptions * Signature: (J)J */ -jlong Java_org_rocksdb_WriteOptions_copyWriteOptions( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_WriteOptions_copyWriteOptions(JNIEnv*, jclass, + jlong jhandle) { auto new_opt = new ROCKSDB_NAMESPACE::WriteOptions( 
*(reinterpret_cast(jhandle))); return GET_CPLUSPLUS_POINTER(new_opt); @@ -7832,8 +7840,8 @@ jlong Java_org_rocksdb_WriteOptions_copyWriteOptions( * Method: disposeInternal * Signature: ()V */ -void Java_org_rocksdb_WriteOptions_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_WriteOptions_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* write_options = reinterpret_cast(jhandle); assert(write_options != nullptr); @@ -7845,8 +7853,8 @@ void Java_org_rocksdb_WriteOptions_disposeInternal( * Method: setSync * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setSync( - JNIEnv*, jobject, jlong jhandle, jboolean jflag) { +void Java_org_rocksdb_WriteOptions_setSync(JNIEnv*, jobject, jlong jhandle, + jboolean jflag) { reinterpret_cast(jhandle)->sync = jflag; } @@ -7855,8 +7863,7 @@ void Java_org_rocksdb_WriteOptions_setSync( * Method: sync * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_sync( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_WriteOptions_sync(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->sync; } @@ -7865,8 +7872,9 @@ jboolean Java_org_rocksdb_WriteOptions_sync( * Method: setDisableWAL * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setDisableWAL( - JNIEnv*, jobject, jlong jhandle, jboolean jflag) { +void Java_org_rocksdb_WriteOptions_setDisableWAL(JNIEnv*, jobject, + jlong jhandle, + jboolean jflag) { reinterpret_cast(jhandle)->disableWAL = jflag; } @@ -7876,8 +7884,8 @@ void Java_org_rocksdb_WriteOptions_setDisableWAL( * Method: disableWAL * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_disableWAL( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_WriteOptions_disableWAL(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->disableWAL; } @@ -7888,8 +7896,7 @@ jboolean Java_org_rocksdb_WriteOptions_disableWAL( * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setIgnoreMissingColumnFamilies( - JNIEnv*, jobject, jlong jhandle, - jboolean jignore_missing_column_families) { + JNIEnv*, jobject, jlong jhandle, jboolean jignore_missing_column_families) { reinterpret_cast(jhandle) ->ignore_missing_column_families = static_cast(jignore_missing_column_families); @@ -7911,8 +7918,9 @@ jboolean Java_org_rocksdb_WriteOptions_ignoreMissingColumnFamilies( * Method: setNoSlowdown * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setNoSlowdown( - JNIEnv*, jobject, jlong jhandle, jboolean jno_slowdown) { +void Java_org_rocksdb_WriteOptions_setNoSlowdown(JNIEnv*, jobject, + jlong jhandle, + jboolean jno_slowdown) { reinterpret_cast(jhandle)->no_slowdown = static_cast(jno_slowdown); } @@ -7922,8 +7930,8 @@ void Java_org_rocksdb_WriteOptions_setNoSlowdown( * Method: noSlowdown * Signature: (J)Z */ -jboolean Java_org_rocksdb_WriteOptions_noSlowdown( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_WriteOptions_noSlowdown(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->no_slowdown; } @@ -7933,8 +7941,8 @@ jboolean Java_org_rocksdb_WriteOptions_noSlowdown( * Method: setLowPri * Signature: (JZ)V */ -void Java_org_rocksdb_WriteOptions_setLowPri( - JNIEnv*, jobject, jlong jhandle, jboolean jlow_pri) { +void Java_org_rocksdb_WriteOptions_setLowPri(JNIEnv*, jobject, jlong jhandle, + jboolean jlow_pri) { reinterpret_cast(jhandle)->low_pri = static_cast(jlow_pri); } @@ -7944,8 +7952,7 @@ void Java_org_rocksdb_WriteOptions_setLowPri( * Method: lowPri * Signature: (J)Z */ -jboolean 
Java_org_rocksdb_WriteOptions_lowPri( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_WriteOptions_lowPri(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->low_pri; } @@ -7980,8 +7987,7 @@ void Java_org_rocksdb_WriteOptions_setMemtableInsertHintPerBatch( * Method: newReadOptions * Signature: ()J */ -jlong Java_org_rocksdb_ReadOptions_newReadOptions__( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_ReadOptions_newReadOptions__(JNIEnv*, jclass) { auto* read_options = new ROCKSDB_NAMESPACE::ReadOptions(); return GET_CPLUSPLUS_POINTER(read_options); } @@ -8003,8 +8009,8 @@ jlong Java_org_rocksdb_ReadOptions_newReadOptions__ZZ( * Method: copyReadOptions * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_copyReadOptions( - JNIEnv*, jclass, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_copyReadOptions(JNIEnv*, jclass, + jlong jhandle) { auto new_opt = new ROCKSDB_NAMESPACE::ReadOptions( *(reinterpret_cast(jhandle))); return GET_CPLUSPLUS_POINTER(new_opt); @@ -8015,8 +8021,8 @@ jlong Java_org_rocksdb_ReadOptions_copyReadOptions( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ReadOptions_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_ReadOptions_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* read_options = reinterpret_cast(jhandle); assert(read_options != nullptr); @@ -8039,8 +8045,8 @@ void Java_org_rocksdb_ReadOptions_setVerifyChecksums( * Method: verifyChecksums * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->verify_checksums; } @@ -8050,8 +8056,8 @@ jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( * Method: setFillCache * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setFillCache( - JNIEnv*, jobject, jlong jhandle, jboolean jfill_cache) { +void Java_org_rocksdb_ReadOptions_setFillCache(JNIEnv*, jobject, jlong jhandle, + jboolean jfill_cache) { reinterpret_cast(jhandle)->fill_cache = static_cast(jfill_cache); } @@ -8061,8 +8067,8 @@ void Java_org_rocksdb_ReadOptions_setFillCache( * Method: fillCache * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_fillCache( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_fillCache(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle)->fill_cache; } @@ -8071,8 +8077,8 @@ jboolean Java_org_rocksdb_ReadOptions_fillCache( * Method: setTailing * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setTailing( - JNIEnv*, jobject, jlong jhandle, jboolean jtailing) { +void Java_org_rocksdb_ReadOptions_setTailing(JNIEnv*, jobject, jlong jhandle, + jboolean jtailing) { reinterpret_cast(jhandle)->tailing = static_cast(jtailing); } @@ -8082,8 +8088,7 @@ void Java_org_rocksdb_ReadOptions_setTailing( * Method: tailing * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_tailing( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->tailing; } @@ -8092,8 +8097,7 @@ jboolean Java_org_rocksdb_ReadOptions_tailing( * Method: managed * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_managed( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->managed; } @@ -8102,8 +8106,8 @@ jboolean 
Java_org_rocksdb_ReadOptions_managed( * Method: setManaged * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setManaged( - JNIEnv*, jobject, jlong jhandle, jboolean jmanaged) { +void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jobject, jlong jhandle, + jboolean jmanaged) { reinterpret_cast(jhandle)->managed = static_cast(jmanaged); } @@ -8113,8 +8117,8 @@ void Java_org_rocksdb_ReadOptions_setManaged( * Method: totalOrderSeek * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->total_order_seek; } @@ -8135,8 +8139,8 @@ void Java_org_rocksdb_ReadOptions_setTotalOrderSeek( * Method: prefixSameAsStart * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle) ->prefix_same_as_start; } @@ -8157,8 +8161,7 @@ void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart( * Method: pinData * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_pinData( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_pinData(JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->pin_data; } @@ -8167,8 +8170,8 @@ jboolean Java_org_rocksdb_ReadOptions_pinData( * Method: setPinData * Signature: (JZ)V */ -void Java_org_rocksdb_ReadOptions_setPinData( - JNIEnv*, jobject, jlong jhandle, jboolean jpin_data) { +void Java_org_rocksdb_ReadOptions_setPinData(JNIEnv*, jobject, jlong jhandle, + jboolean jpin_data) { reinterpret_cast(jhandle)->pin_data = static_cast(jpin_data); } @@ -8202,8 +8205,8 @@ void Java_org_rocksdb_ReadOptions_setBackgroundPurgeOnIteratorCleanup( * Method: readaheadSize * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_readaheadSize( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_readaheadSize(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->readahead_size); } @@ -8213,8 +8216,9 @@ jlong Java_org_rocksdb_ReadOptions_readaheadSize( * Method: setReadaheadSize * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setReadaheadSize( - JNIEnv*, jobject, jlong jhandle, jlong jreadahead_size) { +void Java_org_rocksdb_ReadOptions_setReadaheadSize(JNIEnv*, jobject, + jlong jhandle, + jlong jreadahead_size) { auto* opt = reinterpret_cast(jhandle); opt->readahead_size = static_cast(jreadahead_size); } @@ -8224,8 +8228,8 @@ void Java_org_rocksdb_ReadOptions_setReadaheadSize( * Method: maxSkippableInternalKeys * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->max_skippable_internal_keys); } @@ -8247,8 +8251,8 @@ void Java_org_rocksdb_ReadOptions_setMaxSkippableInternalKeys( * Method: ignoreRangeDeletions * Signature: (J)Z */ -jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions(JNIEnv*, jobject, + jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->ignore_range_deletions); } @@ -8269,8 +8273,8 @@ void Java_org_rocksdb_ReadOptions_setIgnoreRangeDeletions( * 
Method: setSnapshot * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_setSnapshot( - JNIEnv*, jobject, jlong jhandle, jlong jsnapshot) { +void Java_org_rocksdb_ReadOptions_setSnapshot(JNIEnv*, jobject, jlong jhandle, + jlong jsnapshot) { reinterpret_cast(jhandle)->snapshot = reinterpret_cast(jsnapshot); } @@ -8280,8 +8284,7 @@ void Java_org_rocksdb_ReadOptions_setSnapshot( * Method: snapshot * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_snapshot( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_snapshot(JNIEnv*, jobject, jlong jhandle) { auto& snapshot = reinterpret_cast(jhandle)->snapshot; return GET_CPLUSPLUS_POINTER(snapshot); @@ -8292,8 +8295,7 @@ jlong Java_org_rocksdb_ReadOptions_snapshot( * Method: readTier * Signature: (J)B */ -jbyte Java_org_rocksdb_ReadOptions_readTier( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_ReadOptions_readTier(JNIEnv*, jobject, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->read_tier); } @@ -8303,8 +8305,8 @@ jbyte Java_org_rocksdb_ReadOptions_readTier( * Method: setReadTier * Signature: (JB)V */ -void Java_org_rocksdb_ReadOptions_setReadTier( - JNIEnv*, jobject, jlong jhandle, jbyte jread_tier) { +void Java_org_rocksdb_ReadOptions_setReadTier(JNIEnv*, jobject, jlong jhandle, + jbyte jread_tier) { reinterpret_cast(jhandle)->read_tier = static_cast(jread_tier); } @@ -8326,8 +8328,8 @@ void Java_org_rocksdb_ReadOptions_setIterateUpperBound( * Method: iterateUpperBound * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_iterateUpperBound( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_iterateUpperBound(JNIEnv*, jobject, + jlong jhandle) { auto& upper_bound_slice_handle = reinterpret_cast(jhandle) ->iterate_upper_bound; @@ -8351,8 +8353,8 @@ void Java_org_rocksdb_ReadOptions_setIterateLowerBound( * Method: iterateLowerBound * Signature: (J)J */ -jlong Java_org_rocksdb_ReadOptions_iterateLowerBound( - JNIEnv*, jobject, jlong jhandle) { +jlong Java_org_rocksdb_ReadOptions_iterateLowerBound(JNIEnv*, jobject, + jlong jhandle) { auto& lower_bound_slice_handle = reinterpret_cast(jhandle) ->iterate_lower_bound; @@ -8516,8 +8518,7 @@ void Java_org_rocksdb_ReadOptions_setValueSizeSoftLimit( * Method: newComparatorOptions * Signature: ()J */ -jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions(JNIEnv*, jclass) { auto* comparator_opt = new ROCKSDB_NAMESPACE::ComparatorJniCallbackOptions(); return GET_CPLUSPLUS_POINTER(comparator_opt); } @@ -8528,7 +8529,7 @@ jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions( * Signature: (J)B */ jbyte Java_org_rocksdb_ComparatorOptions_reusedSynchronisationType( - JNIEnv *, jobject, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* comparator_opt = reinterpret_cast( jhandle); @@ -8557,8 +8558,8 @@ void Java_org_rocksdb_ComparatorOptions_setReusedSynchronisationType( * Method: useDirectBuffer * Signature: (J)Z */ -jboolean Java_org_rocksdb_ComparatorOptions_useDirectBuffer( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_ComparatorOptions_useDirectBuffer(JNIEnv*, jobject, + jlong jhandle) { return static_cast( reinterpret_cast( jhandle) @@ -8581,8 +8582,8 @@ void Java_org_rocksdb_ComparatorOptions_setUseDirectBuffer( * Method: maxReusedBufferSize * Signature: (J)I */ -jint Java_org_rocksdb_ComparatorOptions_maxReusedBufferSize( - JNIEnv*, jobject, jlong jhandle) { +jint 
Java_org_rocksdb_ComparatorOptions_maxReusedBufferSize(JNIEnv*, jobject, + jlong jhandle) { return static_cast( reinterpret_cast( jhandle) @@ -8605,8 +8606,8 @@ void Java_org_rocksdb_ComparatorOptions_setMaxReusedBufferSize( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ComparatorOptions_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_ComparatorOptions_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* comparator_opt = reinterpret_cast( jhandle); @@ -8622,8 +8623,7 @@ void Java_org_rocksdb_ComparatorOptions_disposeInternal( * Method: newFlushOptions * Signature: ()J */ -jlong Java_org_rocksdb_FlushOptions_newFlushOptions( - JNIEnv*, jclass) { +jlong Java_org_rocksdb_FlushOptions_newFlushOptions(JNIEnv*, jclass) { auto* flush_opt = new ROCKSDB_NAMESPACE::FlushOptions(); return GET_CPLUSPLUS_POINTER(flush_opt); } @@ -8633,8 +8633,9 @@ jlong Java_org_rocksdb_FlushOptions_newFlushOptions( * Method: setWaitForFlush * Signature: (JZ)V */ -void Java_org_rocksdb_FlushOptions_setWaitForFlush( - JNIEnv*, jobject, jlong jhandle, jboolean jwait) { +void Java_org_rocksdb_FlushOptions_setWaitForFlush(JNIEnv*, jobject, + jlong jhandle, + jboolean jwait) { reinterpret_cast(jhandle)->wait = static_cast(jwait); } @@ -8644,8 +8645,8 @@ void Java_org_rocksdb_FlushOptions_setWaitForFlush( * Method: waitForFlush * Signature: (J)Z */ -jboolean Java_org_rocksdb_FlushOptions_waitForFlush( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_FlushOptions_waitForFlush(JNIEnv*, jobject, + jlong jhandle) { return reinterpret_cast(jhandle)->wait; } @@ -8666,8 +8667,8 @@ void Java_org_rocksdb_FlushOptions_setAllowWriteStall( * Method: allowWriteStall * Signature: (J)Z */ -jboolean Java_org_rocksdb_FlushOptions_allowWriteStall( - JNIEnv*, jobject, jlong jhandle) { +jboolean Java_org_rocksdb_FlushOptions_allowWriteStall(JNIEnv*, jobject, + jlong jhandle) { auto* flush_options = reinterpret_cast(jhandle); return static_cast(flush_options->allow_write_stall); @@ -8678,8 +8679,8 @@ jboolean Java_org_rocksdb_FlushOptions_allowWriteStall( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_FlushOptions_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_FlushOptions_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* flush_opt = reinterpret_cast(jhandle); assert(flush_opt != nullptr); delete flush_opt; diff --git a/java/rocksjni/options_util.cc b/java/rocksjni/options_util.cc index f529c548b92..1a5fb9bb5ac 100644 --- a/java/rocksjni/options_util.cc +++ b/java/rocksjni/options_util.cc @@ -6,14 +6,15 @@ // This file implements the "bridge" between Java and C++ and enables // calling C++ ROCKSDB_NAMESPACE::OptionsUtil methods from Java side. 
+#include "rocksdb/utilities/options_util.h" + #include + #include #include "include/org_rocksdb_OptionsUtil.h" - #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/utilities/options_util.h" #include "rocksjni/portal.h" void build_column_family_descriptor_list( diff --git a/java/rocksjni/persistent_cache.cc b/java/rocksjni/persistent_cache.cc index ff930a74e94..295d9179804 100644 --- a/java/rocksjni/persistent_cache.cc +++ b/java/rocksjni/persistent_cache.cc @@ -23,8 +23,8 @@ * Signature: (JLjava/lang/String;JJZ)J */ jlong Java_org_rocksdb_PersistentCache_newPersistentCache( - JNIEnv* env, jclass, jlong jenv_handle, jstring jpath, - jlong jsz, jlong jlogger_handle, jboolean joptimized_for_nvm) { + JNIEnv* env, jclass, jlong jenv_handle, jstring jpath, jlong jsz, + jlong jlogger_handle, jboolean joptimized_for_nvm) { auto* rocks_env = reinterpret_cast(jenv_handle); jboolean has_exception = JNI_FALSE; std::string path = @@ -51,8 +51,8 @@ jlong Java_org_rocksdb_PersistentCache_newPersistentCache( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_PersistentCache_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_PersistentCache_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* cache = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 16f6d73d348..1a72507a935 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -73,14 +73,13 @@ class JavaClass { }; // Native class template -template class RocksDBNativeClass : public JavaClass { -}; +template +class RocksDBNativeClass : public JavaClass {}; // Native class template for sub-classes of RocksMutableObject -template class NativeRocksMutableObject - : public RocksDBNativeClass { +template +class NativeRocksMutableObject : public RocksDBNativeClass { public: - /** * Gets the Java Method ID for the * RocksMutableObject#setNativeHandle(long, boolean) method @@ -92,12 +91,11 @@ template class NativeRocksMutableObject */ static jmethodID getSetNativeHandleMethod(JNIEnv* env) { static jclass jclazz = DERIVED::getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "setNativeHandle", "(JZ)V"); + static jmethodID mid = env->GetMethodID(jclazz, "setNativeHandle", "(JZ)V"); assert(mid != nullptr); return mid; } @@ -114,16 +112,16 @@ template class NativeRocksMutableObject * @return true if a Java exception is pending, false otherwise */ static bool setHandle(JNIEnv* env, jobject jobj, PTR ptr, - jboolean java_owns_handle) { + jboolean java_owns_handle) { assert(jobj != nullptr); static jmethodID mid = getSetNativeHandleMethod(env); - if(mid == nullptr) { + if (mid == nullptr) { return true; // signal exception } env->CallVoidMethod(jobj, mid, GET_CPLUSPLUS_POINTER(ptr), java_owns_handle); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { return true; // signal exception } @@ -132,7 +130,8 @@ template class NativeRocksMutableObject }; // Java Exception template -template class JavaException : public JavaClass { +template +class JavaException : public JavaClass { public: /** * Create and throw a java exception with the provided message @@ -144,16 +143,18 @@ template class JavaException : public JavaClass { */ static bool ThrowNew(JNIEnv* env, const std::string& msg) { jclass jclazz = DERIVED::getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class - std::cerr << "JavaException::ThrowNew 
- Error: unexpected exception!" << std::endl; + std::cerr << "JavaException::ThrowNew - Error: unexpected exception!" + << std::endl; return env->ExceptionCheck(); } const jint rs = env->ThrowNew(jclazz, msg.c_str()); - if(rs != JNI_OK) { + if (rs != JNI_OK) { // exception could not be thrown - std::cerr << "JavaException::ThrowNew - Fatal: could not throw exception!" << std::endl; + std::cerr << "JavaException::ThrowNew - Fatal: could not throw exception!" + << std::endl; return env->ExceptionCheck(); } @@ -162,8 +163,8 @@ template class JavaException : public JavaClass { }; // The portal class for java.lang.IllegalArgumentException -class IllegalArgumentExceptionJni : - public JavaException { +class IllegalArgumentExceptionJni + : public JavaException { public: /** * Get the Java Class java.lang.IllegalArgumentException @@ -196,9 +197,11 @@ class IllegalArgumentExceptionJni : // get the IllegalArgumentException class jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class - std::cerr << "IllegalArgumentExceptionJni::ThrowNew/class - Error: unexpected exception!" << std::endl; + std::cerr << "IllegalArgumentExceptionJni::ThrowNew/class - Error: " + "unexpected exception!" + << std::endl; return env->ExceptionCheck(); } @@ -232,13 +235,12 @@ class CodeJni : public JavaClass { */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "getValue", "()b"); + static jmethodID mid = env->GetMethodID(jclazz, "getValue", "()b"); assert(mid != nullptr); return mid; } @@ -270,13 +272,12 @@ class SubCodeJni : public JavaClass { */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "getValue", "()b"); + static jmethodID mid = env->GetMethodID(jclazz, "getValue", "()b"); assert(mid != nullptr); return mid; } @@ -335,7 +336,7 @@ class StatusJni */ static jmethodID getCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -356,13 +357,13 @@ class StatusJni */ static jmethodID getSubCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "getSubCode", "()Lorg/rocksdb/Status$SubCode;"); + static jmethodID mid = env->GetMethodID(jclazz, "getSubCode", + "()Lorg/rocksdb/Status$SubCode;"); assert(mid != nullptr); return mid; } @@ -377,7 +378,7 @@ class StatusJni */ static jmethodID getStateMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -400,14 +401,14 @@ class StatusJni */ static jobject construct(JNIEnv* env, const Status& status) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } jmethodID mid = env->GetMethodID(jclazz, "", "(BBLjava/lang/String;)V"); - if(mid == nullptr) { + if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } @@ -417,8 +418,8 @@ 
class StatusJni if (status.getState() != nullptr) { const char* const state = status.getState(); jstate = env->NewStringUTF(state); - if(env->ExceptionCheck()) { - if(jstate != nullptr) { + if (env->ExceptionCheck()) { + if (jstate != nullptr) { env->DeleteLocalRef(jstate); } return nullptr; @@ -427,16 +428,16 @@ class StatusJni jobject jstatus = env->NewObject(jclazz, mid, toJavaStatusCode(status.code()), - toJavaStatusSubCode(status.subcode()), jstate); - if(env->ExceptionCheck()) { + toJavaStatusSubCode(status.subcode()), jstate); + if (env->ExceptionCheck()) { // exception occurred - if(jstate != nullptr) { + if (jstate != nullptr) { env->DeleteLocalRef(jstate); } return nullptr; } - if(jstate != nullptr) { + if (jstate != nullptr) { env->DeleteLocalRef(jstate); } @@ -517,24 +518,24 @@ class StatusJni std::unique_ptr status; switch (jcode_value) { case 0x0: - //Ok + // Ok status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::OK())); break; case 0x1: - //NotFound + // NotFound status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::NotFound( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0x2: - //Corruption + // Corruption status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Corruption( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0x3: - //NotSupported + // NotSupported status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status( ROCKSDB_NAMESPACE::Status::NotSupported( @@ -542,7 +543,7 @@ class StatusJni jsub_code_value)))); break; case 0x4: - //InvalidArgument + // InvalidArgument status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status( ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -550,13 +551,13 @@ class StatusJni jsub_code_value)))); break; case 0x5: - //IOError + // IOError status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::IOError( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0x6: - //MergeInProgress + // MergeInProgress status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status( ROCKSDB_NAMESPACE::Status::MergeInProgress( @@ -564,13 +565,13 @@ class StatusJni jsub_code_value)))); break; case 0x7: - //Incomplete + // Incomplete status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Incomplete( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0x8: - //ShutdownInProgress + // ShutdownInProgress status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status( ROCKSDB_NAMESPACE::Status::ShutdownInProgress( @@ -578,31 +579,31 @@ class StatusJni jsub_code_value)))); break; case 0x9: - //TimedOut + // TimedOut status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::TimedOut( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0xA: - //Aborted + // Aborted status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Aborted( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0xB: - //Busy + // Busy status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Busy( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0xC: - //Expired + // Expired status = std::unique_ptr( new ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::Expired( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); break; case 0xD: - //TryAgain + // TryAgain status = std::unique_ptr( new 
ROCKSDB_NAMESPACE::Status(ROCKSDB_NAMESPACE::Status::TryAgain( ROCKSDB_NAMESPACE::SubCodeJni::toCppSubCode(jsub_code_value)))); @@ -719,8 +720,7 @@ class StatusJni }; // The portal class for org.rocksdb.RocksDBException -class RocksDBExceptionJni : - public JavaException { +class RocksDBExceptionJni : public JavaException { public: /** * Get the Java Class org.rocksdb.RocksDBException @@ -778,60 +778,71 @@ class RocksDBExceptionJni : // get the RocksDBException class jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class - std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected exception!" << std::endl; + std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected " + "exception!" + << std::endl; return env->ExceptionCheck(); } // get the constructor of org.rocksdb.RocksDBException jmethodID mid = env->GetMethodID(jclazz, "", "(Lorg/rocksdb/Status;)V"); - if(mid == nullptr) { + if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError - std::cerr << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!" << std::endl; + std::cerr + << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!" + << std::endl; return env->ExceptionCheck(); } // get the Java status object jobject jstatus = StatusJni::construct(env, s); - if(jstatus == nullptr) { + if (jstatus == nullptr) { // exception occcurred - std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: unexpected exception!" << std::endl; + std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: " + "unexpected exception!" + << std::endl; return env->ExceptionCheck(); } // construct the RocksDBException - jthrowable rocksdb_exception = reinterpret_cast(env->NewObject(jclazz, mid, jstatus)); - if(env->ExceptionCheck()) { - if(jstatus != nullptr) { + jthrowable rocksdb_exception = + reinterpret_cast(env->NewObject(jclazz, mid, jstatus)); + if (env->ExceptionCheck()) { + if (jstatus != nullptr) { env->DeleteLocalRef(jstatus); } - if(rocksdb_exception != nullptr) { + if (rocksdb_exception != nullptr) { env->DeleteLocalRef(rocksdb_exception); } - std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: unexpected exception!" << std::endl; + std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: " + "unexpected exception!" + << std::endl; return true; } // throw the RocksDBException const jint rs = env->Throw(rocksdb_exception); - if(rs != JNI_OK) { + if (rs != JNI_OK) { // exception could not be thrown - std::cerr << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!" << std::endl; - if(jstatus != nullptr) { + std::cerr + << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!" + << std::endl; + if (jstatus != nullptr) { env->DeleteLocalRef(jstatus); } - if(rocksdb_exception != nullptr) { + if (rocksdb_exception != nullptr) { env->DeleteLocalRef(rocksdb_exception); } return env->ExceptionCheck(); } - if(jstatus != nullptr) { + if (jstatus != nullptr) { env->DeleteLocalRef(jstatus); } - if(rocksdb_exception != nullptr) { + if (rocksdb_exception != nullptr) { env->DeleteLocalRef(rocksdb_exception); } @@ -858,79 +869,92 @@ class RocksDBExceptionJni : // get the RocksDBException class jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class - std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected exception!" 
<< std::endl; + std::cerr << "RocksDBExceptionJni::ThrowNew/class - Error: unexpected " + "exception!" + << std::endl; return env->ExceptionCheck(); } // get the constructor of org.rocksdb.RocksDBException - jmethodID mid = - env->GetMethodID(jclazz, "", "(Ljava/lang/String;Lorg/rocksdb/Status;)V"); - if(mid == nullptr) { + jmethodID mid = env->GetMethodID( + jclazz, "", "(Ljava/lang/String;Lorg/rocksdb/Status;)V"); + if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError - std::cerr << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!" << std::endl; + std::cerr + << "RocksDBExceptionJni::ThrowNew/cstr - Error: unexpected exception!" + << std::endl; return env->ExceptionCheck(); } jstring jmsg = env->NewStringUTF(msg.c_str()); - if(jmsg == nullptr) { + if (jmsg == nullptr) { // exception thrown: OutOfMemoryError - std::cerr << "RocksDBExceptionJni::ThrowNew/msg - Error: unexpected exception!" << std::endl; + std::cerr + << "RocksDBExceptionJni::ThrowNew/msg - Error: unexpected exception!" + << std::endl; return env->ExceptionCheck(); } // get the Java status object jobject jstatus = StatusJni::construct(env, s); - if(jstatus == nullptr) { + if (jstatus == nullptr) { // exception occcurred - std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: unexpected exception!" << std::endl; - if(jmsg != nullptr) { + std::cerr << "RocksDBExceptionJni::ThrowNew/StatusJni - Error: " + "unexpected exception!" + << std::endl; + if (jmsg != nullptr) { env->DeleteLocalRef(jmsg); } return env->ExceptionCheck(); } // construct the RocksDBException - jthrowable rocksdb_exception = reinterpret_cast(env->NewObject(jclazz, mid, jmsg, jstatus)); - if(env->ExceptionCheck()) { - if(jstatus != nullptr) { + jthrowable rocksdb_exception = reinterpret_cast( + env->NewObject(jclazz, mid, jmsg, jstatus)); + if (env->ExceptionCheck()) { + if (jstatus != nullptr) { env->DeleteLocalRef(jstatus); } - if(jmsg != nullptr) { + if (jmsg != nullptr) { env->DeleteLocalRef(jmsg); } - if(rocksdb_exception != nullptr) { + if (rocksdb_exception != nullptr) { env->DeleteLocalRef(rocksdb_exception); } - std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: unexpected exception!" << std::endl; + std::cerr << "RocksDBExceptionJni::ThrowNew/NewObject - Error: " + "unexpected exception!" + << std::endl; return true; } // throw the RocksDBException const jint rs = env->Throw(rocksdb_exception); - if(rs != JNI_OK) { + if (rs != JNI_OK) { // exception could not be thrown - std::cerr << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!" << std::endl; - if(jstatus != nullptr) { + std::cerr + << "RocksDBExceptionJni::ThrowNew - Fatal: could not throw exception!" 
+ << std::endl; + if (jstatus != nullptr) { env->DeleteLocalRef(jstatus); } - if(jmsg != nullptr) { + if (jmsg != nullptr) { env->DeleteLocalRef(jmsg); } - if(rocksdb_exception != nullptr) { + if (rocksdb_exception != nullptr) { env->DeleteLocalRef(rocksdb_exception); } return env->ExceptionCheck(); } - if(jstatus != nullptr) { + if (jstatus != nullptr) { env->DeleteLocalRef(jstatus); } - if(jmsg != nullptr) { + if (jmsg != nullptr) { env->DeleteLocalRef(jmsg); } - if(rocksdb_exception != nullptr) { + if (rocksdb_exception != nullptr) { env->DeleteLocalRef(rocksdb_exception); } @@ -947,7 +971,7 @@ class RocksDBExceptionJni : */ static jmethodID getStatusMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -960,26 +984,26 @@ class RocksDBExceptionJni : static std::unique_ptr toCppStatus( JNIEnv* env, jthrowable jrocksdb_exception) { - if(!env->IsInstanceOf(jrocksdb_exception, getJClass(env))) { + if (!env->IsInstanceOf(jrocksdb_exception, getJClass(env))) { // not an instance of RocksDBException return nullptr; } // get the java status object jmethodID mid = getStatusMethod(env); - if(mid == nullptr) { + if (mid == nullptr) { // exception occurred accessing class or method return nullptr; } jobject jstatus = env->CallObjectMethod(jrocksdb_exception, mid); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception occurred return nullptr; } - if(jstatus == nullptr) { - return nullptr; // no status available + if (jstatus == nullptr) { + return nullptr; // no status available } return ROCKSDB_NAMESPACE::StatusJni::toCppStatus(env, jstatus); @@ -1038,7 +1062,7 @@ class ListJni : public JavaClass { */ static jmethodID getIteratorMethod(JNIEnv* env) { jclass jlist_clazz = getListClass(env); - if(jlist_clazz == nullptr) { + if (jlist_clazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1059,7 +1083,7 @@ class ListJni : public JavaClass { */ static jmethodID getHasNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); - if(jiterator_clazz == nullptr) { + if (jiterator_clazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1079,7 +1103,7 @@ class ListJni : public JavaClass { */ static jmethodID getNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); - if(jiterator_clazz == nullptr) { + if (jiterator_clazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1100,7 +1124,7 @@ class ListJni : public JavaClass { */ static jmethodID getArrayListConstructorMethodId(JNIEnv* env) { jclass jarray_list_clazz = getArrayListClass(env); - if(jarray_list_clazz == nullptr) { + if (jarray_list_clazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1120,7 +1144,7 @@ class ListJni : public JavaClass { */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jlist_clazz = getListClass(env); - if(jlist_clazz == nullptr) { + if (jlist_clazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1171,7 +1195,7 @@ class ByteJni : public JavaClass { */ static jobjectArray new2dByteArray(JNIEnv* env, const jsize len) { jclass clazz = getArrayJClass(env); - if(clazz == nullptr) { + if (clazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1189,7 +1213,7 @@ class ByteJni : public JavaClass { */ static jmethodID getByteValueMethod(JNIEnv* env) { jclass clazz = getJClass(env); - if(clazz == nullptr) { + if (clazz == 
nullptr) { // exception occurred accessing class return nullptr; } @@ -1204,8 +1228,8 @@ class ByteJni : public JavaClass { * * @param env A pointer to the Java environment * - * @return A constructing Byte object or nullptr if the class or method id could not - * be retrieved, or an exception occurred + * @return A constructing Byte object or nullptr if the class or method id + * could not be retrieved, or an exception occurred */ static jobject valueOf(JNIEnv* env, jbyte jprimitive_byte) { jclass clazz = getJClass(env); @@ -1230,7 +1254,6 @@ class ByteJni : public JavaClass { return jbyte_obj; } - }; // The portal class for java.nio.ByteBuffer @@ -1260,7 +1283,7 @@ class ByteBufferJni : public JavaClass { * be retrieved */ static jmethodID getAllocateMethodId(JNIEnv* env, - jclass jbytebuffer_clazz = nullptr) { + jclass jbytebuffer_clazz = nullptr) { const jclass jclazz = jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz; if (jclazz == nullptr) { @@ -1268,8 +1291,8 @@ class ByteBufferJni : public JavaClass { return nullptr; } - static jmethodID mid = env->GetStaticMethodID( - jclazz, "allocate", "(I)Ljava/nio/ByteBuffer;"); + static jmethodID mid = + env->GetStaticMethodID(jclazz, "allocate", "(I)Ljava/nio/ByteBuffer;"); assert(mid != nullptr); return mid; } @@ -1283,10 +1306,10 @@ class ByteBufferJni : public JavaClass { * be retrieved */ static jmethodID getArrayMethodId(JNIEnv* env, - jclass jbytebuffer_clazz = nullptr) { + jclass jbytebuffer_clazz = nullptr) { const jclass jclazz = jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz; - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -1296,9 +1319,9 @@ class ByteBufferJni : public JavaClass { return mid; } - static jobject construct( - JNIEnv* env, const bool direct, const size_t capacity, - jclass jbytebuffer_clazz = nullptr) { + static jobject construct(JNIEnv* env, const bool direct, + const size_t capacity, + jclass jbytebuffer_clazz = nullptr) { return constructWith(env, direct, nullptr, capacity, jbytebuffer_clazz); } @@ -1311,7 +1334,8 @@ class ByteBufferJni : public JavaClass { buf = new char[capacity]; allocated = true; } - jobject jbuf = env->NewDirectByteBuffer(const_cast(buf), static_cast(capacity)); + jobject jbuf = env->NewDirectByteBuffer(const_cast(buf), + static_cast(capacity)); if (jbuf == nullptr) { // exception occurred if (allocated) { @@ -1322,14 +1346,16 @@ class ByteBufferJni : public JavaClass { return jbuf; } else { const jclass jclazz = - jbytebuffer_clazz == nullptr ? getJClass(env) : jbytebuffer_clazz; + jbytebuffer_clazz == nullptr ? 
getJClass(env) : jbytebuffer_clazz; if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - const jmethodID jmid_allocate = getAllocateMethodId(env, jbytebuffer_clazz); + const jmethodID jmid_allocate = + getAllocateMethodId(env, jbytebuffer_clazz); if (jmid_allocate == nullptr) { - // exception occurred accessing class, or NoSuchMethodException or OutOfMemoryError + // exception occurred accessing class, or NoSuchMethodException or + // OutOfMemoryError return nullptr; } const jobject jbuf = env->CallStaticObjectMethod( @@ -1353,9 +1379,9 @@ class ByteBufferJni : public JavaClass { env->GetPrimitiveArrayCritical(jarray, &is_copy)); if (ja == nullptr) { // exception occurred - env->DeleteLocalRef(jarray); - env->DeleteLocalRef(jbuf); - return nullptr; + env->DeleteLocalRef(jarray); + env->DeleteLocalRef(jbuf); + return nullptr; } memcpy(ja, const_cast(buf), capacity); @@ -1370,10 +1396,11 @@ class ByteBufferJni : public JavaClass { } static jbyteArray array(JNIEnv* env, const jobject& jbyte_buffer, - jclass jbytebuffer_clazz = nullptr) { + jclass jbytebuffer_clazz = nullptr) { const jmethodID mid = getArrayMethodId(env, jbytebuffer_clazz); if (mid == nullptr) { - // exception occurred accessing class, or NoSuchMethodException or OutOfMemoryError + // exception occurred accessing class, or NoSuchMethodException or + // OutOfMemoryError return nullptr; } const jobject jarray = env->CallObjectMethod(jbyte_buffer, mid); @@ -1469,7 +1496,7 @@ class LongJni : public JavaClass { // The portal class for java.lang.StringBuilder class StringBuilderJni : public JavaClass { - public: + public: /** * Get the Java Class java.lang.StringBuilder * @@ -1493,14 +1520,13 @@ class StringBuilderJni : public JavaClass { */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "append", - "(Ljava/lang/String;)Ljava/lang/StringBuilder;"); + static jmethodID mid = env->GetMethodID( + jclazz, "append", "(Ljava/lang/String;)Ljava/lang/StringBuilder;"); assert(mid != nullptr); return mid; } @@ -1516,22 +1542,22 @@ class StringBuilderJni : public JavaClass { * an exception occurs */ static jobject append(JNIEnv* env, jobject jstring_builder, - const char* c_str) { + const char* c_str) { jmethodID mid = getListAddMethodId(env); - if(mid == nullptr) { + if (mid == nullptr) { // exception occurred accessing class or method return nullptr; } jstring new_value_str = env->NewStringUTF(c_str); - if(new_value_str == nullptr) { + if (new_value_str == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } jobject jresult_string_builder = env->CallObjectMethod(jstring_builder, mid, new_value_str); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception occurred env->DeleteLocalRef(new_value_str); return nullptr; @@ -1544,846 +1570,863 @@ class StringBuilderJni : public JavaClass { // various utility functions for working with RocksDB and JNI class JniUtil { public: - /** - * Detect if jlong overflows size_t - * - * @param jvalue the jlong value - * - * @return - */ - inline static Status check_if_jlong_fits_size_t(const jlong& jvalue) { - Status s = Status::OK(); - if (static_cast(jvalue) > std::numeric_limits::max()) { - s = Status::InvalidArgument(Slice("jlong overflows 32 bit value.")); - } - return s; - } - - /** - * Obtains a reference to the JNIEnv from - * the JVM - * - * If the 
current thread is not attached to the JavaVM - * then it will be attached so as to retrieve the JNIEnv - * - * If a thread is attached, it must later be manually - * released by calling JavaVM::DetachCurrentThread. - * This can be handled by always matching calls to this - * function with calls to {@link JniUtil::releaseJniEnv(JavaVM*, jboolean)} - * - * @param jvm (IN) A pointer to the JavaVM instance - * @param attached (OUT) A pointer to a boolean which - * will be set to JNI_TRUE if we had to attach the thread - * - * @return A pointer to the JNIEnv or nullptr if a fatal error - * occurs and the JNIEnv cannot be retrieved - */ - static JNIEnv* getJniEnv(JavaVM* jvm, jboolean* attached) { - assert(jvm != nullptr); - - JNIEnv *env; - const jint env_rs = jvm->GetEnv(reinterpret_cast(&env), - JNI_VERSION_1_6); - - if(env_rs == JNI_OK) { - // current thread is already attached, return the JNIEnv - *attached = JNI_FALSE; + /** + * Detect if jlong overflows size_t + * + * @param jvalue the jlong value + * + * @return + */ + inline static Status check_if_jlong_fits_size_t(const jlong& jvalue) { + Status s = Status::OK(); + if (static_cast(jvalue) > std::numeric_limits::max()) { + s = Status::InvalidArgument(Slice("jlong overflows 32 bit value.")); + } + return s; + } + + /** + * Obtains a reference to the JNIEnv from + * the JVM + * + * If the current thread is not attached to the JavaVM + * then it will be attached so as to retrieve the JNIEnv + * + * If a thread is attached, it must later be manually + * released by calling JavaVM::DetachCurrentThread. + * This can be handled by always matching calls to this + * function with calls to {@link JniUtil::releaseJniEnv(JavaVM*, jboolean)} + * + * @param jvm (IN) A pointer to the JavaVM instance + * @param attached (OUT) A pointer to a boolean which + * will be set to JNI_TRUE if we had to attach the thread + * + * @return A pointer to the JNIEnv or nullptr if a fatal error + * occurs and the JNIEnv cannot be retrieved + */ + static JNIEnv* getJniEnv(JavaVM* jvm, jboolean* attached) { + assert(jvm != nullptr); + + JNIEnv* env; + const jint env_rs = + jvm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6); + + if (env_rs == JNI_OK) { + // current thread is already attached, return the JNIEnv + *attached = JNI_FALSE; + return env; + } else if (env_rs == JNI_EDETACHED) { + // current thread is not attached, attempt to attach + const jint rs_attach = + jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); + if (rs_attach == JNI_OK) { + *attached = JNI_TRUE; return env; - } else if(env_rs == JNI_EDETACHED) { - // current thread is not attached, attempt to attach - const jint rs_attach = jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); - if(rs_attach == JNI_OK) { - *attached = JNI_TRUE; - return env; - } else { - // error, could not attach the thread - std::cerr << "JniUtil::getJniEnv - Fatal: could not attach current thread to JVM!" << std::endl; - return nullptr; - } - } else if(env_rs == JNI_EVERSION) { - // error, JDK does not support JNI_VERSION_1_6+ - std::cerr << "JniUtil::getJniEnv - Fatal: JDK does not support JNI_VERSION_1_6" << std::endl; - return nullptr; } else { - std::cerr << "JniUtil::getJniEnv - Fatal: Unknown error: env_rs=" << env_rs << std::endl; + // error, could not attach the thread + std::cerr << "JniUtil::getJniEnv - Fatal: could not attach current " + "thread to JVM!" 
+ << std::endl; return nullptr; } + } else if (env_rs == JNI_EVERSION) { + // error, JDK does not support JNI_VERSION_1_6+ + std::cerr + << "JniUtil::getJniEnv - Fatal: JDK does not support JNI_VERSION_1_6" + << std::endl; + return nullptr; + } else { + std::cerr << "JniUtil::getJniEnv - Fatal: Unknown error: env_rs=" + << env_rs << std::endl; + return nullptr; } + } - /** - * Counterpart to {@link JniUtil::getJniEnv(JavaVM*, jboolean*)} - * - * Detachess the current thread from the JVM if it was previously - * attached - * - * @param jvm (IN) A pointer to the JavaVM instance - * @param attached (IN) JNI_TRUE if we previously had to attach the thread - * to the JavaVM to get the JNIEnv - */ - static void releaseJniEnv(JavaVM* jvm, jboolean& attached) { - assert(jvm != nullptr); - if(attached == JNI_TRUE) { - const jint rs_detach = jvm->DetachCurrentThread(); - assert(rs_detach == JNI_OK); - if(rs_detach != JNI_OK) { - std::cerr << "JniUtil::getJniEnv - Warn: Unable to detach current thread from JVM!" << std::endl; - } + /** + * Counterpart to {@link JniUtil::getJniEnv(JavaVM*, jboolean*)} + * + * Detachess the current thread from the JVM if it was previously + * attached + * + * @param jvm (IN) A pointer to the JavaVM instance + * @param attached (IN) JNI_TRUE if we previously had to attach the thread + * to the JavaVM to get the JNIEnv + */ + static void releaseJniEnv(JavaVM* jvm, jboolean& attached) { + assert(jvm != nullptr); + if (attached == JNI_TRUE) { + const jint rs_detach = jvm->DetachCurrentThread(); + assert(rs_detach == JNI_OK); + if (rs_detach != JNI_OK) { + std::cerr << "JniUtil::getJniEnv - Warn: Unable to detach current " + "thread from JVM!" + << std::endl; } } + } - /** - * Copies a Java String[] to a C++ std::vector - * - * @param env (IN) A pointer to the java environment - * @param jss (IN) The Java String array to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError or ArrayIndexOutOfBoundsException - * exception occurs - * - * @return A std::vector containing copies of the Java strings - */ - static std::vector copyStrings(JNIEnv* env, - jobjectArray jss, jboolean* has_exception) { - return ROCKSDB_NAMESPACE::JniUtil::copyStrings( - env, jss, env->GetArrayLength(jss), has_exception); - } - - /** - * Copies a Java String[] to a C++ std::vector - * - * @param env (IN) A pointer to the java environment - * @param jss (IN) The Java String array to copy - * @param jss_len (IN) The length of the Java String array to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError or ArrayIndexOutOfBoundsException - * exception occurs - * - * @return A std::vector containing copies of the Java strings - */ - static std::vector copyStrings(JNIEnv* env, - jobjectArray jss, const jsize jss_len, jboolean* has_exception) { - std::vector strs; - strs.reserve(jss_len); - for (jsize i = 0; i < jss_len; i++) { - jobject js = env->GetObjectArrayElement(jss, i); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - *has_exception = JNI_TRUE; - return strs; - } - - jstring jstr = static_cast(js); - const char* str = env->GetStringUTFChars(jstr, nullptr); - if(str == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(js); - *has_exception = JNI_TRUE; - return strs; - } - - strs.push_back(std::string(str)); + /** + * Copies a Java String[] to a C++ std::vector + * + * @param env (IN) A pointer to the java environment + * @param jss (IN) The Java String array to copy + * @param 
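// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the intended
// pairing of JniUtil::getJniEnv and JniUtil::releaseJniEnv from a native
// callback thread. The function and its parameters are hypothetical;
// assumes <jni.h> and "rocksjni/portal.h" are included.
void InvokeJavaCallback(JavaVM* jvm, jobject jcallback, jmethodID mid) {
  jboolean attached_thread = JNI_FALSE;
  JNIEnv* env = ROCKSDB_NAMESPACE::JniUtil::getJniEnv(jvm, &attached_thread);
  if (env == nullptr) {
    return;  // fatal: a JNIEnv could not be obtained
  }
  env->CallVoidMethod(jcallback, mid);
  if (env->ExceptionCheck()) {
    env->ExceptionDescribe();
    env->ExceptionClear();
  }
  // Always pair with releaseJniEnv(); it only detaches the thread when
  // attached_thread was set to JNI_TRUE by getJniEnv().
  ROCKSDB_NAMESPACE::JniUtil::releaseJniEnv(jvm, attached_thread);
}
// ---------------------------------------------------------------------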
has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError or ArrayIndexOutOfBoundsException + * exception occurs + * + * @return A std::vector containing copies of the Java strings + */ + static std::vector copyStrings(JNIEnv* env, jobjectArray jss, + jboolean* has_exception) { + return ROCKSDB_NAMESPACE::JniUtil::copyStrings( + env, jss, env->GetArrayLength(jss), has_exception); + } - env->ReleaseStringUTFChars(jstr, str); - env->DeleteLocalRef(js); + /** + * Copies a Java String[] to a C++ std::vector + * + * @param env (IN) A pointer to the java environment + * @param jss (IN) The Java String array to copy + * @param jss_len (IN) The length of the Java String array to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError or ArrayIndexOutOfBoundsException + * exception occurs + * + * @return A std::vector containing copies of the Java strings + */ + static std::vector copyStrings(JNIEnv* env, jobjectArray jss, + const jsize jss_len, + jboolean* has_exception) { + std::vector strs; + strs.reserve(jss_len); + for (jsize i = 0; i < jss_len; i++) { + jobject js = env->GetObjectArrayElement(jss, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; + return strs; } - *has_exception = JNI_FALSE; - return strs; - } - - /** - * Copies a jstring to a C-style null-terminated byte string - * and releases the original jstring - * - * The jstring is copied as UTF-8 - * - * If an exception occurs, then JNIEnv::ExceptionCheck() - * will have been called - * - * @param env (IN) A pointer to the java environment - * @param js (IN) The java string to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError exception occurs - * - * @return A pointer to the copied string, or a - * nullptr if has_exception == JNI_TRUE - */ - static std::unique_ptr copyString(JNIEnv* env, jstring js, - jboolean* has_exception) { - const char *utf = env->GetStringUTFChars(js, nullptr); - if(utf == nullptr) { + jstring jstr = static_cast(js); + const char* str = env->GetStringUTFChars(jstr, nullptr); + if (str == nullptr) { // exception thrown: OutOfMemoryError - env->ExceptionCheck(); - *has_exception = JNI_TRUE; - return nullptr; - } else if(env->ExceptionCheck()) { - // exception thrown - env->ReleaseStringUTFChars(js, utf); + env->DeleteLocalRef(js); *has_exception = JNI_TRUE; - return nullptr; + return strs; } - const jsize utf_len = env->GetStringUTFLength(js); - std::unique_ptr str(new char[utf_len + 1]); // Note: + 1 is needed for the c_str null terminator - std::strcpy(str.get(), utf); + strs.push_back(std::string(str)); + + env->ReleaseStringUTFChars(jstr, str); + env->DeleteLocalRef(js); + } + + *has_exception = JNI_FALSE; + return strs; + } + + /** + * Copies a jstring to a C-style null-terminated byte string + * and releases the original jstring + * + * The jstring is copied as UTF-8 + * + * If an exception occurs, then JNIEnv::ExceptionCheck() + * will have been called + * + * @param env (IN) A pointer to the java environment + * @param js (IN) The java string to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + * + * @return A pointer to the copied string, or a + * nullptr if has_exception == JNI_TRUE + */ + static std::unique_ptr copyString(JNIEnv* env, jstring js, + jboolean* has_exception) { + const char* utf = env->GetStringUTFChars(js, nullptr); + if (utf == nullptr) { + // exception thrown: OutOfMemoryError + 
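// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the caller
// shape for copyStrings(): copy first, then consult has_exception before
// touching the result. ExampleCopyPaths is a hypothetical name; same
// includes as the sketch above.
static void ExampleCopyPaths(JNIEnv* env, jobjectArray jpaths) {
  jboolean has_exception = JNI_FALSE;
  std::vector<std::string> paths =
      ROCKSDB_NAMESPACE::JniUtil::copyStrings(env, jpaths, &has_exception);
  if (has_exception == JNI_TRUE) {
    // an ArrayIndexOutOfBoundsException or OutOfMemoryError is already
    // pending on the Java side; return and let it propagate
    return;
  }
  // `paths` is safe to use from here on
}
// ---------------------------------------------------------------------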
env->ExceptionCheck(); + *has_exception = JNI_TRUE; + return nullptr; + } else if (env->ExceptionCheck()) { + // exception thrown env->ReleaseStringUTFChars(js, utf); - *has_exception = JNI_FALSE; - return str; - } - - /** - * Copies a jstring to a std::string - * and releases the original jstring - * - * If an exception occurs, then JNIEnv::ExceptionCheck() - * will have been called - * - * @param env (IN) A pointer to the java environment - * @param js (IN) The java string to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError exception occurs - * - * @return A std:string copy of the jstring, or an - * empty std::string if has_exception == JNI_TRUE - */ - static std::string copyStdString(JNIEnv* env, jstring js, - jboolean* has_exception) { - const char *utf = env->GetStringUTFChars(js, nullptr); - if(utf == nullptr) { - // exception thrown: OutOfMemoryError - env->ExceptionCheck(); - *has_exception = JNI_TRUE; - return std::string(); - } else if(env->ExceptionCheck()) { - // exception thrown - env->ReleaseStringUTFChars(js, utf); - *has_exception = JNI_TRUE; - return std::string(); - } + *has_exception = JNI_TRUE; + return nullptr; + } + + const jsize utf_len = env->GetStringUTFLength(js); + std::unique_ptr str( + new char[utf_len + + 1]); // Note: + 1 is needed for the c_str null terminator + std::strcpy(str.get(), utf); + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_FALSE; + return str; + } - std::string name(utf); + /** + * Copies a jstring to a std::string + * and releases the original jstring + * + * If an exception occurs, then JNIEnv::ExceptionCheck() + * will have been called + * + * @param env (IN) A pointer to the java environment + * @param js (IN) The java string to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + * + * @return A std:string copy of the jstring, or an + * empty std::string if has_exception == JNI_TRUE + */ + static std::string copyStdString(JNIEnv* env, jstring js, + jboolean* has_exception) { + const char* utf = env->GetStringUTFChars(js, nullptr); + if (utf == nullptr) { + // exception thrown: OutOfMemoryError + env->ExceptionCheck(); + *has_exception = JNI_TRUE; + return std::string(); + } else if (env->ExceptionCheck()) { + // exception thrown env->ReleaseStringUTFChars(js, utf); - *has_exception = JNI_FALSE; - return name; - } - - /** - * Copies bytes from a std::string to a jByteArray - * - * @param env A pointer to the java environment - * @param bytes The bytes to copy - * - * @return the Java byte[], or nullptr if an exception occurs - * - * @throws RocksDBException thrown - * if memory size to copy exceeds general java specific array size limitation. 
- */ - static jbyteArray copyBytes(JNIEnv* env, std::string bytes) { - return createJavaByteArrayWithSizeCheck(env, bytes.c_str(), bytes.size()); - } - - /** - * Given a Java byte[][] which is an array of java.lang.Strings - * where each String is a byte[], the passed function `string_fn` - * will be called on each String, the result is the collected by - * calling the passed function `collector_fn` - * - * @param env (IN) A pointer to the java environment - * @param jbyte_strings (IN) A Java array of Strings expressed as bytes - * @param string_fn (IN) A transform function to call for each String - * @param collector_fn (IN) A collector which is called for the result - * of each `string_fn` - * @param has_exception (OUT) will be set to JNI_TRUE - * if an ArrayIndexOutOfBoundsException or OutOfMemoryError - * exception occurs - */ - template static void byteStrings(JNIEnv* env, - jobjectArray jbyte_strings, - std::function string_fn, - std::function collector_fn, - jboolean *has_exception) { - const jsize jlen = env->GetArrayLength(jbyte_strings); - - for(jsize i = 0; i < jlen; i++) { - jobject jbyte_string_obj = env->GetObjectArrayElement(jbyte_strings, i); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - *has_exception = JNI_TRUE; // signal error - return; - } + *has_exception = JNI_TRUE; + return std::string(); + } - jbyteArray jbyte_string_ary = - reinterpret_cast(jbyte_string_obj); - T result = byteString(env, jbyte_string_ary, string_fn, has_exception); + std::string name(utf); + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_FALSE; + return name; + } - env->DeleteLocalRef(jbyte_string_obj); + /** + * Copies bytes from a std::string to a jByteArray + * + * @param env A pointer to the java environment + * @param bytes The bytes to copy + * + * @return the Java byte[], or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java specific array size + * limitation. 
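// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): copyStdString()
// returns an empty string on failure, so the flag, not the return value,
// is the error signal (an empty Java string is a valid input).
// ExampleReadName is a hypothetical caller.
static bool ExampleReadName(JNIEnv* env, jstring jname, std::string* out) {
  jboolean has_exception = JNI_FALSE;
  std::string name =
      ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception);
  if (has_exception == JNI_TRUE) {
    return false;  // a Java exception is pending
  }
  *out = std::move(name);
  return true;
}
// ---------------------------------------------------------------------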
+ */ + static jbyteArray copyBytes(JNIEnv* env, std::string bytes) { + return createJavaByteArrayWithSizeCheck(env, bytes.c_str(), bytes.size()); + } - if(*has_exception == JNI_TRUE) { - // exception thrown: OutOfMemoryError - return; - } + /** + * Given a Java byte[][] which is an array of java.lang.Strings + * where each String is a byte[], the passed function `string_fn` + * will be called on each String, the result is the collected by + * calling the passed function `collector_fn` + * + * @param env (IN) A pointer to the java environment + * @param jbyte_strings (IN) A Java array of Strings expressed as bytes + * @param string_fn (IN) A transform function to call for each String + * @param collector_fn (IN) A collector which is called for the result + * of each `string_fn` + * @param has_exception (OUT) will be set to JNI_TRUE + * if an ArrayIndexOutOfBoundsException or OutOfMemoryError + * exception occurs + */ + template + static void byteStrings(JNIEnv* env, jobjectArray jbyte_strings, + std::function string_fn, + std::function collector_fn, + jboolean* has_exception) { + const jsize jlen = env->GetArrayLength(jbyte_strings); - collector_fn(i, result); + for (jsize i = 0; i < jlen; i++) { + jobject jbyte_string_obj = env->GetObjectArrayElement(jbyte_strings, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; // signal error + return; } - *has_exception = JNI_FALSE; - } - - /** - * Given a Java String which is expressed as a Java Byte Array byte[], - * the passed function `string_fn` will be called on the String - * and the result returned - * - * @param env (IN) A pointer to the java environment - * @param jbyte_string_ary (IN) A Java String expressed in bytes - * @param string_fn (IN) A transform function to call on the String - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError exception occurs - */ - template static T byteString(JNIEnv* env, - jbyteArray jbyte_string_ary, - std::function string_fn, - jboolean* has_exception) { - const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary); - return byteString(env, jbyte_string_ary, jbyte_string_len, string_fn, - has_exception); - } - - /** - * Given a Java String which is expressed as a Java Byte Array byte[], - * the passed function `string_fn` will be called on the String - * and the result returned - * - * @param env (IN) A pointer to the java environment - * @param jbyte_string_ary (IN) A Java String expressed in bytes - * @param jbyte_string_len (IN) The length of the Java String - * expressed in bytes - * @param string_fn (IN) A transform function to call on the String - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError exception occurs - */ - template static T byteString(JNIEnv* env, - jbyteArray jbyte_string_ary, const jsize jbyte_string_len, - std::function string_fn, - jboolean* has_exception) { - jbyte* jbyte_string = - env->GetByteArrayElements(jbyte_string_ary, nullptr); - if(jbyte_string == nullptr) { + jbyteArray jbyte_string_ary = + reinterpret_cast(jbyte_string_obj); + T result = byteString(env, jbyte_string_ary, string_fn, has_exception); + + env->DeleteLocalRef(jbyte_string_obj); + + if (*has_exception == JNI_TRUE) { // exception thrown: OutOfMemoryError - *has_exception = JNI_TRUE; - return nullptr; // signal error + return; } - T result = - string_fn(reinterpret_cast(jbyte_string), jbyte_string_len); + collector_fn(i, result); + } - 
env->ReleaseByteArrayElements(jbyte_string_ary, jbyte_string, JNI_ABORT); + *has_exception = JNI_FALSE; + } + + /** + * Given a Java String which is expressed as a Java Byte Array byte[], + * the passed function `string_fn` will be called on the String + * and the result returned + * + * @param env (IN) A pointer to the java environment + * @param jbyte_string_ary (IN) A Java String expressed in bytes + * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template + static T byteString(JNIEnv* env, jbyteArray jbyte_string_ary, + std::function string_fn, + jboolean* has_exception) { + const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary); + return byteString(env, jbyte_string_ary, jbyte_string_len, string_fn, + has_exception); + } - *has_exception = JNI_FALSE; - return result; + /** + * Given a Java String which is expressed as a Java Byte Array byte[], + * the passed function `string_fn` will be called on the String + * and the result returned + * + * @param env (IN) A pointer to the java environment + * @param jbyte_string_ary (IN) A Java String expressed in bytes + * @param jbyte_string_len (IN) The length of the Java String + * expressed in bytes + * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template + static T byteString(JNIEnv* env, jbyteArray jbyte_string_ary, + const jsize jbyte_string_len, + std::function string_fn, + jboolean* has_exception) { + jbyte* jbyte_string = env->GetByteArrayElements(jbyte_string_ary, nullptr); + if (jbyte_string == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return nullptr; // signal error } - /** - * Converts a std::vector to a Java byte[][] where each Java String - * is expressed as a Java Byte Array byte[]. - * - * @param env A pointer to the java environment - * @param strings A vector of Strings - * - * @return A Java array of Strings expressed as bytes, - * or nullptr if an exception is thrown - */ - static jobjectArray stringsBytes(JNIEnv* env, std::vector strings) { - jclass jcls_ba = ByteJni::getArrayJClass(env); - if(jcls_ba == nullptr) { - // exception occurred - return nullptr; - } + T result = + string_fn(reinterpret_cast(jbyte_string), jbyte_string_len); - const jsize len = static_cast(strings.size()); + env->ReleaseByteArrayElements(jbyte_string_ary, jbyte_string, JNI_ABORT); - jobjectArray jbyte_strings = env->NewObjectArray(len, jcls_ba, nullptr); - if(jbyte_strings == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } + *has_exception = JNI_FALSE; + return result; + } - for (jsize i = 0; i < len; i++) { - std::string *str = &strings[i]; - const jsize str_len = static_cast(str->size()); + /** + * Converts a std::vector to a Java byte[][] where each Java String + * is expressed as a Java Byte Array byte[]. 
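// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): because
// byteString<T> signals failure by returning nullptr, T is chosen to be a
// pointer type here and the collector takes ownership. The caller and its
// use of std::unique_ptr are an assumption, not the library's convention.
static void ExampleCollectByteStrings(
    JNIEnv* env, jobjectArray jbyte_strings,
    std::vector<std::unique_ptr<std::string>>* out) {
  jboolean has_exception = JNI_FALSE;
  ROCKSDB_NAMESPACE::JniUtil::byteStrings<std::string*>(
      env, jbyte_strings,
      [](const char* str, const size_t len) {
        return new std::string(str, len);  // string_fn: materialize bytes
      },
      [out](const size_t /*idx*/, std::string* str) {
        out->emplace_back(str);  // collector_fn: take ownership
      },
      &has_exception);
  if (has_exception == JNI_TRUE) {
    out->clear();  // exception pending: discard (and free) partial results
  }
}
// ---------------------------------------------------------------------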
+ * + * @param env A pointer to the java environment + * @param strings A vector of Strings + * + * @return A Java array of Strings expressed as bytes, + * or nullptr if an exception is thrown + */ + static jobjectArray stringsBytes(JNIEnv* env, + std::vector strings) { + jclass jcls_ba = ByteJni::getArrayJClass(env); + if (jcls_ba == nullptr) { + // exception occurred + return nullptr; + } - jbyteArray jbyte_string_ary = env->NewByteArray(str_len); - if(jbyte_string_ary == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(jbyte_strings); - return nullptr; - } + const jsize len = static_cast(strings.size()); - env->SetByteArrayRegion( - jbyte_string_ary, 0, str_len, - const_cast(reinterpret_cast(str->c_str()))); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jbyte_string_ary); - env->DeleteLocalRef(jbyte_strings); - return nullptr; - } + jobjectArray jbyte_strings = env->NewObjectArray(len, jcls_ba, nullptr); + if (jbyte_strings == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } - env->SetObjectArrayElement(jbyte_strings, i, jbyte_string_ary); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - // or ArrayStoreException - env->DeleteLocalRef(jbyte_string_ary); - env->DeleteLocalRef(jbyte_strings); - return nullptr; - } + for (jsize i = 0; i < len; i++) { + std::string* str = &strings[i]; + const jsize str_len = static_cast(str->size()); - env->DeleteLocalRef(jbyte_string_ary); + jbyteArray jbyte_string_ary = env->NewByteArray(str_len); + if (jbyte_string_ary == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jbyte_strings); + return nullptr; } - return jbyte_strings; - } - - /** - * Converts a std::vector to a Java String[]. 
- * - * @param env A pointer to the java environment - * @param strings A vector of Strings - * - * @return A Java array of Strings, - * or nullptr if an exception is thrown - */ - static jobjectArray toJavaStrings(JNIEnv* env, - const std::vector* strings) { - jclass jcls_str = env->FindClass("java/lang/String"); - if(jcls_str == nullptr) { - // exception occurred + env->SetByteArrayRegion( + jbyte_string_ary, 0, str_len, + const_cast(reinterpret_cast(str->c_str()))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jbyte_string_ary); + env->DeleteLocalRef(jbyte_strings); return nullptr; } - const jsize len = static_cast(strings->size()); - - jobjectArray jstrings = env->NewObjectArray(len, jcls_str, nullptr); - if(jstrings == nullptr) { - // exception thrown: OutOfMemoryError + env->SetObjectArrayElement(jbyte_strings, i, jbyte_string_ary); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + // or ArrayStoreException + env->DeleteLocalRef(jbyte_string_ary); + env->DeleteLocalRef(jbyte_strings); return nullptr; } - for (jsize i = 0; i < len; i++) { - const std::string *str = &((*strings)[i]); - jstring js = ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, str); - if (js == nullptr) { - env->DeleteLocalRef(jstrings); - return nullptr; - } + env->DeleteLocalRef(jbyte_string_ary); + } - env->SetObjectArrayElement(jstrings, i, js); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - // or ArrayStoreException - env->DeleteLocalRef(js); - env->DeleteLocalRef(jstrings); - return nullptr; - } - } + return jbyte_strings; + } - return jstrings; - } - - /** - * Creates a Java UTF String from a C++ std::string - * - * @param env A pointer to the java environment - * @param string the C++ std::string - * @param treat_empty_as_null true if empty strings should be treated as null - * - * @return the Java UTF string, or nullptr if the provided string - * is null (or empty and treat_empty_as_null is set), or if an - * exception occurs allocating the Java String. - */ - static jstring toJavaString(JNIEnv* env, const std::string* string, - const bool treat_empty_as_null = false) { - if (string == nullptr) { - return nullptr; - } + /** + * Converts a std::vector to a Java String[]. + * + * @param env A pointer to the java environment + * @param strings A vector of Strings + * + * @return A Java array of Strings, + * or nullptr if an exception is thrown + */ + static jobjectArray toJavaStrings(JNIEnv* env, + const std::vector* strings) { + jclass jcls_str = env->FindClass("java/lang/String"); + if (jcls_str == nullptr) { + // exception occurred + return nullptr; + } - if (treat_empty_as_null && string->empty()) { - return nullptr; - } + const jsize len = static_cast(strings->size()); - return env->NewStringUTF(string->c_str()); - } - - /** - * Copies bytes to a new jByteArray with the check of java array size limitation. - * - * @param bytes pointer to memory to copy to a new jByteArray - * @param size number of bytes to copy - * - * @return the Java byte[], or nullptr if an exception occurs - * - * @throws RocksDBException thrown - * if memory size to copy exceeds general java array size limitation to avoid overflow. 
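// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): returning a
// byte[][] to Java with stringsBytes(); a nullptr result means a Java
// exception is already pending, so it can be returned as-is from a native
// method. The sample data is hypothetical.
static jobjectArray ExampleNamesToJava(JNIEnv* env) {
  std::vector<std::string> names{"default", "meta"};
  return ROCKSDB_NAMESPACE::JniUtil::stringsBytes(env, names);
}
// ---------------------------------------------------------------------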
- */ - static jbyteArray createJavaByteArrayWithSizeCheck(JNIEnv* env, const char* bytes, const size_t size) { - // Limitation for java array size is vm specific - // In general it cannot exceed Integer.MAX_VALUE (2^31 - 1) - // Current HotSpot VM limitation for array size is Integer.MAX_VALUE - 5 (2^31 - 1 - 5) - // It means that the next call to env->NewByteArray can still end with - // OutOfMemoryError("Requested array size exceeds VM limit") coming from VM - static const size_t MAX_JARRAY_SIZE = (static_cast(1)) << 31; - if(size > MAX_JARRAY_SIZE) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Requested array size exceeds VM limit"); - return nullptr; - } + jobjectArray jstrings = env->NewObjectArray(len, jcls_str, nullptr); + if (jstrings == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } - const jsize jlen = static_cast(size); - jbyteArray jbytes = env->NewByteArray(jlen); - if(jbytes == nullptr) { - // exception thrown: OutOfMemoryError + for (jsize i = 0; i < len; i++) { + const std::string* str = &((*strings)[i]); + jstring js = ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, str); + if (js == nullptr) { + env->DeleteLocalRef(jstrings); return nullptr; } - env->SetByteArrayRegion(jbytes, 0, jlen, - const_cast(reinterpret_cast(bytes))); - if(env->ExceptionCheck()) { + env->SetObjectArrayElement(jstrings, i, js); + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jbytes); + // or ArrayStoreException + env->DeleteLocalRef(js); + env->DeleteLocalRef(jstrings); return nullptr; } + } - return jbytes; - } - - /** - * Copies bytes from a ROCKSDB_NAMESPACE::Slice to a jByteArray - * - * @param env A pointer to the java environment - * @param bytes The bytes to copy - * - * @return the Java byte[] or nullptr if an exception occurs - * - * @throws RocksDBException thrown - * if memory size to copy exceeds general java specific array size - * limitation. - */ - static jbyteArray copyBytes(JNIEnv* env, const Slice& bytes) { - return createJavaByteArrayWithSizeCheck(env, bytes.data(), bytes.size()); - } - - /* - * Helper for operations on a key and value - * for example WriteBatch->Put - * - * TODO(AR) could be used for RocksDB->Put etc. - */ - static std::unique_ptr kv_op( - std::function - op, - JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return nullptr; - } + return jstrings; + } - jbyte* value = env->GetByteArrayElements(jvalue, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } - return nullptr; - } + /** + * Creates a Java UTF String from a C++ std::string + * + * @param env A pointer to the java environment + * @param string the C++ std::string + * @param treat_empty_as_null true if empty strings should be treated as null + * + * @return the Java UTF string, or nullptr if the provided string + * is null (or empty and treat_empty_as_null is set), or if an + * exception occurs allocating the Java String. 
+ */ + static jstring toJavaString(JNIEnv* env, const std::string* string, + const bool treat_empty_as_null = false) { + if (string == nullptr) { + return nullptr; + } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), - jkey_len); - ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), - jvalue_len); + if (treat_empty_as_null && string->empty()) { + return nullptr; + } - auto status = op(key_slice, value_slice); + return env->NewStringUTF(string->c_str()); + } - if(value != nullptr) { - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); - } - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } + /** + * Copies bytes to a new jByteArray with the check of java array size + * limitation. + * + * @param bytes pointer to memory to copy to a new jByteArray + * @param size number of bytes to copy + * + * @return the Java byte[], or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java array size limitation to + * avoid overflow. + */ + static jbyteArray createJavaByteArrayWithSizeCheck(JNIEnv* env, + const char* bytes, + const size_t size) { + // Limitation for java array size is vm specific + // In general it cannot exceed Integer.MAX_VALUE (2^31 - 1) + // Current HotSpot VM limitation for array size is Integer.MAX_VALUE - 5 + // (2^31 - 1 - 5) It means that the next call to env->NewByteArray can still + // end with OutOfMemoryError("Requested array size exceeds VM limit") coming + // from VM + static const size_t MAX_JARRAY_SIZE = (static_cast(1)) << 31; + if (size > MAX_JARRAY_SIZE) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Requested array size exceeds VM limit"); + return nullptr; + } - return std::unique_ptr( - new ROCKSDB_NAMESPACE::Status(status)); + const jsize jlen = static_cast(size); + jbyteArray jbytes = env->NewByteArray(jlen); + if (jbytes == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; } - /* - * Helper for operations on a key - * for example WriteBatch->Delete - * - * TODO(AR) could be used for RocksDB->Delete etc. - */ - static std::unique_ptr k_op( - std::function op, - JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if (env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return nullptr; - } + env->SetByteArrayRegion( + jbytes, 0, jlen, + const_cast(reinterpret_cast(bytes))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jbytes); + return nullptr; + } - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), - jkey_len); + return jbytes; + } - auto status = op(key_slice); + /** + * Copies bytes from a ROCKSDB_NAMESPACE::Slice to a jByteArray + * + * @param env A pointer to the java environment + * @param bytes The bytes to copy + * + * @return the Java byte[] or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java specific array size + * limitation. + */ + static jbyteArray copyBytes(JNIEnv* env, const Slice& bytes) { + return createJavaByteArrayWithSizeCheck(env, bytes.data(), bytes.size()); + } + + /* + * Helper for operations on a key and value + * for example WriteBatch->Put + * + * TODO(AR) could be used for RocksDB->Put etc. 
+ */ + static std::unique_ptr kv_op( + std::function + op, + JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len, + jbyteArray jvalue, jint jvalue_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + return nullptr; + } + jbyte* value = env->GetByteArrayElements(jvalue, nullptr); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError if (key != nullptr) { env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); } + return nullptr; + } - return std::unique_ptr( - new ROCKSDB_NAMESPACE::Status(status)); - } - - /* - * Helper for operations on a key which is a region of an array - * Used to extract the common code from seek/seekForPrev. - * Possible that it can be generalised from that. - * - * We use GetByteArrayRegion to copy the key region of the whole array into - * a char[] We suspect this is not much slower than GetByteArrayElements, - * which probably copies anyway. - */ - static void k_op_region(std::function op, - JNIEnv* env, jbyteArray jkey, jint jkey_off, - jint jkey_len) { - const std::unique_ptr key(new char[jkey_len]); - if (key == nullptr) { - jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError"); - env->ThrowNew(oom_class, - "Memory allocation failed in RocksDB JNI function"); - return; - } - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, - reinterpret_cast(key.get())); - if (env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return; - } + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); + ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast(value), + jvalue_len); - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key.get()), - jkey_len); - op(key_slice); - } - - /* - * Helper for operations on a value - * for example WriteBatchWithIndex->GetFromBatch - */ - static jbyteArray v_op(std::function - op, - JNIEnv* env, jbyteArray jkey, jint jkey_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return nullptr; - } + auto status = op(key_slice, value_slice); - ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), - jkey_len); + if (value != nullptr) { + env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + } + if (key != nullptr) { + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } - std::string value; - ROCKSDB_NAMESPACE::Status s = op(key_slice, &value); + return std::unique_ptr( + new ROCKSDB_NAMESPACE::Status(status)); + } - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } + /* + * Helper for operations on a key + * for example WriteBatch->Delete + * + * TODO(AR) could be used for RocksDB->Delete etc. 
+ */ + static std::unique_ptr k_op( + std::function op, + JNIEnv* env, jobject /*jobj*/, jbyteArray jkey, jint jkey_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + return nullptr; + } - if (s.IsNotFound()) { - return nullptr; - } + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - if (s.ok()) { - jbyteArray jret_value = - env->NewByteArray(static_cast(value.size())); - if(jret_value == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } + auto status = op(key_slice); - env->SetByteArrayRegion(jret_value, 0, static_cast(value.size()), - const_cast(reinterpret_cast(value.c_str()))); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - if(jret_value != nullptr) { - env->DeleteLocalRef(jret_value); - } - return nullptr; - } + if (key != nullptr) { + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } - return jret_value; - } + return std::unique_ptr( + new ROCKSDB_NAMESPACE::Status(status)); + } - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; - } - - /** - * Creates a vector of C++ pointers from - * a Java array of C++ pointer addresses. - * - * @param env (IN) A pointer to the java environment - * @param pointers (IN) A Java array of C++ pointer addresses - * @param has_exception (OUT) will be set to JNI_TRUE - * if an ArrayIndexOutOfBoundsException or OutOfMemoryError - * exception occurs. - * - * @return A vector of C++ pointers. - */ - template static std::vector fromJPointers( - JNIEnv* env, jlongArray jptrs, jboolean *has_exception) { - const jsize jptrs_len = env->GetArrayLength(jptrs); - std::vector ptrs; - jlong* jptr = env->GetLongArrayElements(jptrs, nullptr); - if (jptr == nullptr) { - // exception thrown: OutOfMemoryError - *has_exception = JNI_TRUE; - return ptrs; - } - ptrs.reserve(jptrs_len); - for (jsize i = 0; i < jptrs_len; i++) { - ptrs.push_back(reinterpret_cast(jptr[i])); - } - env->ReleaseLongArrayElements(jptrs, jptr, JNI_ABORT); - return ptrs; + /* + * Helper for operations on a key which is a region of an array + * Used to extract the common code from seek/seekForPrev. + * Possible that it can be generalised from that. + * + * We use GetByteArrayRegion to copy the key region of the whole array into + * a char[] We suspect this is not much slower than GetByteArrayElements, + * which probably copies anyway. + */ + static void k_op_region(std::function op, + JNIEnv* env, jbyteArray jkey, jint jkey_off, + jint jkey_len) { + const std::unique_ptr key(new char[jkey_len]); + if (key == nullptr) { + jclass oom_class = env->FindClass("/lang/java/OutOfMemoryError"); + env->ThrowNew(oom_class, + "Memory allocation failed in RocksDB JNI function"); + return; } + env->GetByteArrayRegion(jkey, jkey_off, jkey_len, + reinterpret_cast(key.get())); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + return; + } + + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key.get()), + jkey_len); + op(key_slice); + } - /** - * Creates a Java array of C++ pointer addresses - * from a vector of C++ pointers. - * - * @param env (IN) A pointer to the java environment - * @param pointers (IN) A vector of C++ pointers - * @param has_exception (OUT) will be set to JNI_TRUE - * if an ArrayIndexOutOfBoundsException or OutOfMemoryError - * exception occurs - * - * @return Java array of C++ pointer addresses. 
- */ - template static jlongArray toJPointers(JNIEnv* env, - const std::vector &pointers, - jboolean *has_exception) { - const jsize len = static_cast(pointers.size()); - std::unique_ptr results(new jlong[len]); - std::transform( - pointers.begin(), pointers.end(), results.get(), - [](T* pointer) -> jlong { return GET_CPLUSPLUS_POINTER(pointer); }); - - jlongArray jpointers = env->NewLongArray(len); - if (jpointers == nullptr) { + /* + * Helper for operations on a value + * for example WriteBatchWithIndex->GetFromBatch + */ + static jbyteArray v_op(std::function + op, + JNIEnv* env, jbyteArray jkey, jint jkey_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); + + std::string value; + ROCKSDB_NAMESPACE::Status s = op(key_slice, &value); + + if (key != nullptr) { + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } + + if (s.IsNotFound()) { + return nullptr; + } + + if (s.ok()) { + jbyteArray jret_value = + env->NewByteArray(static_cast(value.size())); + if (jret_value == nullptr) { // exception thrown: OutOfMemoryError - *has_exception = JNI_TRUE; return nullptr; } - env->SetLongArrayRegion(jpointers, 0, len, results.get()); + env->SetByteArrayRegion( + jret_value, 0, static_cast(value.size()), + const_cast(reinterpret_cast(value.c_str()))); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - *has_exception = JNI_TRUE; - env->DeleteLocalRef(jpointers); + if (jret_value != nullptr) { + env->DeleteLocalRef(jret_value); + } return nullptr; } - *has_exception = JNI_FALSE; - - return jpointers; - } - - /* - * Helper for operations on a key and value - * for example WriteBatch->Put - * - * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status - * from `op` and used for RocksDB->Put etc. - */ - static void kv_op_direct(std::function - op, - JNIEnv* env, jobject jkey, jint jkey_off, - jint jkey_len, jobject jval, jint jval_off, - jint jval_len) { - char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); - if (key == nullptr || - env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Invalid key argument"); - return; - } + return jret_value; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + + /** + * Creates a vector of C++ pointers from + * a Java array of C++ pointer addresses. + * + * @param env (IN) A pointer to the java environment + * @param pointers (IN) A Java array of C++ pointer addresses + * @param has_exception (OUT) will be set to JNI_TRUE + * if an ArrayIndexOutOfBoundsException or OutOfMemoryError + * exception occurs. + * + * @return A vector of C++ pointers. + */ + template + static std::vector fromJPointers(JNIEnv* env, jlongArray jptrs, + jboolean* has_exception) { + const jsize jptrs_len = env->GetArrayLength(jptrs); + std::vector ptrs; + jlong* jptr = env->GetLongArrayElements(jptrs, nullptr); + if (jptr == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return ptrs; + } + ptrs.reserve(jptrs_len); + for (jsize i = 0; i < jptrs_len; i++) { + ptrs.push_back(reinterpret_cast(jptr[i])); + } + env->ReleaseLongArrayElements(jptrs, jptr, JNI_ABORT); + return ptrs; + } + + /** + * Creates a Java array of C++ pointer addresses + * from a vector of C++ pointers. 
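// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): v_op() on a
// read path, following the WriteBatchWithIndex->GetFromBatch example
// named in the comment above; `wbwi` and `dbopt` are assumed valid.
static jbyteArray ExampleGetFromBatch(
    JNIEnv* env, ROCKSDB_NAMESPACE::WriteBatchWithIndex* wbwi,
    const ROCKSDB_NAMESPACE::DBOptions& dbopt, jbyteArray jkey,
    jint jkey_len) {
  auto getter = [&wbwi, &dbopt](ROCKSDB_NAMESPACE::Slice key,
                                std::string* value) {
    return wbwi->GetFromBatch(dbopt, key, value);
  };
  // returns the value as byte[], or nullptr for NotFound / pending error
  return ROCKSDB_NAMESPACE::JniUtil::v_op(getter, env, jkey, jkey_len);
}
// ---------------------------------------------------------------------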
+ * + * @param env (IN) A pointer to the java environment + * @param pointers (IN) A vector of C++ pointers + * @param has_exception (OUT) will be set to JNI_TRUE + * if an ArrayIndexOutOfBoundsException or OutOfMemoryError + * exception occurs + * + * @return Java array of C++ pointer addresses. + */ + template + static jlongArray toJPointers(JNIEnv* env, const std::vector& pointers, + jboolean* has_exception) { + const jsize len = static_cast(pointers.size()); + std::unique_ptr results(new jlong[len]); + std::transform( + pointers.begin(), pointers.end(), results.get(), + [](T* pointer) -> jlong { return GET_CPLUSPLUS_POINTER(pointer); }); + + jlongArray jpointers = env->NewLongArray(len); + if (jpointers == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return nullptr; + } + + env->SetLongArrayRegion(jpointers, 0, len, results.get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; + env->DeleteLocalRef(jpointers); + return nullptr; + } + + *has_exception = JNI_FALSE; + + return jpointers; + } - char* value = reinterpret_cast(env->GetDirectBufferAddress(jval)); - if (value == nullptr || - env->GetDirectBufferCapacity(jval) < (jval_off + jval_len)) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Invalid value argument"); - return; - } + /* + * Helper for operations on a key and value + * for example WriteBatch->Put + * + * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status + * from `op` and used for RocksDB->Put etc. + */ + static void kv_op_direct( + std::function + op, + JNIEnv* env, jobject jkey, jint jkey_off, jint jkey_len, jobject jval, + jint jval_off, jint jval_len) { + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr || + env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, + "Invalid key argument"); + return; + } - key += jkey_off; - value += jval_off; - - ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); - ROCKSDB_NAMESPACE::Slice value_slice(value, jval_len); - - op(key_slice, value_slice); - } - - /* - * Helper for operations on a key and value - * for example WriteBatch->Delete - * - * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status - * from `op` and used for RocksDB->Delete etc. 
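// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): round-tripping
// native handles with fromJPointers/toJPointers. Java passes long[]
// handle values; C++ reinterprets them as typed pointers and back. The
// use of ColumnFamilyHandle is only an example.
static jlongArray ExampleEchoHandles(JNIEnv* env, jlongArray jhandles) {
  jboolean has_exception = JNI_FALSE;
  std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> handles =
      ROCKSDB_NAMESPACE::JniUtil::fromJPointers<
          ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, jhandles,
                                                 &has_exception);
  if (has_exception == JNI_TRUE) {
    return nullptr;  // OutOfMemoryError pending
  }
  jlongArray jresult = ROCKSDB_NAMESPACE::JniUtil::toJPointers<
      ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, handles, &has_exception);
  return has_exception == JNI_TRUE ? nullptr : jresult;
}
// ---------------------------------------------------------------------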
- */ - static void k_op_direct(std::function op, - JNIEnv* env, jobject jkey, jint jkey_off, - jint jkey_len) { - char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); - if (key == nullptr || - env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Invalid key argument"); - return; - } + char* value = reinterpret_cast(env->GetDirectBufferAddress(jval)); + if (value == nullptr || + env->GetDirectBufferCapacity(jval) < (jval_off + jval_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Invalid value argument"); + return; + } - key += jkey_off; + key += jkey_off; + value += jval_off; - ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + ROCKSDB_NAMESPACE::Slice value_slice(value, jval_len); - return op(key_slice); - } + op(key_slice, value_slice); + } - template - static jint copyToDirect(JNIEnv* env, T& source, jobject jtarget, - jint jtarget_off, jint jtarget_len) { - char* target = - reinterpret_cast(env->GetDirectBufferAddress(jtarget)); - if (target == nullptr || - env->GetDirectBufferCapacity(jtarget) < (jtarget_off + jtarget_len)) { - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( - env, "Invalid target argument"); - return 0; - } + /* + * Helper for operations on a key and value + * for example WriteBatch->Delete + * + * TODO(AR) could be extended to cover returning ROCKSDB_NAMESPACE::Status + * from `op` and used for RocksDB->Delete etc. + */ + static void k_op_direct(std::function op, + JNIEnv* env, jobject jkey, jint jkey_off, + jint jkey_len) { + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr || + env->GetDirectBufferCapacity(jkey) < (jkey_off + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, + "Invalid key argument"); + return; + } - target += jtarget_off; + key += jkey_off; - const jint cvalue_len = static_cast(source.size()); - const jint length = std::min(jtarget_len, cvalue_len); + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); - memcpy(target, source.data(), length); + return op(key_slice); + } - return cvalue_len; + template + static jint copyToDirect(JNIEnv* env, T& source, jobject jtarget, + jint jtarget_off, jint jtarget_len) { + char* target = + reinterpret_cast(env->GetDirectBufferAddress(jtarget)); + if (target == nullptr || + env->GetDirectBufferCapacity(jtarget) < (jtarget_off + jtarget_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Invalid target argument"); + return 0; } + + target += jtarget_off; + + const jint cvalue_len = static_cast(source.size()); + const jint length = std::min(jtarget_len, cvalue_len); + + memcpy(target, source.data(), length); + + return cvalue_len; + } }; class MapJni : public JavaClass { @@ -2411,13 +2454,14 @@ class MapJni : public JavaClass { */ static jmethodID getMapPutMethodId(JNIEnv* env) { jclass jlist_clazz = getJClass(env); - if(jlist_clazz == nullptr) { + if (jlist_clazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = - env->GetMethodID(jlist_clazz, "put", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); + static jmethodID mid = env->GetMethodID( + jlist_clazz, "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); assert(mid != nullptr); return mid; } @@ -2459,7 +2503,8 @@ class HashMapJni : public JavaClass { return nullptr; } - jobject jhash_map = env->NewObject(jclazz, mid, static_cast(initial_capacity)); + jobject 
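// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): copyToDirect()
// copies min(jtarget_len, source.size()) bytes into a direct ByteBuffer
// but returns the full source size, letting the Java side detect
// truncation; using PinnableSlice as the source type is an assumption.
static jint ExampleCopyValueOut(JNIEnv* env,
                                ROCKSDB_NAMESPACE::PinnableSlice& value,
                                jobject jtarget, jint jtarget_off,
                                jint jtarget_len) {
  // a return value larger than jtarget_len means the copy was partial
  return ROCKSDB_NAMESPACE::JniUtil::copyToDirect(env, value, jtarget,
                                                  jtarget_off, jtarget_len);
}
// ---------------------------------------------------------------------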
jhash_map = + env->NewObject(jclazz, mid, static_cast(initial_capacity)); if (env->ExceptionCheck()) { return nullptr; } @@ -2474,15 +2519,21 @@ class HashMapJni : public JavaClass { * if an error occurs during the mapping */ template - using FnMapKV = std::function> (const std::pair&)>; + using FnMapKV = + std::function>(const std::pair&)>; - // template ::value_type, std::pair>::value, int32_t>::type = 0> - // static void putAll(JNIEnv* env, const jobject jhash_map, I iterator, const FnMapKV &fn_map_kv) { + // template ::value_type, std::pair>::value, + // int32_t>::type = 0> static void putAll(JNIEnv* env, const jobject + // jhash_map, I iterator, const FnMapKV &fn_map_kv) { /** * Returns true if it succeeds, false if an error occurs */ - template - static bool putAll(JNIEnv* env, const jobject jhash_map, iterator_type iterator, iterator_type end, const FnMapKV &fn_map_kv) { + template + static bool putAll(JNIEnv* env, const jobject jhash_map, + iterator_type iterator, iterator_type end, + const FnMapKV& fn_map_kv) { const jmethodID jmid_put = ROCKSDB_NAMESPACE::MapJni::getMapPutMethodId(env); if (jmid_put == nullptr) { @@ -2490,10 +2541,11 @@ class HashMapJni : public JavaClass { } for (auto it = iterator; it != end; ++it) { - const std::unique_ptr> result = fn_map_kv(*it); + const std::unique_ptr> result = + fn_map_kv(*it); if (result == nullptr) { - // an error occurred during fn_map_kv - return false; + // an error occurred during fn_map_kv + return false; } env->CallObjectMethod(jhash_map, jmid_put, result->first, result->second); if (env->ExceptionCheck()) { @@ -2512,14 +2564,17 @@ class HashMapJni : public JavaClass { } /** - * Creates a java.util.Map from a std::map + * Creates a java.util.Map from a std::map * * @param env A pointer to the Java environment * @param map the Cpp map * - * @return a reference to the Java java.util.Map object, or nullptr if an exception occcurred + * @return a reference to the Java java.util.Map object, or nullptr if an + * exception occcurred */ - static jobject fromCppMap(JNIEnv* env, const std::map* map) { + static jobject fromCppMap(JNIEnv* env, + const std::map* map) { if (map == nullptr) { return nullptr; } @@ -2564,14 +2619,17 @@ class HashMapJni : public JavaClass { } /** - * Creates a java.util.Map from a std::map + * Creates a java.util.Map from a std::map * * @param env A pointer to the Java environment * @param map the Cpp map * - * @return a reference to the Java java.util.Map object, or nullptr if an exception occcurred + * @return a reference to the Java java.util.Map object, or nullptr if an + * exception occcurred */ - static jobject fromCppMap(JNIEnv* env, const std::map* map) { + static jobject fromCppMap(JNIEnv* env, + const std::map* map) { if (map == nullptr) { return nullptr; } @@ -2619,14 +2677,17 @@ class HashMapJni : public JavaClass { } /** - * Creates a java.util.Map from a std::map + * Creates a java.util.Map from a std::map * * @param env A pointer to the Java environment * @param map the Cpp map * - * @return a reference to the Java java.util.Map object, or nullptr if an exception occcurred + * @return a reference to the Java java.util.Map object, or nullptr if an + * exception occcurred */ - static jobject fromCppMap(JNIEnv* env, const std::map* map) { + static jobject fromCppMap(JNIEnv* env, + const std::map* map) { if (map == nullptr) { return nullptr; } @@ -2669,15 +2730,17 @@ class HashMapJni : public JavaClass { return jhash_map; } - /** + /** * Creates a java.util.Map from a std::map * * @param env A pointer to 
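// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): handing a
// std::map back to Java through HashMapJni::fromCppMap(); nullptr again
// means a Java exception is already pending. The property-map scenario is
// hypothetical.
static jobject ExamplePropertiesToJava(
    JNIEnv* env, const std::map<std::string, std::string>& props) {
  return ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &props);
}
// ---------------------------------------------------------------------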
the Java environment * @param map the Cpp map * - * @return a reference to the Java java.util.Map object, or nullptr if an exception occcurred + * @return a reference to the Java java.util.Map object, or nullptr if an + * exception occcurred */ - static jobject fromCppMap(JNIEnv* env, const std::map* map) { + static jobject fromCppMap(JNIEnv* env, + const std::map* map) { if (map == nullptr) { return nullptr; } @@ -2807,7 +2870,7 @@ class ColumnFamilyOptionsJni static jobject construct(JNIEnv* env, const ColumnFamilyOptions* cfoptions) { auto* cfo = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(*cfoptions); jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -2893,7 +2956,7 @@ class WriteBatchJni */ static jobject construct(JNIEnv* env, const WriteBatch* wb) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -2929,8 +2992,7 @@ class WriteBatchHandlerJni * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/WriteBatch$Handler"); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch$Handler"); } /** @@ -2943,7 +3005,7 @@ class WriteBatchHandlerJni */ static jmethodID getPutCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -2963,7 +3025,7 @@ class WriteBatchHandlerJni */ static jmethodID getPutMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -2983,7 +3045,7 @@ class WriteBatchHandlerJni */ static jmethodID getMergeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3003,7 +3065,7 @@ class WriteBatchHandlerJni */ static jmethodID getMergeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3023,7 +3085,7 @@ class WriteBatchHandlerJni */ static jmethodID getDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3043,7 +3105,7 @@ class WriteBatchHandlerJni */ static jmethodID getDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3063,7 +3125,7 @@ class WriteBatchHandlerJni */ static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3083,7 +3145,7 @@ class WriteBatchHandlerJni */ static jmethodID getSingleDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3143,7 +3205,7 @@ class WriteBatchHandlerJni */ static jmethodID getLogDataMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3163,7 +3225,7 @@ 
class WriteBatchHandlerJni */ static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3183,7 +3245,7 @@ class WriteBatchHandlerJni */ static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3203,7 +3265,7 @@ class WriteBatchHandlerJni */ static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3223,7 +3285,7 @@ class WriteBatchHandlerJni */ static jmethodID getMarkNoopMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3243,7 +3305,7 @@ class WriteBatchHandlerJni */ static jmethodID getMarkRollbackMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3263,7 +3325,7 @@ class WriteBatchHandlerJni */ static jmethodID getMarkCommitMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3304,7 +3366,7 @@ class WriteBatchHandlerJni */ static jmethodID getContinueMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3340,7 +3402,7 @@ class WriteBatchSavePointJni : public JavaClass { */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3360,9 +3422,9 @@ class WriteBatchSavePointJni : public JavaClass { * @return A reference to a Java org.rocksdb.WriteBatch.SavePoint object, or * nullptr if an an exception occurs */ - static jobject construct(JNIEnv* env, const SavePoint &save_point) { + static jobject construct(JNIEnv* env, const SavePoint& save_point) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3373,10 +3435,10 @@ class WriteBatchSavePointJni : public JavaClass { return nullptr; } - jobject jsave_point = env->NewObject(jclazz, mid, - static_cast(save_point.size), - static_cast(save_point.count), - static_cast(save_point.content_flags)); + jobject jsave_point = + env->NewObject(jclazz, mid, static_cast(save_point.size), + static_cast(save_point.count), + static_cast(save_point.content_flags)); if (env->ExceptionCheck()) { return nullptr; } @@ -3401,7 +3463,7 @@ class WriteBatchWithIndexJni */ static jclass getJClass(JNIEnv* env) { return RocksDBNativeClass::getJClass(env, - "org/rocksdb/WriteBatchWithIndex"); + "org/rocksdb/WriteBatchWithIndex"); } }; @@ -3431,7 +3493,7 @@ class HistogramDataJni : public JavaClass { */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3546,8 +3608,7 @@ class ColumnFamilyHandleJni * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* 
env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/ColumnFamilyHandle"); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/ColumnFamilyHandle"); } }; @@ -3606,8 +3667,8 @@ class AbstractCompactionFilterFactoryJni * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/AbstractCompactionFilterFactory"); + return RocksDBNativeClass::getJClass( + env, "org/rocksdb/AbstractCompactionFilterFactory"); } /** @@ -3620,13 +3681,13 @@ class AbstractCompactionFilterFactoryJni */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "name", "()Ljava/lang/String;"); + static jmethodID mid = + env->GetMethodID(jclazz, "name", "()Ljava/lang/String;"); assert(mid != nullptr); return mid; } @@ -3641,14 +3702,13 @@ class AbstractCompactionFilterFactoryJni */ static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, - "createCompactionFilter", - "(ZZ)J"); + static jmethodID mid = + env->GetMethodID(jclazz, "createCompactionFilter", "(ZZ)J"); assert(mid != nullptr); return mid; } @@ -3661,15 +3721,15 @@ class AbstractTransactionNotifierJni AbstractTransactionNotifierJni> { public: static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/AbstractTransactionNotifier"); + return RocksDBNativeClass::getJClass( + env, "org/rocksdb/AbstractTransactionNotifier"); } // Get the java method `snapshotCreated` // of org.rocksdb.AbstractTransactionNotifier. 
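// ---------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this diff): the lookup
// pattern these portal classes repeat. The class is resolved first, the
// jmethodID is cached in a function-local static (method IDs remain valid
// while the class stays loaded), and nullptr consistently means "a Java
// exception is pending". The onEvent(long) callback is hypothetical;
// assumes <cassert> is included.
static jmethodID ExampleGetCallbackMethodId(JNIEnv* env, jclass jclazz) {
  if (jclazz == nullptr) {
    return nullptr;  // exception occurred accessing class
  }
  static jmethodID mid = env->GetMethodID(jclazz, "onEvent", "(J)V");
  assert(mid != nullptr);
  return mid;
}
// ---------------------------------------------------------------------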
static jmethodID getSnapshotCreatedMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3693,8 +3753,7 @@ class AbstractComparatorJniBridge : public JavaClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/AbstractComparatorJniBridge"); + return JavaClass::getJClass(env, "org/rocksdb/AbstractComparatorJniBridge"); } /** @@ -3709,7 +3768,8 @@ class AbstractComparatorJniBridge : public JavaClass { static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = env->GetStaticMethodID(jclazz, "compareInternal", - "(Lorg/rocksdb/AbstractComparator;Ljava/nio/ByteBuffer;ILjava/nio/ByteBuffer;I)I"); + "(Lorg/rocksdb/AbstractComparator;Ljava/nio/" + "ByteBuffer;ILjava/nio/ByteBuffer;I)I"); assert(mid != nullptr); return mid; } @@ -3723,10 +3783,12 @@ class AbstractComparatorJniBridge : public JavaClass { * @return The Java Method ID or nullptr if the class or method id could not * be retrieved */ - static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, jclass jclazz) { + static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, + jclass jclazz) { static jmethodID mid = env->GetStaticMethodID(jclazz, "findShortestSeparatorInternal", - "(Lorg/rocksdb/AbstractComparator;Ljava/nio/ByteBuffer;ILjava/nio/ByteBuffer;I)I"); + "(Lorg/rocksdb/AbstractComparator;Ljava/nio/" + "ByteBuffer;ILjava/nio/ByteBuffer;I)I"); assert(mid != nullptr); return mid; } @@ -3740,10 +3802,11 @@ class AbstractComparatorJniBridge : public JavaClass { * @return The Java Method ID or nullptr if the class or method id could not * be retrieved */ - static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, jclass jclazz) { - static jmethodID mid = - env->GetStaticMethodID(jclazz, "findShortSuccessorInternal", - "(Lorg/rocksdb/AbstractComparator;Ljava/nio/ByteBuffer;I)I"); + static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, + jclass jclazz) { + static jmethodID mid = env->GetStaticMethodID( + jclazz, "findShortSuccessorInternal", + "(Lorg/rocksdb/AbstractComparator;Ljava/nio/ByteBuffer;I)I"); assert(mid != nullptr); return mid; } @@ -3764,8 +3827,7 @@ class AbstractComparatorJni * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/AbstractComparator"); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractComparator"); } /** @@ -3778,7 +3840,7 @@ class AbstractComparatorJni */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -3837,19 +3899,19 @@ class SliceJni */ static jobject construct0(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V"); - if(mid == nullptr) { + if (mid == nullptr) { // exception occurred accessing method return nullptr; } jobject jslice = env->NewObject(jclazz, mid); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { return nullptr; } @@ -3885,19 +3947,19 @@ class DirectSliceJni */ static jobject construct0(JNIEnv* env) { jclass jclazz = getJClass(env); -
if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V"); - if(mid == nullptr) { + if (mid == nullptr) { // exception occurred accessing method return nullptr; } jobject jdirect_slice = env->NewObject(jclazz, mid); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { return nullptr; } @@ -3938,14 +4000,14 @@ class BackupInfoJni : public JavaClass { uint64_t size, uint32_t number_files, const std::string& app_metadata) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } static jmethodID mid = env->GetMethodID(jclazz, "<init>", "(IJJILjava/lang/String;)V"); - if(mid == nullptr) { + if (mid == nullptr) { // exception occurred accessing method return nullptr; } @@ -3961,7 +4023,7 @@ class BackupInfoJni : public JavaClass { jobject jbackup_info = env->NewObject(jclazz, mid, backup_id, timestamp, size, number_files, japp_metadata); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { env->DeleteLocalRef(japp_metadata); return nullptr; } @@ -3983,23 +4045,23 @@ class BackupInfoListJni { * if an exception occurs */ static jobject getBackupInfo(JNIEnv* env, - std::vector<BackupInfo> backup_infos) { + std::vector<BackupInfo> backup_infos) { jclass jarray_list_clazz = ROCKSDB_NAMESPACE::ListJni::getArrayListClass(env); - if(jarray_list_clazz == nullptr) { + if (jarray_list_clazz == nullptr) { // exception occurred accessing class return nullptr; } jmethodID cstr_mid = ROCKSDB_NAMESPACE::ListJni::getArrayListConstructorMethodId(env); - if(cstr_mid == nullptr) { + if (cstr_mid == nullptr) { // exception occurred accessing method return nullptr; } jmethodID add_mid = ROCKSDB_NAMESPACE::ListJni::getListAddMethodId(env); - if(add_mid == nullptr) { + if (add_mid == nullptr) { // exception occurred accessing method return nullptr; } @@ -4007,7 +4069,7 @@ class BackupInfoListJni { // create java list jobject jbackup_info_handle_list = env->NewObject(jarray_list_clazz, cstr_mid, backup_infos.size()); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception occurred constructing object return nullptr; } @@ -4020,12 +4082,12 @@ class BackupInfoListJni { jobject obj = ROCKSDB_NAMESPACE::BackupInfoJni::construct0( env, backup_info.backup_id, backup_info.timestamp, backup_info.size, backup_info.number_files, backup_info.app_metadata); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception occurred constructing object - if(obj != nullptr) { + if (obj != nullptr) { env->DeleteLocalRef(obj); } - if(jbackup_info_handle_list != nullptr) { + if (jbackup_info_handle_list != nullptr) { env->DeleteLocalRef(jbackup_info_handle_list); } return nullptr; @@ -4033,12 +4095,12 @@ class BackupInfoListJni { jboolean rs = env->CallBooleanMethod(jbackup_info_handle_list, add_mid, obj); - if(env->ExceptionCheck() || rs == JNI_FALSE) { + if (env->ExceptionCheck() || rs == JNI_FALSE) { // exception occurred calling method, or could not add - if(obj != nullptr) { + if (obj != nullptr) { env->DeleteLocalRef(obj); } - if(jbackup_info_handle_list != nullptr) { + if (jbackup_info_handle_list != nullptr) { env->DeleteLocalRef(jbackup_info_handle_list); } return nullptr; @@ -4075,14 +4137,13 @@ class WBWIRocksIteratorJni : public JavaClass { */ static jfieldID getWriteEntryField(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return
nullptr; } - static jfieldID fid = - env->GetFieldID(jclazz, "entry", - "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;"); + static jfieldID fid = env->GetFieldID( + jclazz, "entry", "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;"); assert(fid != nullptr); return fid; } @@ -4100,7 +4161,7 @@ class WBWIRocksIteratorJni : public JavaClass { assert(jwbwi_rocks_iterator != nullptr); jfieldID jwrite_entry_field = getWriteEntryField(env); - if(jwrite_entry_field == nullptr) { + if (jwrite_entry_field == nullptr) { // exception occurred accessing the field return nullptr; } @@ -4122,9 +4183,7 @@ class WriteTypeJni : public JavaClass { * @return A reference to the enum field value or a nullptr if * the enum field value could not be retrieved */ - static jobject PUT(JNIEnv* env) { - return getEnum(env, "PUT"); - } + static jobject PUT(JNIEnv* env) { return getEnum(env, "PUT"); } /** * Get the MERGE enum field value of WBWIRocksIterator.WriteType @@ -4134,9 +4193,7 @@ class WriteTypeJni : public JavaClass { * @return A reference to the enum field value or a nullptr if * the enum field value could not be retrieved */ - static jobject MERGE(JNIEnv* env) { - return getEnum(env, "MERGE"); - } + static jobject MERGE(JNIEnv* env) { return getEnum(env, "MERGE"); } /** * Get the DELETE enum field value of WBWIRocksIterator.WriteType @@ -4146,9 +4203,7 @@ class WriteTypeJni : public JavaClass { * @return A reference to the enum field value or a nullptr if * the enum field value could not be retrieved */ - static jobject DELETE(JNIEnv* env) { - return getEnum(env, "DELETE"); - } + static jobject DELETE(JNIEnv* env) { return getEnum(env, "DELETE"); } /** * Get the LOG enum field value of WBWIRocksIterator.WriteType @@ -4158,9 +4213,7 @@ class WriteTypeJni : public JavaClass { * @return A reference to the enum field value or a nullptr if * the enum field value could not be retrieved */ - static jobject LOG(JNIEnv* env) { - return getEnum(env, "LOG"); - } + static jobject LOG(JNIEnv* env) { return getEnum(env, "LOG"); } // Returns the equivalent org.rocksdb.WBWIRocksIterator.WriteType for the // provided C++ ROCKSDB_NAMESPACE::WriteType enum @@ -4210,18 +4263,17 @@ class WriteTypeJni : public JavaClass { */ static jobject getEnum(JNIEnv* env, const char name[]) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - jfieldID jfid = - env->GetStaticFieldID(jclazz, name, - "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); - if(env->ExceptionCheck()) { + jfieldID jfid = env->GetStaticFieldID( + jclazz, name, "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + if (env->ExceptionCheck()) { // exception occurred while getting field return nullptr; - } else if(jfid == nullptr) { + } else if (jfid == nullptr) { return nullptr; } @@ -4243,85 +4295,82 @@ class WriteEntryJni : public JavaClass { * ClassFormatError, ClassCircularityError, NoClassDefFoundError, * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteEntry"); - } + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/WBWIRocksIterator$WriteEntry"); + } }; // The portal class for org.rocksdb.InfoLogLevel class InfoLogLevelJni : public JavaClass { public: - /** - * Get the DEBUG_LEVEL enum field value of InfoLogLevel - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a 
nullptr if - * the enum field value could not be retrieved - */ - static jobject DEBUG_LEVEL(JNIEnv* env) { - return getEnum(env, "DEBUG_LEVEL"); - } - - /** - * Get the INFO_LEVEL enum field value of InfoLogLevel - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject INFO_LEVEL(JNIEnv* env) { - return getEnum(env, "INFO_LEVEL"); - } - - /** - * Get the WARN_LEVEL enum field value of InfoLogLevel - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject WARN_LEVEL(JNIEnv* env) { - return getEnum(env, "WARN_LEVEL"); - } - - /** - * Get the ERROR_LEVEL enum field value of InfoLogLevel - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject ERROR_LEVEL(JNIEnv* env) { - return getEnum(env, "ERROR_LEVEL"); - } - - /** - * Get the FATAL_LEVEL enum field value of InfoLogLevel - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject FATAL_LEVEL(JNIEnv* env) { - return getEnum(env, "FATAL_LEVEL"); - } - - /** - * Get the HEADER_LEVEL enum field value of InfoLogLevel - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject HEADER_LEVEL(JNIEnv* env) { - return getEnum(env, "HEADER_LEVEL"); - } + /** + * Get the DEBUG_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject DEBUG_LEVEL(JNIEnv* env) { + return getEnum(env, "DEBUG_LEVEL"); + } + + /** + * Get the INFO_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject INFO_LEVEL(JNIEnv* env) { return getEnum(env, "INFO_LEVEL"); } + + /** + * Get the WARN_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject WARN_LEVEL(JNIEnv* env) { return getEnum(env, "WARN_LEVEL"); } + + /** + * Get the ERROR_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject ERROR_LEVEL(JNIEnv* env) { + return getEnum(env, "ERROR_LEVEL"); + } + + /** + * Get the FATAL_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject FATAL_LEVEL(JNIEnv* env) { + return getEnum(env, "FATAL_LEVEL"); + } + + /** + * Get the HEADER_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or 
a nullptr if + * the enum field value could not be retrieved + */ + static jobject HEADER_LEVEL(JNIEnv* env) { + return getEnum(env, "HEADER_LEVEL"); + } private: /** @@ -4348,17 +4397,17 @@ class InfoLogLevelJni : public JavaClass { */ static jobject getEnum(JNIEnv* env, const char name[]) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } jfieldID jfid = env->GetStaticFieldID(jclazz, name, "Lorg/rocksdb/InfoLogLevel;"); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception occurred while getting field return nullptr; - } else if(jfid == nullptr) { + } else if (jfid == nullptr) { return nullptr; } @@ -4396,14 +4445,13 @@ class LoggerJni */ static jmethodID getLogMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "log", - "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V"); + static jmethodID mid = env->GetMethodID( + jclazz, "log", "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V"); assert(mid != nullptr); return mid; } @@ -4411,7 +4459,7 @@ // The portal class for org.rocksdb.TransactionLogIterator.BatchResult class BatchResultJni : public JavaClass { - public: + public: /** * Get the Java Class org.rocksdb.TransactionLogIterator.BatchResult * @@ -4422,8 +4470,8 @@ class BatchResultJni : public JavaClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/TransactionLogIterator$BatchResult"); + return JavaClass::getJClass( + env, "org/rocksdb/TransactionLogIterator$BatchResult"); } /** @@ -4441,21 +4489,20 @@ static jobject construct(JNIEnv* env, ROCKSDB_NAMESPACE::BatchResult& batch_result) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - jmethodID mid = env->GetMethodID( - jclazz, "<init>", "(JJ)V"); - if(mid == nullptr) { + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJ)V"); + if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } - jobject jbatch_result = env->NewObject(jclazz, mid, - batch_result.sequence, batch_result.writeBatchPtr.get()); - if(jbatch_result == nullptr) { + jobject jbatch_result = env->NewObject(jclazz, mid, batch_result.sequence, + batch_result.writeBatchPtr.get()); + if (jbatch_result == nullptr) { // exception thrown: InstantiationException or OutOfMemoryError return nullptr; } @@ -4473,7 +4520,7 @@ class BottommostLevelCompactionJni { static jint toJavaBottommostLevelCompaction( const ROCKSDB_NAMESPACE::BottommostLevelCompaction& bottommost_level_compaction) { - switch(bottommost_level_compaction) { + switch (bottommost_level_compaction) { case ROCKSDB_NAMESPACE::BottommostLevelCompaction::kSkip: return 0x0; case ROCKSDB_NAMESPACE::BottommostLevelCompaction:: @@ -4492,7 +4539,7 @@ // enum for the provided Java org.rocksdb.BottommostLevelCompaction static ROCKSDB_NAMESPACE::BottommostLevelCompaction toCppBottommostLevelCompaction(jint bottommost_level_compaction) { - switch(bottommost_level_compaction) { + switch (bottommost_level_compaction) { case 0x0: return ROCKSDB_NAMESPACE::BottommostLevelCompaction::kSkip; case 0x1: @@ -4517,7 +4564,7 @@ 
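BottommostLevelCompactionJni above, like the other enum portals that follow, converts between a C++ enum and a stable byte value through explicit switches rather than raw casts, so reordering either enum cannot silently change the value crossing the JNI boundary; unknown values fall back to a safe default. A hedged sketch of that shape, with a made-up two-value enum:

    #include <cstdint>

    enum class Mode { kFast, kSafe };  // illustrative stand-in

    // Explicit case-by-case mapping, as in the toJava*/toCpp* portals.
    inline int8_t toJavaMode(Mode mode) {
      switch (mode) {
        case Mode::kFast:
          return 0x0;
        case Mode::kSafe:
          return 0x1;
      }
      return 0x7F;  // undefined
    }

    inline Mode toCppMode(int8_t jmode) {
      switch (jmode) {
        case 0x0:
          return Mode::kFast;
        case 0x1:
          return Mode::kSafe;
        default:
          return Mode::kSafe;  // undefined/default, mirroring the portals' fallback
      }
    }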
class CompactionStopStyleJni { // C++ ROCKSDB_NAMESPACE::CompactionStopStyle enum static jbyte toJavaCompactionStopStyle( const ROCKSDB_NAMESPACE::CompactionStopStyle& compaction_stop_style) { - switch(compaction_stop_style) { + switch (compaction_stop_style) { case ROCKSDB_NAMESPACE::CompactionStopStyle:: kCompactionStopStyleSimilarSize: return 0x0; @@ -4533,7 +4580,7 @@ class CompactionStopStyleJni { // the provided Java org.rocksdb.CompactionStopStyle static ROCKSDB_NAMESPACE::CompactionStopStyle toCppCompactionStopStyle( jbyte jcompaction_stop_style) { - switch(jcompaction_stop_style) { + switch (jcompaction_stop_style) { case 0x0: return ROCKSDB_NAMESPACE::CompactionStopStyle:: kCompactionStopStyleSimilarSize; @@ -4555,7 +4602,7 @@ class CompressionTypeJni { // C++ ROCKSDB_NAMESPACE::CompressionType enum static jbyte toJavaCompressionType( const ROCKSDB_NAMESPACE::CompressionType& compression_type) { - switch(compression_type) { + switch (compression_type) { case ROCKSDB_NAMESPACE::CompressionType::kNoCompression: return 0x0; case ROCKSDB_NAMESPACE::CompressionType::kSnappyCompression: @@ -4582,7 +4629,7 @@ class CompressionTypeJni { // provided Java org.rocksdb.CompressionType static ROCKSDB_NAMESPACE::CompressionType toCppCompressionType( jbyte jcompression_type) { - switch(jcompression_type) { + switch (jcompression_type) { case 0x0: return ROCKSDB_NAMESPACE::CompressionType::kNoCompression; case 0x1: @@ -4613,7 +4660,7 @@ class CompactionPriorityJni { // C++ ROCKSDB_NAMESPACE::CompactionPri enum static jbyte toJavaCompactionPriority( const ROCKSDB_NAMESPACE::CompactionPri& compaction_priority) { - switch(compaction_priority) { + switch (compaction_priority) { case ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize: return 0x0; case ROCKSDB_NAMESPACE::CompactionPri::kOldestLargestSeqFirst: @@ -4633,7 +4680,7 @@ class CompactionPriorityJni { // provided Java org.rocksdb.CompactionPriority static ROCKSDB_NAMESPACE::CompactionPri toCppCompactionPriority( jbyte jcompaction_priority) { - switch(jcompaction_priority) { + switch (jcompaction_priority) { case 0x0: return ROCKSDB_NAMESPACE::CompactionPri::kByCompensatedSize; case 0x1: @@ -4658,7 +4705,7 @@ class AccessHintJni { // C++ ROCKSDB_NAMESPACE::DBOptions::AccessHint enum static jbyte toJavaAccessHint( const ROCKSDB_NAMESPACE::DBOptions::AccessHint& access_hint) { - switch(access_hint) { + switch (access_hint) { case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE: return 0x0; case ROCKSDB_NAMESPACE::DBOptions::AccessHint::NORMAL: @@ -4677,7 +4724,7 @@ class AccessHintJni { // for the provided Java org.rocksdb.AccessHint static ROCKSDB_NAMESPACE::DBOptions::AccessHint toCppAccessHint( jbyte jaccess_hint) { - switch(jaccess_hint) { + switch (jaccess_hint) { case 0x0: return ROCKSDB_NAMESPACE::DBOptions::AccessHint::NONE; case 0x1: @@ -4700,7 +4747,7 @@ class WALRecoveryModeJni { // C++ ROCKSDB_NAMESPACE::WALRecoveryMode enum static jbyte toJavaWALRecoveryMode( const ROCKSDB_NAMESPACE::WALRecoveryMode& wal_recovery_mode) { - switch(wal_recovery_mode) { + switch (wal_recovery_mode) { case ROCKSDB_NAMESPACE::WALRecoveryMode::kTolerateCorruptedTailRecords: return 0x0; case ROCKSDB_NAMESPACE::WALRecoveryMode::kAbsoluteConsistency: @@ -4719,7 +4766,7 @@ class WALRecoveryModeJni { // provided Java org.rocksdb.WALRecoveryMode static ROCKSDB_NAMESPACE::WALRecoveryMode toCppWALRecoveryMode( jbyte jwal_recovery_mode) { - switch(jwal_recovery_mode) { + switch (jwal_recovery_mode) { case 0x0: return ROCKSDB_NAMESPACE::WALRecoveryMode:: 
kTolerateCorruptedTailRecords; @@ -4742,7 +4789,7 @@ class TickerTypeJni { // Returns the equivalent org.rocksdb.TickerType for the provided // C++ ROCKSDB_NAMESPACE::Tickers enum static jbyte toJavaTickerType(const ROCKSDB_NAMESPACE::Tickers& tickers) { - switch(tickers) { + switch (tickers) { case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS: return 0x0; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_HIT: @@ -5127,7 +5174,7 @@ class TickerTypeJni { // Returns the equivalent C++ ROCKSDB_NAMESPACE::Tickers enum for the // provided Java org.rocksdb.TickerType static ROCKSDB_NAMESPACE::Tickers toCppTickers(jbyte jticker_type) { - switch(jticker_type) { + switch (jticker_type) { case 0x0: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_MISS; case 0x1: @@ -5520,7 +5567,7 @@ class HistogramTypeJni { // C++ ROCKSDB_NAMESPACE::Histograms enum static jbyte toJavaHistogramsType( const ROCKSDB_NAMESPACE::Histograms& histograms) { - switch(histograms) { + switch (histograms) { case ROCKSDB_NAMESPACE::Histograms::DB_GET: return 0x0; case ROCKSDB_NAMESPACE::Histograms::DB_WRITE: @@ -5583,7 +5630,8 @@ class HistogramTypeJni { return 0x1D; case ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS: return 0x1E; - // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor version compatibility. + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor + // version compatibility. case ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME: return 0x20; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_KEY_SIZE: @@ -5648,7 +5696,7 @@ class HistogramTypeJni { // Returns the equivalent C++ ROCKSDB_NAMESPACE::Histograms enum for the // provided Java org.rocksdb.HistogramsType static ROCKSDB_NAMESPACE::Histograms toCppHistograms(jbyte jhistograms_type) { - switch(jhistograms_type) { + switch (jhistograms_type) { case 0x0: return ROCKSDB_NAMESPACE::Histograms::DB_GET; case 0x1: @@ -5711,7 +5759,8 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::DECOMPRESSION_TIMES_NANOS; case 0x1E: return ROCKSDB_NAMESPACE::Histograms::READ_NUM_MERGE_OPERANDS; - // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor version compatibility. + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor + // version compatibility. 
case 0x20: return ROCKSDB_NAMESPACE::Histograms::FLUSH_TIME; case 0x21: @@ -5782,7 +5831,7 @@ class StatsLevelJni { // C++ ROCKSDB_NAMESPACE::StatsLevel enum static jbyte toJavaStatsLevel( const ROCKSDB_NAMESPACE::StatsLevel& stats_level) { - switch(stats_level) { + switch (stats_level) { case ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers: return 0x0; case ROCKSDB_NAMESPACE::StatsLevel::kExceptTimeForMutex: @@ -5799,7 +5848,7 @@ class StatsLevelJni { // Returns the equivalent C++ ROCKSDB_NAMESPACE::StatsLevel enum for the // provided Java org.rocksdb.StatsLevel static ROCKSDB_NAMESPACE::StatsLevel toCppStatsLevel(jbyte jstats_level) { - switch(jstats_level) { + switch (jstats_level) { case 0x0: return ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers; case 0x1: @@ -5821,7 +5870,7 @@ class RateLimiterModeJni { // C++ ROCKSDB_NAMESPACE::RateLimiter::Mode enum static jbyte toJavaRateLimiterMode( const ROCKSDB_NAMESPACE::RateLimiter::Mode& rate_limiter_mode) { - switch(rate_limiter_mode) { + switch (rate_limiter_mode) { case ROCKSDB_NAMESPACE::RateLimiter::Mode::kReadsOnly: return 0x0; case ROCKSDB_NAMESPACE::RateLimiter::Mode::kWritesOnly: @@ -5839,7 +5888,7 @@ class RateLimiterModeJni { // the provided Java org.rocksdb.RateLimiterMode static ROCKSDB_NAMESPACE::RateLimiter::Mode toCppRateLimiterMode( jbyte jrate_limiter_mode) { - switch(jrate_limiter_mode) { + switch (jrate_limiter_mode) { case 0x0: return ROCKSDB_NAMESPACE::RateLimiter::Mode::kReadsOnly; case 0x1: @@ -5856,44 +5905,44 @@ class RateLimiterModeJni { // The portal class for org.rocksdb.MemoryUsageType class MemoryUsageTypeJni { -public: - // Returns the equivalent org.rocksdb.MemoryUsageType for the provided - // C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum - static jbyte toJavaMemoryUsageType( - const ROCKSDB_NAMESPACE::MemoryUtil::UsageType& usage_type) { - switch (usage_type) { - case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal: - return 0x0; - case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed: - return 0x1; - case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal: - return 0x2; - case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal: - return 0x3; - default: - // undefined: use kNumUsageTypes - return 0x4; - } - } - - // Returns the equivalent C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum for - // the provided Java org.rocksdb.MemoryUsageType - static ROCKSDB_NAMESPACE::MemoryUtil::UsageType toCppMemoryUsageType( - jbyte usage_type) { - switch (usage_type) { - case 0x0: - return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal; - case 0x1: - return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed; - case 0x2: - return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal; - case 0x3: - return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal; - default: - // undefined/default: use kNumUsageTypes - return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kNumUsageTypes; - } - } + public: + // Returns the equivalent org.rocksdb.MemoryUsageType for the provided + // C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum + static jbyte toJavaMemoryUsageType( + const ROCKSDB_NAMESPACE::MemoryUtil::UsageType& usage_type) { + switch (usage_type) { + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal: + return 0x0; + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed: + return 0x1; + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal: + return 0x2; + case ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal: + 
return 0x3; + default: + // undefined: use kNumUsageTypes + return 0x4; + } + } + + // Returns the equivalent C++ ROCKSDB_NAMESPACE::MemoryUtil::UsageType enum + // for the provided Java org.rocksdb.MemoryUsageType + static ROCKSDB_NAMESPACE::MemoryUtil::UsageType toCppMemoryUsageType( + jbyte usage_type) { + switch (usage_type) { + case 0x0: + return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableTotal; + case 0x1: + return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kMemTableUnFlushed; + case 0x2: + return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kTableReadersTotal; + case 0x3: + return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kCacheTotal; + default: + // undefined/default: use kNumUsageTypes + return ROCKSDB_NAMESPACE::MemoryUtil::UsageType::kNumUsageTypes; + } + } }; // The portal class for org.rocksdb.Transaction @@ -5909,8 +5958,7 @@ class TransactionJni : public JavaClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/Transaction"); + return JavaClass::getJClass(env, "org/rocksdb/Transaction"); } /** @@ -5926,31 +5974,33 @@ class TransactionJni : public JavaClass { * org.rocksdb.Transaction.WaitingTransactions object, * or nullptr if an an exception occurs */ - static jobject newWaitingTransactions(JNIEnv* env, jobject jtransaction, - const uint32_t column_family_id, const std::string &key, - const std::vector<TransactionID> &transaction_ids) { + static jobject newWaitingTransactions( + JNIEnv* env, jobject jtransaction, const uint32_t column_family_id, + const std::string& key, + const std::vector<TransactionID>& transaction_ids) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } jmethodID mid = env->GetMethodID( - jclazz, "newWaitingTransactions", "(JLjava/lang/String;[J)Lorg/rocksdb/Transaction$WaitingTransactions;"); - if(mid == nullptr) { + jclazz, "newWaitingTransactions", + "(JLjava/lang/String;[J)Lorg/rocksdb/Transaction$WaitingTransactions;"); + if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } jstring jkey = env->NewStringUTF(key.c_str()); - if(jkey == nullptr) { + if (jkey == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } const size_t len = transaction_ids.size(); jlongArray jtransaction_ids = env->NewLongArray(static_cast<jsize>(len)); - if(jtransaction_ids == nullptr) { + if (jtransaction_ids == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jkey); return nullptr; @@ -5958,21 +6008,22 @@ class TransactionJni : public JavaClass { jboolean is_copy; jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy); - if(body == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(jkey); - env->DeleteLocalRef(jtransaction_ids); - return nullptr; + if (body == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + env->DeleteLocalRef(jtransaction_ids); + return nullptr; } - for(size_t i = 0; i < len; ++i) { + for (size_t i = 0; i < len; ++i) { body[i] = static_cast<jlong>(transaction_ids[i]); } env->ReleaseLongArrayElements(jtransaction_ids, body, is_copy == JNI_TRUE ? 
0 : JNI_ABORT); - jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, - mid, static_cast<jlong>(column_family_id), jkey, jtransaction_ids); - if(env->ExceptionCheck()) { + jobject jwaiting_transactions = env->CallObjectMethod( + jtransaction, mid, static_cast<jlong>(column_family_id), jkey, + jtransaction_ids); + if (env->ExceptionCheck()) { // exception thrown: InstantiationException or OutOfMemoryError env->DeleteLocalRef(jkey); env->DeleteLocalRef(jtransaction_ids); @@ -5986,18 +6037,17 @@ // The portal class for org.rocksdb.TransactionDB class TransactionDBJni : public JavaClass { public: - /** - * Get the Java Class org.rocksdb.TransactionDB - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ + /** + * Get the Java Class org.rocksdb.TransactionDB + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/TransactionDB"); + return JavaClass::getJClass(env, "org/rocksdb/TransactionDB"); } /** @@ -6019,29 +6069,30 @@ class TransactionDBJni : public JavaClass { const uint32_t column_family_id, const std::string& waiting_key, const bool exclusive) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } jmethodID mid = env->GetMethodID( - jclazz, "newDeadlockInfo", "(JJLjava/lang/String;Z)Lorg/rocksdb/TransactionDB$DeadlockInfo;"); - if(mid == nullptr) { + jclazz, "newDeadlockInfo", + "(JJLjava/lang/String;Z)Lorg/rocksdb/TransactionDB$DeadlockInfo;"); + if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } jstring jwaiting_key = env->NewStringUTF(waiting_key.c_str()); - if(jwaiting_key == nullptr) { + if (jwaiting_key == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } // resolve the column family id to a ColumnFamilyHandle - jobject jdeadlock_info = env->CallObjectMethod(jtransaction_db, - mid, transaction_id, static_cast<jlong>(column_family_id), - jwaiting_key, exclusive); - if(env->ExceptionCheck()) { + jobject jdeadlock_info = env->CallObjectMethod( + jtransaction_db, mid, transaction_id, + static_cast<jlong>(column_family_id), jwaiting_key, exclusive); + if (env->ExceptionCheck()) { // exception thrown: InstantiationException or OutOfMemoryError env->DeleteLocalRef(jwaiting_key); return nullptr; @@ -6101,8 +6152,7 @@ class KeyLockInfoJni : public JavaClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/TransactionDB$KeyLockInfo"); + return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$KeyLockInfo"); } /** @@ -6120,13 +6170,13 @@ class KeyLockInfoJni : public JavaClass { static jobject construct( JNIEnv* env, const ROCKSDB_NAMESPACE::KeyLockInfo& key_lock_info) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - jmethodID mid = env->GetMethodID( - jclazz, "<init>", "(Ljava/lang/String;[JZ)V"); + jmethodID mid = +
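newWaitingTransactions and newDeadlockInfo above both use the file's standard long-array round-trip: allocate with NewLongArray, obtain the body with GetLongArrayElements, fill it, then release with mode 0 (commit the elements back) only when the VM actually handed out a copy, and JNI_ABORT otherwise since the writes already landed in the array. Reduced to a standalone helper (an assumed function, not one from this diff):

    #include <jni.h>
    #include <cstdint>
    #include <vector>

    // Copies ids into a fresh jlongArray; returns nullptr with a Java
    // exception pending on allocation failure.
    static jlongArray toJlongArray(JNIEnv* env, const std::vector<uint64_t>& ids) {
      const jsize len = static_cast<jsize>(ids.size());
      jlongArray jarr = env->NewLongArray(len);
      if (jarr == nullptr) {
        return nullptr;  // OutOfMemoryError pending
      }
      jboolean is_copy;
      jlong* body = env->GetLongArrayElements(jarr, &is_copy);
      if (body == nullptr) {
        env->DeleteLocalRef(jarr);
        return nullptr;
      }
      for (jsize i = 0; i < len; ++i) {
        body[i] = static_cast<jlong>(ids[i]);
      }
      // Mode 0 writes the copy back and frees it; JNI_ABORT only frees,
      // which is correct when body pointed directly into the array.
      env->ReleaseLongArrayElements(jarr, body, is_copy == JNI_TRUE ? 0 : JNI_ABORT);
      return jarr;
    }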
env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;[JZ)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -6138,7 +6188,8 @@ class KeyLockInfoJni : public JavaClass { return nullptr; } - const jsize jtransaction_ids_len = static_cast<jsize>(key_lock_info.ids.size()); + const jsize jtransaction_ids_len = + static_cast<jsize>(key_lock_info.ids.size()); jlongArray jtransactions_ids = env->NewLongArray(jtransaction_ids_len); if (jtransactions_ids == nullptr) { // exception thrown: OutOfMemoryError @@ -6146,9 +6197,9 @@ class KeyLockInfoJni : public JavaClass { return nullptr; } - const jobject jkey_lock_info = env->NewObject(jclazz, mid, - jkey, jtransactions_ids, key_lock_info.exclusive); - if(jkey_lock_info == nullptr) { + const jobject jkey_lock_info = env->NewObject( + jclazz, mid, jkey, jtransactions_ids, key_lock_info.exclusive); + if (jkey_lock_info == nullptr) { // exception thrown: InstantiationException or OutOfMemoryError env->DeleteLocalRef(jtransactions_ids); env->DeleteLocalRef(jkey); @@ -6171,8 +6222,8 @@ class DeadlockInfoJni : public JavaClass { * ClassFormatError, ClassCircularityError, NoClassDefFoundError, * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env,"org/rocksdb/TransactionDB$DeadlockInfo"); + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockInfo"); } }; @@ -6189,8 +6240,7 @@ class DeadlockPathJni : public JavaClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/TransactionDB$DeadlockPath"); + return JavaClass::getJClass(env, "org/rocksdb/TransactionDB$DeadlockPath"); } /** @@ -6202,24 +6252,23 @@ class DeadlockPathJni : public JavaClass { * org.rocksdb.TransactionDB.DeadlockPath object, * or nullptr if an an exception occurs */ - static jobject construct(JNIEnv* env, - const jobjectArray jdeadlock_infos, const bool limit_exceeded) { + static jobject construct(JNIEnv* env, const jobjectArray jdeadlock_infos, + const bool limit_exceeded) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - jmethodID mid = env->GetMethodID( - jclazz, "<init>", "([LDeadlockInfo;Z)V"); + jmethodID mid = env->GetMethodID(jclazz, "<init>", "([LDeadlockInfo;Z)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } - const jobject jdeadlock_path = env->NewObject(jclazz, mid, - jdeadlock_infos, limit_exceeded); - if(jdeadlock_path == nullptr) { + const jobject jdeadlock_path = + env->NewObject(jclazz, mid, jdeadlock_infos, limit_exceeded); + if (jdeadlock_path == nullptr) { // exception thrown: InstantiationException or OutOfMemoryError return nullptr; } @@ -6243,7 +6292,7 @@ class AbstractTableFilterJni */ static jmethodID getFilterMethod(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } @@ -6654,6 +6703,8 @@ class ChecksumTypeJni { return ROCKSDB_NAMESPACE::ChecksumType::kxxHash; case 0x3: return ROCKSDB_NAMESPACE::ChecksumType::kxxHash64; + case 0x4: + return ROCKSDB_NAMESPACE::ChecksumType::kXXH3; default: // undefined/default return ROCKSDB_NAMESPACE::ChecksumType::kCRC32c; @@ -6957,8 +7008,7 @@ class ThreadStatusJni : public JavaClass { * 
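KeyLockInfoJni::construct and DeadlockPathJni::construct above also show the cleanup discipline used throughout these constructors: each allocation adds an error path that releases every local reference created so far before returning. The skeleton, with illustrative names only:

    #include <jni.h>

    // Builds new SomeClass(String) while releasing local refs on every
    // failure path, mirroring the portal constructors.
    static jobject constructChecked(JNIEnv* env, jclass jclazz, jmethodID mid,
                                    const char* text) {
      jstring jtext = env->NewStringUTF(text);
      if (jtext == nullptr) {
        return nullptr;  // OutOfMemoryError pending
      }
      jobject jobj = env->NewObject(jclazz, mid, jtext);
      if (env->ExceptionCheck()) {
        env->DeleteLocalRef(jtext);
        return nullptr;
      }
      env->DeleteLocalRef(jtext);  // the Java object keeps its own reference
      return jobj;
    }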
OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, - "org/rocksdb/ThreadStatus"); + return JavaClass::getJClass(env, "org/rocksdb/ThreadStatus"); } /** @@ -6974,12 +7024,13 @@ static jobject construct( JNIEnv* env, const ROCKSDB_NAMESPACE::ThreadStatus* thread_status) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JBLjava/lang/String;Ljava/lang/String;BJB[JB)V"); + jmethodID mid = env->GetMethodID( + jclazz, "<init>", "(JBLjava/lang/String;Ljava/lang/String;BJB[JB)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -6988,23 +7039,22 @@ class ThreadStatusJni : public JavaClass { jstring jdb_name = JniUtil::toJavaString(env, &(thread_status->db_name), true); if (env->ExceptionCheck()) { - // an error occurred - return nullptr; + // an error occurred + return nullptr; } jstring jcf_name = JniUtil::toJavaString(env, &(thread_status->cf_name), true); if (env->ExceptionCheck()) { - // an error occurred - env->DeleteLocalRef(jdb_name); - return nullptr; + // an error occurred + env->DeleteLocalRef(jdb_name); + return nullptr; } // long[] const jsize len = static_cast<jsize>( ROCKSDB_NAMESPACE::ThreadStatus::kNumOperationProperties); - jlongArray joperation_properties = - env->NewLongArray(len); + jlongArray joperation_properties = env->NewLongArray(len); if (joperation_properties == nullptr) { // an exception occurred env->DeleteLocalRef(jdb_name); @@ -7014,11 +7064,11 @@ class ThreadStatusJni : public JavaClass { jboolean is_copy; jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy); if (body == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(jdb_name); - env->DeleteLocalRef(jcf_name); - env->DeleteLocalRef(joperation_properties); - return nullptr; + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(joperation_properties); + return nullptr; } for (size_t i = 0; i < len; ++i) { body[i] = static_cast<jlong>(thread_status->op_properties[i]); @@ -7026,10 +7076,9 @@ class ThreadStatusJni : public JavaClass { env->ReleaseLongArrayElements(joperation_properties, body, is_copy == JNI_TRUE ? 
0 : JNI_ABORT); - jobject jcfd = env->NewObject(jclazz, mid, - static_cast<jlong>(thread_status->thread_id), - ThreadTypeJni::toJavaThreadType(thread_status->thread_type), - jdb_name, + jobject jcfd = env->NewObject( + jclazz, mid, static_cast<jlong>(thread_status->thread_id), + ThreadTypeJni::toJavaThreadType(thread_status->thread_type), jdb_name, jcf_name, OperationTypeJni::toJavaOperationType(thread_status->operation_type), static_cast<jlong>(thread_status->op_elapsed_micros), @@ -7038,9 +7087,9 @@ class ThreadStatusJni : public JavaClass { StateTypeJni::toJavaStateType(thread_status->state_type)); if (env->ExceptionCheck()) { // exception occurred - env->DeleteLocalRef(jdb_name); - env->DeleteLocalRef(jcf_name); - env->DeleteLocalRef(joperation_properties); + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(joperation_properties); return nullptr; } @@ -7132,6 +7181,16 @@ class CompactionReasonJni { return 0x0C; case ROCKSDB_NAMESPACE::CompactionReason::kExternalSstIngestion: return 0x0D; + case ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction: + return 0x0E; + case ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature: + return 0x0F; + case ROCKSDB_NAMESPACE::CompactionReason::kForcedBlobGC: + return 0x11; + case ROCKSDB_NAMESPACE::CompactionReason::kRoundRobinTtl: + return 0x12; + case ROCKSDB_NAMESPACE::CompactionReason::kRefitLevel: + return 0x13; default: return 0x7F; // undefined } @@ -7176,6 +7235,12 @@ class CompactionReasonJni { return ROCKSDB_NAMESPACE::CompactionReason::kPeriodicCompaction; case 0x0F: return ROCKSDB_NAMESPACE::CompactionReason::kChangeTemperature; + case 0x11: + return ROCKSDB_NAMESPACE::CompactionReason::kForcedBlobGC; + case 0x12: + return ROCKSDB_NAMESPACE::CompactionReason::kRoundRobinTtl; + case 0x13: + return ROCKSDB_NAMESPACE::CompactionReason::kRefitLevel; default: // undefined/default return ROCKSDB_NAMESPACE::CompactionReason::kUnknown; @@ -7234,7 +7299,8 @@ class LogFileJni : public JavaClass { return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;JBJJ)V"); + jmethodID mid = + env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;JBJJ)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -7289,7 +7355,9 @@ class LiveFileMetaDataJni : public JavaClass { return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "<init>", "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + jmethodID mid = env->GetMethodID( + jclazz, "<init>", + "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -7340,21 +7408,17 @@ class LiveFileMetaDataJni : public JavaClass { return nullptr; } - jobject jlive_file_meta_data = env->NewObject(jclazz, mid, - jcolumn_family_name, - static_cast<jint>(live_file_meta_data->level), - jfile_name, - jpath, + jobject jlive_file_meta_data = env->NewObject( + jclazz, mid, jcolumn_family_name, + static_cast<jint>(live_file_meta_data->level), jfile_name, jpath, static_cast<jlong>(live_file_meta_data->size), static_cast<jlong>(live_file_meta_data->smallest_seqno), - static_cast<jlong>(live_file_meta_data->largest_seqno), - jsmallest_key, + static_cast<jlong>(live_file_meta_data->largest_seqno), jsmallest_key, jlargest_key, static_cast<jlong>(live_file_meta_data->num_reads_sampled), static_cast<jboolean>(live_file_meta_data->being_compacted), static_cast<jlong>(live_file_meta_data->num_entries), - static_cast<jlong>(live_file_meta_data->num_deletions) - ); + 
static_cast<jlong>(live_file_meta_data->num_deletions)); if (env->ExceptionCheck()) { env->DeleteLocalRef(jcolumn_family_name); @@ -7400,7 +7464,8 @@ class SstFileMetaDataJni : public JavaClass { return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + jmethodID mid = env->GetMethodID( + jclazz, "<init>", "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -7440,19 +7505,15 @@ class SstFileMetaDataJni : public JavaClass { return nullptr; } - jobject jsst_file_meta_data = env->NewObject(jclazz, mid, - jfile_name, - jpath, + jobject jsst_file_meta_data = env->NewObject( + jclazz, mid, jfile_name, jpath, static_cast<jlong>(sst_file_meta_data->size), static_cast<jlong>(sst_file_meta_data->smallest_seqno), - static_cast<jlong>(sst_file_meta_data->largest_seqno), - jsmallest_key, - jlargest_key, - static_cast<jlong>(sst_file_meta_data->num_reads_sampled), + static_cast<jlong>(sst_file_meta_data->largest_seqno), jsmallest_key, + jlargest_key, static_cast<jlong>(sst_file_meta_data->num_reads_sampled), static_cast<jboolean>(sst_file_meta_data->being_compacted), static_cast<jlong>(sst_file_meta_data->num_entries), - static_cast<jlong>(sst_file_meta_data->num_deletions) - ); + static_cast<jlong>(sst_file_meta_data->num_deletions)); if (env->ExceptionCheck()) { env->DeleteLocalRef(jfile_name); @@ -7463,10 +7524,10 @@ class SstFileMetaDataJni : public JavaClass { } // cleanup - env->DeleteLocalRef(jfile_name); - env->DeleteLocalRef(jpath); - env->DeleteLocalRef(jsmallest_key); - env->DeleteLocalRef(jlargest_key); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); return jsst_file_meta_data; } @@ -7495,15 +7556,16 @@ class LevelMetaDataJni : public JavaClass { return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "<init>", "(IJ[Lorg/rocksdb/SstFileMetaData;)V"); + jmethodID mid = env->GetMethodID(jclazz, "<init>", + "(IJ[Lorg/rocksdb/SstFileMetaData;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } - const jsize jlen = - static_cast<jsize>(level_meta_data->files.size()); - jobjectArray jfiles = env->NewObjectArray(jlen, SstFileMetaDataJni::getJClass(env), nullptr); + const jsize jlen = static_cast<jsize>(level_meta_data->files.size()); + jobjectArray jfiles = + env->NewObjectArray(jlen, SstFileMetaDataJni::getJClass(env), nullptr); if (jfiles == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -7511,7 +7573,7 @@ class LevelMetaDataJni : public JavaClass { jsize i = 0; for (auto it = level_meta_data->files.begin(); - it != level_meta_data->files.end(); ++it) { + it != level_meta_data->files.end(); ++it) { jobject jfile = SstFileMetaDataJni::fromCppSstFileMetaData(env, &(*it)); if (jfile == nullptr) { // exception occurred @@ -7521,11 +7583,9 @@ class LevelMetaDataJni : public JavaClass { env->SetObjectArrayElement(jfiles, i++, jfile); } - jobject jlevel_meta_data = env->NewObject(jclazz, mid, - static_cast<jint>(level_meta_data->level), - static_cast<jlong>(level_meta_data->size), - jfiles - ); + jobject jlevel_meta_data = + env->NewObject(jclazz, mid, static_cast<jint>(level_meta_data->level), + static_cast<jlong>(level_meta_data->size), jfiles); if (env->ExceptionCheck()) { env->DeleteLocalRef(jfiles); @@ -7563,7 +7623,8 @@ class ColumnFamilyMetaDataJni : public JavaClass { return nullptr; } - jmethodID mid = 
env->GetMethodID(jclazz, "<init>", + "(JJ[B[Lorg/rocksdb/LevelMetaData;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -7578,8 +7639,9 @@ class ColumnFamilyMetaDataJni : public JavaClass { const jsize jlen = static_cast<jsize>(column_famly_meta_data->levels.size()); - jobjectArray jlevels = env->NewObjectArray(jlen, LevelMetaDataJni::getJClass(env), nullptr); - if(jlevels == nullptr) { + jobjectArray jlevels = + env->NewObjectArray(jlen, LevelMetaDataJni::getJClass(env), nullptr); + if (jlevels == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jname); return nullptr; @@ -7587,7 +7649,7 @@ class ColumnFamilyMetaDataJni : public JavaClass { jsize i = 0; for (auto it = column_famly_meta_data->levels.begin(); - it != column_famly_meta_data->levels.end(); ++it) { + it != column_famly_meta_data->levels.end(); ++it) { jobject jlevel = LevelMetaDataJni::fromCppLevelMetaData(env, &(*it)); if (jlevel == nullptr) { // exception occurred @@ -7598,12 +7660,9 @@ class ColumnFamilyMetaDataJni : public JavaClass { env->SetObjectArrayElement(jlevels, i++, jlevel); } - jobject jcolumn_family_meta_data = env->NewObject(jclazz, mid, - static_cast<jlong>(column_famly_meta_data->size), - static_cast<jlong>(column_famly_meta_data->file_count), - jname, - jlevels - ); + jobject jcolumn_family_meta_data = env->NewObject( + jclazz, mid, static_cast<jlong>(column_famly_meta_data->size), + static_cast<jlong>(column_famly_meta_data->file_count), jname, jlevels); if (env->ExceptionCheck()) { env->DeleteLocalRef(jname); @@ -7640,7 +7699,7 @@ class AbstractTraceWriterJni */ static jclass getJClass(JNIEnv* env) { return RocksDBNativeClass::getJClass(env, - "org/rocksdb/AbstractTraceWriter"); + "org/rocksdb/AbstractTraceWriter"); } /** @@ -7653,13 +7712,12 @@ class AbstractTraceWriterJni */ static jmethodID getWriteProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "writeProxy", "(J)S"); + static jmethodID mid = env->GetMethodID(jclazz, "writeProxy", "(J)S"); assert(mid != nullptr); return mid; } @@ -7674,13 +7732,12 @@ class AbstractTraceWriterJni */ static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "closeWriterProxy", "()S"); + static jmethodID mid = env->GetMethodID(jclazz, "closeWriterProxy", "()S"); assert(mid != nullptr); return mid; } @@ -7695,13 +7752,12 @@ class AbstractTraceWriterJni */ static jmethodID getGetFileSizeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "getFileSize", "()J"); + static jmethodID mid = env->GetMethodID(jclazz, "getFileSize", "()J"); assert(mid != nullptr); return mid; } @@ -7722,8 +7778,7 @@ class AbstractWalFilterJni * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/AbstractWalFilter"); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractWalFilter"); } /** @@ -7736,14 +7791,14 @@ class AbstractWalFilterJni */ static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { 
jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "columnFamilyLogNumberMap", - "(Ljava/util/Map;Ljava/util/Map;)V"); + static jmethodID mid = + env->GetMethodID(jclazz, "columnFamilyLogNumberMap", + "(Ljava/util/Map;Ljava/util/Map;)V"); assert(mid != nullptr); return mid; } @@ -7758,13 +7813,13 @@ class AbstractWalFilterJni */ static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "logRecordFoundProxy", "(JLjava/lang/String;JJ)S"); + static jmethodID mid = env->GetMethodID(jclazz, "logRecordFoundProxy", + "(JLjava/lang/String;JJ)S"); assert(mid != nullptr); return mid; } @@ -7779,13 +7834,13 @@ class AbstractWalFilterJni */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID( - jclazz, "name", "()Ljava/lang/String;"); + static jmethodID mid = + env->GetMethodID(jclazz, "name", "()Ljava/lang/String;"); assert(mid != nullptr); return mid; } @@ -7848,7 +7903,7 @@ class ReusedSynchronisationTypeJni { static jbyte toJavaReusedSynchronisationType( const ROCKSDB_NAMESPACE::ReusedSynchronisationType& reused_synchronisation_type) { - switch(reused_synchronisation_type) { + switch (reused_synchronisation_type) { case ROCKSDB_NAMESPACE::ReusedSynchronisationType::MUTEX: return 0x0; case ROCKSDB_NAMESPACE::ReusedSynchronisationType::ADAPTIVE_MUTEX: @@ -7864,7 +7919,7 @@ class ReusedSynchronisationTypeJni { // enum for the provided Java org.rocksdb.ReusedSynchronisationType static ROCKSDB_NAMESPACE::ReusedSynchronisationType toCppReusedSynchronisationType(jbyte reused_synchronisation_type) { - switch(reused_synchronisation_type) { + switch (reused_synchronisation_type) { case 0x0: return ROCKSDB_NAMESPACE::ReusedSynchronisationType::MUTEX; case 0x1: @@ -7883,7 +7938,7 @@ class SanityLevelJni { // Returns the equivalent org.rocksdb.SanityLevel for the provided // C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum static jbyte toJavaSanityLevel( - const ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel &sanity_level) { + const ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel& sanity_level) { switch (sanity_level) { case ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel::kSanityLevelNone: return 0x0; @@ -7898,8 +7953,8 @@ class SanityLevelJni { } } - // Returns the equivalent C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel enum for - // the provided Java org.rocksdb.SanityLevel + // Returns the equivalent C++ ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel + // enum for the provided Java org.rocksdb.SanityLevel static ROCKSDB_NAMESPACE::ConfigOptions::SanityLevel toCppSanityLevel( jbyte sanity_level) { switch (sanity_level) { diff --git a/java/rocksjni/rocksdb_exception_test.cc b/java/rocksjni/rocksdb_exception_test.cc index d0fd834baa1..67e62f72662 100644 --- a/java/rocksjni/rocksdb_exception_test.cc +++ b/java/rocksjni/rocksdb_exception_test.cc @@ -6,7 +6,6 @@ #include <jni.h> #include "include/org_rocksdb_RocksDBExceptionTest.h" - #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksjni/portal.h" diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 
fca7074edbf..ced72e84160 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -61,8 +61,9 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, * Method: open * Signature: (JLjava/lang/String;)J */ -jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( - JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { +jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(JNIEnv* env, jclass, + jlong jopt_handle, + jstring jdb_path) { return rocksdb_open_helper(env, jopt_handle, jdb_path, (ROCKSDB_NAMESPACE::Status(*)( const ROCKSDB_NAMESPACE::Options&, @@ -288,8 +289,7 @@ Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_RocksDB_disposeInternal(JNIEnv*, jobject, jlong jhandle) { auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle); assert(db != nullptr); delete db; @@ -300,8 +300,8 @@ void Java_org_rocksdb_RocksDB_disposeInternal( * Method: closeDatabase * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_closeDatabase( - JNIEnv* env, jclass, jlong jhandle) { +void Java_org_rocksdb_RocksDB_closeDatabase(JNIEnv* env, jclass, + jlong jhandle) { auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle); assert(db != nullptr); ROCKSDB_NAMESPACE::Status s = db->Close(); @@ -313,8 +313,9 @@ void Java_org_rocksdb_RocksDB_closeDatabase( * Method: listColumnFamilies * Signature: (JLjava/lang/String;)[[B */ -jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies( - JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { +jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies(JNIEnv* env, jclass, + jlong jopt_handle, + jstring jdb_path) { std::vector<std::string> column_family_names; const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { @@ -339,9 +340,11 @@ jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies( * Method: createColumnFamily * Signature: (J[BIJ)J */ -jlong Java_org_rocksdb_RocksDB_createColumnFamily( - JNIEnv* env, jobject, jlong jhandle, jbyteArray jcf_name, - jint jcf_name_len, jlong jcf_options_handle) { +jlong Java_org_rocksdb_RocksDB_createColumnFamily(JNIEnv* env, jobject, + jlong jhandle, + jbyteArray jcf_name, + jint jcf_name_len, + jlong jcf_options_handle) { auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jhandle); jboolean has_exception = JNI_FALSE; const std::string cf_name = @@ -424,9 +427,9 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( jlong* jcf_options_handles_elems = env->GetLongArrayElements(jcf_options_handles, nullptr); - if(jcf_options_handles_elems == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; + if (jcf_options_handles_elems == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; } // extract the column family descriptors @@ -435,11 +438,12 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( auto* cf_options = reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>( jcf_options_handles_elems[i]); - jbyteArray jcf_name = static_cast<jbyteArray>( - env->GetObjectArrayElement(jcf_names, i)); + jbyteArray jcf_name = + static_cast<jbyteArray>(env->GetObjectArrayElement(jcf_names, i)); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, JNI_ABORT); + env->ReleaseLongArrayElements(jcf_options_handles, + jcf_options_handles_elems, JNI_ABORT); return nullptr; } const std::string cf_name = @@ -452,7 +456,8 @@ jlongArray 
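The exported names being rewrapped in this file follow the standard JNI overload-mangling rules: a double underscore introduces the argument signature, J is long, I is int, Ljava_lang_String_2 is java.lang.String (with / mangled to _ and ; to _2), and [ becomes _3. So the put overload that appears below declares as:

    #include <jni.h>

    // put(long, byte[], int, int, byte[], int, int) on org.rocksdb.RocksDB:
    // "__" starts the signature, "J" = long, "_3B" = byte[], "I" = int.
    extern "C" JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_put__J_3BII_3BII(
        JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jkey, jint jkey_off,
        jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len);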
Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( if (has_exception == JNI_TRUE) { // exception occurred env->DeleteLocalRef(jcf_name); - env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, JNI_ABORT); + env->ReleaseLongArrayElements(jcf_options_handles, + jcf_options_handles_elems, JNI_ABORT); return nullptr; } @@ -466,7 +471,8 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( ROCKSDB_NAMESPACE::Status s = db->CreateColumnFamilies(cf_descriptors, &cf_handles); - env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, JNI_ABORT); + env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, + JNI_ABORT); if (!s.ok()) { // error occurred @@ -488,9 +494,9 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( * Method: dropColumnFamily * Signature: (JJ)V; */ -void Java_org_rocksdb_RocksDB_dropColumnFamily( - JNIEnv* env, jobject, jlong jdb_handle, - jlong jcf_handle) { +void Java_org_rocksdb_RocksDB_dropColumnFamily(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle) { auto* db_handle = reinterpret_cast(jdb_handle); auto* cf_handle = reinterpret_cast(jcf_handle); @@ -506,8 +512,7 @@ void Java_org_rocksdb_RocksDB_dropColumnFamily( * Signature: (J[J)V */ void Java_org_rocksdb_RocksDB_dropColumnFamilies( - JNIEnv* env, jobject, jlong jdb_handle, - jlongArray jcolumn_family_handles) { + JNIEnv* env, jobject, jlong jdb_handle, jlongArray jcolumn_family_handles) { auto* db_handle = reinterpret_cast(jdb_handle); std::vector cf_handles; @@ -591,15 +596,16 @@ bool rocksdb_put_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, * Method: put * Signature: (J[BII[BII)V */ -void Java_org_rocksdb_RocksDB_put__J_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { +void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } /* @@ -607,11 +613,12 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BII( * Method: put * Signature: (J[BII[BIIJ)V */ -void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, - jlong jcf_handle) { +void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); @@ -619,7 +626,7 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_put_helper(env, db, default_write_options, cf_handle, jkey, - jkey_off, jkey_len, jval, jval_off, jval_len); + jkey_off, jkey_len, jval, jval_off, jval_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -632,16 +639,17 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ( * 
Method: put * Signature: (JJ[BII[BII)V */ -void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jlong jwrite_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { +void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_off, jkey_len, - jval, jval_off, jval_len); + jval, jval_off, jval_len); } /* @@ -651,9 +659,8 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII( */ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ( JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, - jlong jcf_handle) { + jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); @@ -661,7 +668,7 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_put_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -744,14 +751,14 @@ bool rocksdb_delete_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, * Method: delete * Signature: (J[BII)V */ -void Java_org_rocksdb_RocksDB_delete__J_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { +void Java_org_rocksdb_RocksDB_delete__J_3BII(JNIEnv* env, jobject, + jlong jdb_handle, jbyteArray jkey, + jint jkey_off, jint jkey_len) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); rocksdb_delete_helper(env, db, default_write_options, nullptr, jkey, jkey_off, - jkey_len); + jkey_len); } /* @@ -759,10 +766,10 @@ void Java_org_rocksdb_RocksDB_delete__J_3BII( * Method: delete * Signature: (J[BIIJ)V */ -void Java_org_rocksdb_RocksDB_delete__J_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jlong jcf_handle) { +void Java_org_rocksdb_RocksDB_delete__J_3BIIJ(JNIEnv* env, jobject, + jlong jdb_handle, jbyteArray jkey, + jint jkey_off, jint jkey_len, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); @@ -770,7 +777,7 @@ void Java_org_rocksdb_RocksDB_delete__J_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_delete_helper(env, db, default_write_options, cf_handle, jkey, - jkey_off, jkey_len); + jkey_off, jkey_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -783,16 +790,16 @@ void Java_org_rocksdb_RocksDB_delete__J_3BIIJ( * Method: delete * Signature: (JJ[BII)V */ -void Java_org_rocksdb_RocksDB_delete__JJ_3BII( - JNIEnv* env, jobject, - jlong jdb_handle, - jlong jwrite_options, - jbyteArray jkey, jint jkey_off, jint jkey_len) { +void 
Java_org_rocksdb_RocksDB_delete__JJ_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jwrite_options, + jbyteArray jkey, jint jkey_off, + jint jkey_len) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options); rocksdb_delete_helper(env, db, *write_options, nullptr, jkey, jkey_off, - jkey_len); + jkey_len); } /* @@ -810,7 +817,7 @@ void Java_org_rocksdb_RocksDB_delete__JJ_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_delete_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len); + jkey_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -861,16 +868,15 @@ bool rocksdb_single_delete_helper( * Method: singleDelete * Signature: (J[BI)V */ -void Java_org_rocksdb_RocksDB_singleDelete__J_3BI( - JNIEnv* env, jobject, - jlong jdb_handle, - jbyteArray jkey, - jint jkey_len) { +void Java_org_rocksdb_RocksDB_singleDelete__J_3BI(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, + jint jkey_len) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); - rocksdb_single_delete_helper(env, db, default_write_options, nullptr, - jkey, jkey_len); + rocksdb_single_delete_helper(env, db, default_write_options, nullptr, jkey, + jkey_len); } /* @@ -878,9 +884,11 @@ void Java_org_rocksdb_RocksDB_singleDelete__J_3BI( * Method: singleDelete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_len, jlong jcf_handle) { +void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, + jint jkey_len, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); @@ -888,7 +896,7 @@ void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_single_delete_helper(env, db, default_write_options, cf_handle, - jkey, jkey_len); + jkey, jkey_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -901,16 +909,16 @@ void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ( * Method: singleDelete * Signature: (JJ[BIJ)V */ -void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI( - JNIEnv* env, jobject, jlong jdb_handle, - jlong jwrite_options, - jbyteArray jkey, - jint jkey_len) { +void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jwrite_options, + jbyteArray jkey, + jint jkey_len) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options); rocksdb_single_delete_helper(env, db, *write_options, nullptr, jkey, - jkey_len); + jkey_len); } /* @@ -928,7 +936,7 @@ void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_single_delete_helper(env, db, *write_options, cf_handle, jkey, - jkey_len); + jkey_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -950,7 +958,7 @@ bool rocksdb_delete_range_helper( jint jend_key_off, jint jend_key_len) { jbyte* begin_key = new jbyte[jbegin_key_len]; env->GetByteArrayRegion(jbegin_key, jbegin_key_off, jbegin_key_len, - begin_key); + begin_key); if (env->ExceptionCheck()) { // 
exception thrown: ArrayIndexOutOfBoundsException delete[] begin_key; @@ -991,15 +999,15 @@ bool rocksdb_delete_range_helper( * Signature: (J[BII[BII)V */ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, - jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { + JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jbegin_key, + jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, + jint jend_key_off, jint jend_key_len) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); rocksdb_delete_range_helper(env, db, default_write_options, nullptr, - jbegin_key, jbegin_key_off, jbegin_key_len, - jend_key, jend_key_off, jend_key_len); + jbegin_key, jbegin_key_off, jbegin_key_len, + jend_key, jend_key_off, jend_key_len); } jint rocksdb_get_helper_direct( @@ -1052,15 +1060,14 @@ jint rocksdb_get_helper_direct( ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); - // TODO(yhchiang): we might save one memory allocation here by adding - // a DB::Get() function which takes preallocated jbyte* as input. - std::string cvalue; + ROCKSDB_NAMESPACE::PinnableSlice pinnable_value; ROCKSDB_NAMESPACE::Status s; if (column_family_handle != nullptr) { - s = db->Get(read_options, column_family_handle, key_slice, &cvalue); + s = db->Get(read_options, column_family_handle, key_slice, &pinnable_value); } else { // backwards compatibility - s = db->Get(read_options, key_slice, &cvalue); + s = db->Get(read_options, db->DefaultColumnFamily(), key_slice, + &pinnable_value); } if (s.IsNotFound()) { @@ -1080,13 +1087,14 @@ jint rocksdb_get_helper_direct( return kStatusError; } - const jint cvalue_len = static_cast(cvalue.size()); - const jint length = std::min(jval_len, cvalue_len); + const jint pinnable_value_len = static_cast(pinnable_value.size()); + const jint length = std::min(jval_len, pinnable_value_len); - memcpy(value, cvalue.c_str(), length); + memcpy(value, pinnable_value.data(), length); + pinnable_value.Reset(); *has_exception = false; - return cvalue_len; + return pinnable_value_len; } /* @@ -1095,10 +1103,9 @@ jint rocksdb_get_helper_direct( * Signature: (J[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, - jbyteArray jend_key, jint jend_key_off, jint jend_key_len, - jlong jcf_handle) { + JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jbegin_key, + jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, + jint jend_key_off, jint jend_key_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); @@ -1106,8 +1113,8 @@ void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_delete_range_helper(env, db, default_write_options, cf_handle, - jbegin_key, jbegin_key_off, jbegin_key_len, - jend_key, jend_key_off, jend_key_len); + jbegin_key, jbegin_key_off, jbegin_key_len, + jend_key, jend_key_off, jend_key_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1128,8 +1135,8 @@ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BII( auto* write_options = reinterpret_cast(jwrite_options); rocksdb_delete_range_helper(env, db, 
*write_options, nullptr, jbegin_key, - jbegin_key_off, jbegin_key_len, jend_key, - jend_key_off, jend_key_len); + jbegin_key_off, jbegin_key_len, jend_key, + jend_key_off, jend_key_len); } /* @@ -1148,9 +1155,9 @@ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ( auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - rocksdb_delete_range_helper(env, db, *write_options, cf_handle, - jbegin_key, jbegin_key_off, jbegin_key_len, - jend_key, jend_key_off, jend_key_len); + rocksdb_delete_range_helper(env, db, *write_options, cf_handle, jbegin_key, + jbegin_key_off, jbegin_key_len, jend_key, + jend_key_off, jend_key_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1236,15 +1243,16 @@ bool rocksdb_merge_helper(JNIEnv* env, ROCKSDB_NAMESPACE::DB* db, * Method: merge * Signature: (J[BII[BII)V */ -void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { +void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = ROCKSDB_NAMESPACE::WriteOptions(); rocksdb_merge_helper(env, db, default_write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } /* @@ -1253,9 +1261,8 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII( * Signature: (J[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, + JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); static const ROCKSDB_NAMESPACE::WriteOptions default_write_options = @@ -1264,7 +1271,7 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_merge_helper(env, db, default_write_options, cf_handle, jkey, - jkey_off, jkey_len, jval, jval_off, jval_len); + jkey_off, jkey_len, jval, jval_off, jval_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1279,13 +1286,13 @@ void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( */ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII( JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { + jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); rocksdb_merge_helper(env, db, *write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } /* @@ -1295,8 +1302,8 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII( */ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ( JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { + jbyteArray jkey, jint 
jkey_off, jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len, jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); @@ -1304,7 +1311,7 @@ void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { rocksdb_merge_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1363,9 +1370,9 @@ void Java_org_rocksdb_RocksDB_deleteDirect(JNIEnv* env, jobject /*jdb*/, * Method: write0 * Signature: (JJJ)V */ -void Java_org_rocksdb_RocksDB_write0( - JNIEnv* env, jobject, jlong jdb_handle, - jlong jwrite_options_handle, jlong jwb_handle) { +void Java_org_rocksdb_RocksDB_write0(JNIEnv* env, jobject, jlong jdb_handle, + jlong jwrite_options_handle, + jlong jwb_handle) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); @@ -1383,9 +1390,9 @@ void Java_org_rocksdb_RocksDB_write0( * Method: write1 * Signature: (JJJ)V */ -void Java_org_rocksdb_RocksDB_write1( - JNIEnv* env, jobject, jlong jdb_handle, - jlong jwrite_options_handle, jlong jwbwi_handle) { +void Java_org_rocksdb_RocksDB_write1(JNIEnv* env, jobject, jlong jdb_handle, + jlong jwrite_options_handle, + jlong jwbwi_handle) { auto* db = reinterpret_cast(jdb_handle); auto* write_options = reinterpret_cast(jwrite_options_handle); @@ -1418,13 +1425,13 @@ jbyteArray rocksdb_get_helper( ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - std::string value; + ROCKSDB_NAMESPACE::PinnableSlice pinnable_value; ROCKSDB_NAMESPACE::Status s; if (column_family_handle != nullptr) { - s = db->Get(read_opt, column_family_handle, key_slice, &value); + s = db->Get(read_opt, column_family_handle, key_slice, &pinnable_value); } else { - // backwards compatibility - s = db->Get(read_opt, key_slice, &value); + s = db->Get(read_opt, db->DefaultColumnFamily(), key_slice, + &pinnable_value); } // cleanup @@ -1435,7 +1442,9 @@ jbyteArray rocksdb_get_helper( } if (s.ok()) { - jbyteArray jret_value = ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, value); + jbyteArray jret_value = + ROCKSDB_NAMESPACE::JniUtil::copyBytes(env, pinnable_value); + pinnable_value.Reset(); if (jret_value == nullptr) { // exception occurred return nullptr; @@ -1452,9 +1461,10 @@ jbyteArray rocksdb_get_helper( * Method: get * Signature: (J[BII)[B */ -jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len) { return rocksdb_get_helper( env, reinterpret_cast(jdb_handle), ROCKSDB_NAMESPACE::ReadOptions(), nullptr, jkey, jkey_off, jkey_len); @@ -1465,9 +1475,11 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII( * Method: get * Signature: (J[BIIJ)[B */ -jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, + jlong jcf_handle) { auto db_handle = reinterpret_cast(jdb_handle); auto cf_handle = reinterpret_cast(jcf_handle); @@ -1487,10 +1499,11 @@ jbyteArray 
Java_org_rocksdb_RocksDB_get__J_3BIIJ( * Method: get * Signature: (JJ[BII)[B */ -jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII( - JNIEnv* env, jobject, - jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { +jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jropt_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len) { return rocksdb_get_helper( env, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), nullptr, @@ -1503,16 +1516,16 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII( * Signature: (JJ[BIIJ)[B */ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { + JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, + jint jkey_off, jint jkey_len, jlong jcf_handle) { auto* db_handle = reinterpret_cast(jdb_handle); auto& ro_opt = *reinterpret_cast(jropt_handle); auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - return rocksdb_get_helper( - env, db_handle, ro_opt, cf_handle, jkey, jkey_off, jkey_len); + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off, + jkey_len); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1540,15 +1553,13 @@ jint rocksdb_get_helper( } ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast(key), jkey_len); - // TODO(yhchiang): we might save one memory allocation here by adding - // a DB::Get() function which takes preallocated jbyte* as input. - std::string cvalue; + ROCKSDB_NAMESPACE::PinnableSlice pinnable_value; ROCKSDB_NAMESPACE::Status s; if (column_family_handle != nullptr) { - s = db->Get(read_options, column_family_handle, key_slice, &cvalue); + s = db->Get(read_options, column_family_handle, key_slice, &pinnable_value); } else { - // backwards compatibility - s = db->Get(read_options, key_slice, &cvalue); + s = db->Get(read_options, db->DefaultColumnFamily(), key_slice, + &pinnable_value); } // cleanup @@ -1571,12 +1582,13 @@ jint rocksdb_get_helper( return kStatusError; } - const jint cvalue_len = static_cast(cvalue.size()); - const jint length = std::min(jval_len, cvalue_len); + const jint pinnable_value_len = static_cast(pinnable_value.size()); + const jint length = std::min(jval_len, pinnable_value_len); - env->SetByteArrayRegion( - jval, jval_off, length, - const_cast(reinterpret_cast(cvalue.c_str()))); + env->SetByteArrayRegion(jval, jval_off, length, + const_cast(reinterpret_cast( + pinnable_value.data()))); + pinnable_value.Reset(); if (env->ExceptionCheck()) { // exception thrown: OutOfMemoryError *has_exception = true; @@ -1584,7 +1596,7 @@ jint rocksdb_get_helper( } *has_exception = false; - return cvalue_len; + return pinnable_value_len; } /* @@ -1592,10 +1604,11 @@ jint rocksdb_get_helper( * Method: get * Signature: (J[BII[BII)I */ -jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { +jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { bool has_exception = false; return rocksdb_get_helper( env, reinterpret_cast(jdb_handle), @@ -1608,11 +1621,12 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII( * Method: get * 
Signature: (J[BII[BIIJ)I */ -jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, - jlong jcf_handle) { +jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len, + jlong jcf_handle) { auto* db_handle = reinterpret_cast(jdb_handle); auto* cf_handle = reinterpret_cast(jcf_handle); @@ -1635,10 +1649,12 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ( * Method: get * Signature: (JJ[BII[BII)I */ -jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII( - JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len) { +jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jropt_handle, + jbyteArray jkey, jint jkey_off, + jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { bool has_exception = false; return rocksdb_get_helper( env, reinterpret_cast(jdb_handle), @@ -1652,9 +1668,8 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII( * Signature: (JJ[BII[BIIJ)I */ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( - JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, + JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, + jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { auto* db_handle = reinterpret_cast(jdb_handle); auto& ro_opt = @@ -1663,10 +1678,9 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { bool has_exception = false; - return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, - jkey, jkey_off, jkey_len, - jval, jval_off, jval_len, - &has_exception); + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off, + jkey_len, jval, jval_off, jval_len, + &has_exception); } else { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( env, ROCKSDB_NAMESPACE::Status::InvalidArgument( @@ -1823,6 +1837,10 @@ inline bool keys_from_bytebuffers(JNIEnv* env, jobject jkey = env->GetObjectArrayElement(jkeys, i); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException + // cleanup jkey_off and jkey_len + env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); + env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); + return false; } char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); @@ -1831,6 +1849,11 @@ inline bool keys_from_bytebuffers(JNIEnv* env, env->DeleteLocalRef(jkey); } + + // cleanup jkey_off and jkey_len + env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); + env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); + return true; } @@ -2198,17 +2221,14 @@ bool key_may_exist_direct_helper(JNIEnv* env, jlong jdb_handle, */ jboolean Java_org_rocksdb_RocksDB_keyMayExist( JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jlong jread_opts_handle, - jbyteArray jkey, jint jkey_offset, jint jkey_len) { - + jlong jread_opts_handle, jbyteArray jkey, jint jkey_offset, jint jkey_len) { bool has_exception = false; std::string value; bool value_found = false; const bool exists = key_may_exist_helper( - env, jdb_handle, jcf_handle, jread_opts_handle, - jkey, jkey_offset, 
jkey_len, - &has_exception, &value, &value_found); + env, jdb_handle, jcf_handle, jread_opts_handle, jkey, jkey_offset, + jkey_len, &has_exception, &value, &value_found); if (has_exception) { // java exception already raised @@ -2406,8 +2426,7 @@ jobjectArray Java_org_rocksdb_RocksDB_keyMayExistFoundValue( * Method: iterator * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_iterator__J( - JNIEnv*, jobject, jlong db_handle) { +jlong Java_org_rocksdb_RocksDB_iterator__J(JNIEnv*, jobject, jlong db_handle) { auto* db = reinterpret_cast(db_handle); return rocksdb_iterator_helper(db, ROCKSDB_NAMESPACE::ReadOptions(), nullptr); } @@ -2417,8 +2436,8 @@ jlong Java_org_rocksdb_RocksDB_iterator__J( * Method: iterator * Signature: (JJ)J */ -jlong Java_org_rocksdb_RocksDB_iterator__JJ( - JNIEnv*, jobject, jlong db_handle, jlong jread_options_handle) { +jlong Java_org_rocksdb_RocksDB_iterator__JJ(JNIEnv*, jobject, jlong db_handle, + jlong jread_options_handle) { auto* db = reinterpret_cast(db_handle); auto& read_options = *reinterpret_cast(jread_options_handle); @@ -2430,8 +2449,8 @@ jlong Java_org_rocksdb_RocksDB_iterator__JJ( * Method: iteratorCF * Signature: (JJ)J */ -jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ( - JNIEnv*, jobject, jlong db_handle, jlong jcf_handle) { +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ(JNIEnv*, jobject, jlong db_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(db_handle); auto* cf_handle = reinterpret_cast(jcf_handle); @@ -2444,9 +2463,10 @@ jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ( * Method: iteratorCF * Signature: (JJJ)J */ -jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ( - JNIEnv*, jobject, - jlong db_handle, jlong jcf_handle, jlong jread_options_handle) { +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ(JNIEnv*, jobject, + jlong db_handle, + jlong jcf_handle, + jlong jread_options_handle) { auto* db = reinterpret_cast(db_handle); auto* cf_handle = reinterpret_cast(jcf_handle); @@ -2460,10 +2480,10 @@ jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ( * Method: iterators * Signature: (J[JJ)[J */ -jlongArray Java_org_rocksdb_RocksDB_iterators( - JNIEnv* env, jobject, jlong db_handle, - jlongArray jcolumn_family_handles, - jlong jread_options_handle) { +jlongArray Java_org_rocksdb_RocksDB_iterators(JNIEnv* env, jobject, + jlong db_handle, + jlongArray jcolumn_family_handles, + jlong jread_options_handle) { auto* db = reinterpret_cast(db_handle); auto& read_options = *reinterpret_cast(jread_options_handle); @@ -2520,8 +2540,7 @@ jlongArray Java_org_rocksdb_RocksDB_iterators( * Method: getSnapshot * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_getSnapshot( - JNIEnv*, jobject, jlong db_handle) { +jlong Java_org_rocksdb_RocksDB_getSnapshot(JNIEnv*, jobject, jlong db_handle) { auto* db = reinterpret_cast(db_handle); const ROCKSDB_NAMESPACE::Snapshot* snapshot = db->GetSnapshot(); return GET_CPLUSPLUS_POINTER(snapshot); @@ -2531,9 +2550,8 @@ jlong Java_org_rocksdb_RocksDB_getSnapshot( * Method: releaseSnapshot * Signature: (JJ)V */ -void Java_org_rocksdb_RocksDB_releaseSnapshot( - JNIEnv*, jobject, jlong db_handle, - jlong snapshot_handle) { +void Java_org_rocksdb_RocksDB_releaseSnapshot(JNIEnv*, jobject, jlong db_handle, + jlong snapshot_handle) { auto* db = reinterpret_cast(db_handle); auto* snapshot = reinterpret_cast(snapshot_handle); @@ -2545,9 +2563,10 @@ void Java_org_rocksdb_RocksDB_releaseSnapshot( * Method: getProperty * Signature: (JJLjava/lang/String;I)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getProperty( - JNIEnv* env, jobject, 
jlong jdb_handle, jlong jcf_handle, - jstring jproperty, jint jproperty_len) { +jstring Java_org_rocksdb_RocksDB_getProperty(JNIEnv* env, jobject, + jlong jdb_handle, jlong jcf_handle, + jstring jproperty, + jint jproperty_len) { const char* property = env->GetStringUTFChars(jproperty, nullptr); if (property == nullptr) { // exception thrown: OutOfMemoryError @@ -2582,10 +2601,12 @@ jstring Java_org_rocksdb_RocksDB_getProperty( * Method: getMapProperty * Signature: (JJLjava/lang/String;I)Ljava/util/Map; */ -jobject Java_org_rocksdb_RocksDB_getMapProperty( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jstring jproperty, jint jproperty_len) { - const char* property = env->GetStringUTFChars(jproperty, nullptr); +jobject Java_org_rocksdb_RocksDB_getMapProperty(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle, + jstring jproperty, + jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); if (property == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -2619,9 +2640,11 @@ jobject Java_org_rocksdb_RocksDB_getMapProperty( * Method: getLongProperty * Signature: (JJLjava/lang/String;I)J */ -jlong Java_org_rocksdb_RocksDB_getLongProperty( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jstring jproperty, jint jproperty_len) { +jlong Java_org_rocksdb_RocksDB_getLongProperty(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle, + jstring jproperty, + jint jproperty_len) { const char* property = env->GetStringUTFChars(jproperty, nullptr); if (property == nullptr) { // exception thrown: OutOfMemoryError @@ -2656,8 +2679,7 @@ jlong Java_org_rocksdb_RocksDB_getLongProperty( * Method: resetStats * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_resetStats( - JNIEnv *, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_resetStats(JNIEnv*, jobject, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); db->ResetStats(); } @@ -2667,9 +2689,10 @@ void Java_org_rocksdb_RocksDB_resetStats( * Method: getAggregatedLongProperty * Signature: (JLjava/lang/String;I)J */ -jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty( - JNIEnv* env, jobject, jlong db_handle, - jstring jproperty, jint jproperty_len) { +jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty(JNIEnv* env, jobject, + jlong db_handle, + jstring jproperty, + jint jproperty_len) { const char* property = env->GetStringUTFChars(jproperty, nullptr); if (property == nullptr) { return 0; @@ -2794,9 +2817,7 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats( db->GetApproximateMemTableStats(cf_handle, range, &count, &sizes); // prepare results - jlong results[2] = { - static_cast(count), - static_cast(sizes)}; + jlong results[2] = {static_cast(count), static_cast(sizes)}; jlongArray jsizes = env->NewLongArray(2); if (jsizes == nullptr) { @@ -2819,12 +2840,12 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats( * Method: compactRange * Signature: (J[BI[BIJJ)V */ -void Java_org_rocksdb_RocksDB_compactRange( - JNIEnv* env, jobject, jlong jdb_handle, - jbyteArray jbegin, jint jbegin_len, - jbyteArray jend, jint jend_len, - jlong jcompact_range_opts_handle, - jlong jcf_handle) { +void Java_org_rocksdb_RocksDB_compactRange(JNIEnv* env, jobject, + jlong jdb_handle, jbyteArray jbegin, + jint jbegin_len, jbyteArray jend, + jint jend_len, + jlong jcompact_range_opts_handle, + jlong jcf_handle) { jboolean has_exception = JNI_FALSE; std::string str_begin; @@ -2892,9 +2913,9 @@ void 
Java_org_rocksdb_RocksDB_compactRange( * Method: setOptions * Signature: (JJ[Ljava/lang/String;[Ljava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_setOptions( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, - jobjectArray jkeys, jobjectArray jvalues) { +void Java_org_rocksdb_RocksDB_setOptions(JNIEnv* env, jobject, jlong jdb_handle, + jlong jcf_handle, jobjectArray jkeys, + jobjectArray jvalues) { const jsize len = env->GetArrayLength(jkeys); assert(len == env->GetArrayLength(jvalues)); @@ -2955,14 +2976,14 @@ void Java_org_rocksdb_RocksDB_setOptions( * Method: setDBOptions * Signature: (J[Ljava/lang/String;[Ljava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_setDBOptions( - JNIEnv* env, jobject, jlong jdb_handle, - jobjectArray jkeys, jobjectArray jvalues) { +void Java_org_rocksdb_RocksDB_setDBOptions(JNIEnv* env, jobject, + jlong jdb_handle, jobjectArray jkeys, + jobjectArray jvalues) { const jsize len = env->GetArrayLength(jkeys); assert(len == env->GetArrayLength(jvalues)); std::unordered_map options_map; - for (jsize i = 0; i < len; i++) { + for (jsize i = 0; i < len; i++) { jobject jobj_key = env->GetObjectArrayElement(jkeys, i); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException @@ -3096,8 +3117,9 @@ jobjectArray Java_org_rocksdb_RocksDB_compactFiles( std::vector output_file_names; auto s = db->CompactFiles(*compaction_opts, cf_handle, input_file_names, - static_cast(joutput_level), static_cast(joutput_path_id), - &output_file_names, compaction_job_info); + static_cast(joutput_level), + static_cast(joutput_path_id), + &output_file_names, compaction_job_info); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); return nullptr; @@ -3111,10 +3133,11 @@ jobjectArray Java_org_rocksdb_RocksDB_compactFiles( * Method: cancelAllBackgroundWork * Signature: (JZ)V */ -void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork( - JNIEnv*, jobject, jlong jdb_handle, jboolean jwait) { - auto* db = reinterpret_cast(jdb_handle); - ROCKSDB_NAMESPACE::CancelAllBackgroundWork(db, jwait); +void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork(JNIEnv*, jobject, + jlong jdb_handle, + jboolean jwait) { + auto* db = reinterpret_cast(jdb_handle); + ROCKSDB_NAMESPACE::CancelAllBackgroundWork(db, jwait); } /* @@ -3122,8 +3145,8 @@ void Java_org_rocksdb_RocksDB_cancelAllBackgroundWork( * Method: pauseBackgroundWork * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_pauseBackgroundWork( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_pauseBackgroundWork(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->PauseBackgroundWork(); if (!s.ok()) { @@ -3136,8 +3159,8 @@ void Java_org_rocksdb_RocksDB_pauseBackgroundWork( * Method: continueBackgroundWork * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_continueBackgroundWork( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->ContinueBackgroundWork(); if (!s.ok()) { @@ -3150,8 +3173,9 @@ void Java_org_rocksdb_RocksDB_continueBackgroundWork( * Method: enableAutoCompaction * Signature: (J[J)V */ -void Java_org_rocksdb_RocksDB_enableAutoCompaction( - JNIEnv* env, jobject, jlong jdb_handle, jlongArray jcf_handles) { +void Java_org_rocksdb_RocksDB_enableAutoCompaction(JNIEnv* env, jobject, + jlong jdb_handle, + jlongArray jcf_handles) { auto* db = reinterpret_cast(jdb_handle); jboolean 
has_exception = JNI_FALSE; const std::vector cf_handles = @@ -3170,8 +3194,8 @@ void Java_org_rocksdb_RocksDB_enableAutoCompaction( * Method: numberLevels * Signature: (JJ)I */ -jint Java_org_rocksdb_RocksDB_numberLevels( - JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle) { +jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jobject, jlong jdb_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3188,8 +3212,9 @@ jint Java_org_rocksdb_RocksDB_numberLevels( * Method: maxMemCompactionLevel * Signature: (JJ)I */ -jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel( - JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle) { +jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jobject, + jlong jdb_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3206,8 +3231,9 @@ jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel( * Method: level0StopWriteTrigger * Signature: (JJ)I */ -jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger( - JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle) { +jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger(JNIEnv*, jobject, + jlong jdb_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3224,8 +3250,8 @@ jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger( * Method: getName * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_RocksDB_getName( - JNIEnv* env, jobject, jlong jdb_handle) { +jstring Java_org_rocksdb_RocksDB_getName(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); std::string name = db->GetName(); return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, false); @@ -3236,8 +3262,7 @@ jstring Java_org_rocksdb_RocksDB_getName( * Method: getEnv * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_getEnv( - JNIEnv*, jobject, jlong jdb_handle) { +jlong Java_org_rocksdb_RocksDB_getEnv(JNIEnv*, jobject, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); return GET_CPLUSPLUS_POINTER(db->GetEnv()); } @@ -3247,24 +3272,24 @@ jlong Java_org_rocksdb_RocksDB_getEnv( * Method: flush * Signature: (JJ[J)V */ -void Java_org_rocksdb_RocksDB_flush( - JNIEnv* env, jobject, jlong jdb_handle, jlong jflush_opts_handle, - jlongArray jcf_handles) { +void Java_org_rocksdb_RocksDB_flush(JNIEnv* env, jobject, jlong jdb_handle, + jlong jflush_opts_handle, + jlongArray jcf_handles) { auto* db = reinterpret_cast(jdb_handle); auto* flush_opts = reinterpret_cast(jflush_opts_handle); std::vector cf_handles; if (jcf_handles == nullptr) { - cf_handles.push_back(db->DefaultColumnFamily()); + cf_handles.push_back(db->DefaultColumnFamily()); } else { - jboolean has_exception = JNI_FALSE; - cf_handles = ROCKSDB_NAMESPACE::JniUtil::fromJPointers< - ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, jcf_handles, - &has_exception); - if (has_exception) { - // exception occurred - return; - } + jboolean has_exception = JNI_FALSE; + cf_handles = ROCKSDB_NAMESPACE::JniUtil::fromJPointers< + ROCKSDB_NAMESPACE::ColumnFamilyHandle>(env, jcf_handles, + &has_exception); + if (has_exception) { + // exception occurred + return; + } } auto s = db->Flush(*flush_opts, cf_handles); if (!s.ok()) { @@ -3277,8 +3302,8 @@ void Java_org_rocksdb_RocksDB_flush( * Method: flushWal * Signature: (JZ)V */ -void Java_org_rocksdb_RocksDB_flushWal( - JNIEnv* env, jobject, 
jlong jdb_handle, jboolean jsync) { +void Java_org_rocksdb_RocksDB_flushWal(JNIEnv* env, jobject, jlong jdb_handle, + jboolean jsync) { auto* db = reinterpret_cast(jdb_handle); auto s = db->FlushWAL(jsync == JNI_TRUE); if (!s.ok()) { @@ -3291,8 +3316,7 @@ void Java_org_rocksdb_RocksDB_flushWal( * Method: syncWal * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_syncWal( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_syncWal(JNIEnv* env, jobject, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->SyncWAL(); if (!s.ok()) { @@ -3305,8 +3329,8 @@ void Java_org_rocksdb_RocksDB_syncWal( * Method: getLatestSequenceNumber * Signature: (J)V */ -jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber( - JNIEnv*, jobject, jlong jdb_handle) { +jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv*, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); return db->GetLatestSequenceNumber(); } @@ -3316,8 +3340,8 @@ jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber( * Method: disableFileDeletions * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_disableFileDeletions( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::Status s = db->DisableFileDeletions(); if (!s.ok()) { @@ -3330,8 +3354,9 @@ void Java_org_rocksdb_RocksDB_disableFileDeletions( * Method: enableFileDeletions * Signature: (JZ)V */ -void Java_org_rocksdb_RocksDB_enableFileDeletions( - JNIEnv* env, jobject, jlong jdb_handle, jboolean jforce) { +void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, jobject, + jlong jdb_handle, + jboolean jforce) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::Status s = db->EnableFileDeletions(jforce); if (!s.ok()) { @@ -3344,13 +3369,14 @@ void Java_org_rocksdb_RocksDB_enableFileDeletions( * Method: getLiveFiles * Signature: (JZ)[Ljava/lang/String; */ -jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles( - JNIEnv* env, jobject, jlong jdb_handle, jboolean jflush_memtable) { +jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles(JNIEnv* env, jobject, + jlong jdb_handle, + jboolean jflush_memtable) { auto* db = reinterpret_cast(jdb_handle); std::vector live_files; uint64_t manifest_file_size = 0; - auto s = db->GetLiveFiles( - live_files, &manifest_file_size, jflush_memtable == JNI_TRUE); + auto s = db->GetLiveFiles(live_files, &manifest_file_size, + jflush_memtable == JNI_TRUE); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); return nullptr; @@ -3368,8 +3394,8 @@ jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles( * Method: getSortedWalFiles * Signature: (J)[Lorg/rocksdb/LogFile; */ -jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles( - JNIEnv* env, jobject, jlong jdb_handle) { +jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); std::vector> sorted_wal_files; auto s = db->GetSortedWalFiles(sorted_wal_files); @@ -3382,7 +3408,7 @@ jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles( const jsize jlen = static_cast(sorted_wal_files.size()); jobjectArray jsorted_wal_files = env->NewObjectArray( jlen, ROCKSDB_NAMESPACE::LogFileJni::getJClass(env), nullptr); - if(jsorted_wal_files == nullptr) { + if (jsorted_wal_files == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } @@ -3416,8 +3442,9 @@ jobjectArray 
Java_org_rocksdb_RocksDB_getSortedWalFiles( * Method: getUpdatesSince * Signature: (JJ)J */ -jlong Java_org_rocksdb_RocksDB_getUpdatesSince( - JNIEnv* env, jobject, jlong jdb_handle, jlong jsequence_number) { +jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jsequence_number) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::SequenceNumber sequence_number = static_cast(jsequence_number); @@ -3436,8 +3463,8 @@ jlong Java_org_rocksdb_RocksDB_getUpdatesSince( * Method: deleteFile * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_deleteFile( - JNIEnv* env, jobject, jlong jdb_handle, jstring jname) { +void Java_org_rocksdb_RocksDB_deleteFile(JNIEnv* env, jobject, jlong jdb_handle, + jstring jname) { auto* db = reinterpret_cast(jdb_handle); jboolean has_exception = JNI_FALSE; std::string name = @@ -3454,8 +3481,8 @@ void Java_org_rocksdb_RocksDB_deleteFile( * Method: getLiveFilesMetaData * Signature: (J)[Lorg/rocksdb/LiveFileMetaData; */ -jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData( - JNIEnv* env, jobject, jlong jdb_handle) { +jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); std::vector live_files_meta_data; db->GetLiveFilesMetaData(&live_files_meta_data); @@ -3464,13 +3491,14 @@ jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData( const jsize jlen = static_cast(live_files_meta_data.size()); jobjectArray jlive_files_meta_data = env->NewObjectArray( jlen, ROCKSDB_NAMESPACE::LiveFileMetaDataJni::getJClass(env), nullptr); - if(jlive_files_meta_data == nullptr) { + if (jlive_files_meta_data == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } jsize i = 0; - for (auto it = live_files_meta_data.begin(); it != live_files_meta_data.end(); ++it) { + for (auto it = live_files_meta_data.begin(); it != live_files_meta_data.end(); + ++it) { jobject jlive_file_meta_data = ROCKSDB_NAMESPACE::LiveFileMetaDataJni::fromCppLiveFileMetaData(env, &(*it)); @@ -3480,7 +3508,8 @@ jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData( return nullptr; } - env->SetObjectArrayElement(jlive_files_meta_data, i++, jlive_file_meta_data); + env->SetObjectArrayElement(jlive_files_meta_data, i++, + jlive_file_meta_data); if (env->ExceptionCheck()) { // exception occurred env->DeleteLocalRef(jlive_file_meta_data); @@ -3499,8 +3528,9 @@ jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData( * Method: getColumnFamilyMetaData * Signature: (JJ)Lorg/rocksdb/ColumnFamilyMetaData; */ -jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle) { +jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3550,8 +3580,8 @@ void Java_org_rocksdb_RocksDB_ingestExternalFile( * Method: verifyChecksum * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_verifyChecksum( - JNIEnv* env, jobject, jlong jdb_handle) { +void Java_org_rocksdb_RocksDB_verifyChecksum(JNIEnv* env, jobject, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); auto s = db->VerifyChecksum(); if (!s.ok()) { @@ -3564,8 +3594,8 @@ void Java_org_rocksdb_RocksDB_verifyChecksum( * Method: getDefaultColumnFamily * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily( - JNIEnv*, jobject, jlong 
jdb_handle) { +jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(JNIEnv*, jobject, + jlong jdb_handle) { auto* db_handle = reinterpret_cast(jdb_handle); auto* cf_handle = db_handle->DefaultColumnFamily(); return GET_CPLUSPLUS_POINTER(cf_handle); @@ -3576,8 +3606,9 @@ jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily( * Method: getPropertiesOfAllTables * Signature: (JJ)Ljava/util/Map; */ -jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle) { +jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3587,8 +3618,8 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables( reinterpret_cast(jcf_handle); } ROCKSDB_NAMESPACE::TablePropertiesCollection table_properties_collection; - auto s = db->GetPropertiesOfAllTables(cf_handle, - &table_properties_collection); + auto s = + db->GetPropertiesOfAllTables(cf_handle, &table_properties_collection); if (!s.ok()) { ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } @@ -3678,17 +3709,19 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange( } ROCKSDB_NAMESPACE::TablePropertiesCollection table_properties_collection; - auto s = db->GetPropertiesOfTablesInRange( - cf_handle, ranges.get(), ranges_len, &table_properties_collection); + auto s = db->GetPropertiesOfTablesInRange(cf_handle, ranges.get(), ranges_len, + &table_properties_collection); if (!s.ok()) { // error occurred - env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle, JNI_ABORT); + env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle, + JNI_ABORT); ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); return nullptr; } // cleanup - env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle, JNI_ABORT); + env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle, + JNI_ABORT); return jrange_slice_handles; } @@ -3698,8 +3731,9 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange( * Method: suggestCompactRange * Signature: (JJ)[J */ -jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange( - JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle) { +jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange(JNIEnv* env, jobject, + jlong jdb_handle, + jlong jcf_handle) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3747,8 +3781,8 @@ jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange( * Method: promoteL0 * Signature: (JJI)V */ -void Java_org_rocksdb_RocksDB_promoteL0( - JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle, jint jtarget_level) { +void Java_org_rocksdb_RocksDB_promoteL0(JNIEnv*, jobject, jlong jdb_handle, + jlong jcf_handle, jint jtarget_level) { auto* db = reinterpret_cast(jdb_handle); ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; if (jcf_handle == 0) { @@ -3815,8 +3849,8 @@ void Java_org_rocksdb_RocksDB_tryCatchUpWithPrimary(JNIEnv* env, jobject, * Method: destroyDB * Signature: (Ljava/lang/String;J)V */ -void Java_org_rocksdb_RocksDB_destroyDB( - JNIEnv* env, jclass, jstring jdb_path, jlong joptions_handle) { +void Java_org_rocksdb_RocksDB_destroyDB(JNIEnv* env, jclass, jstring jdb_path, + jlong joptions_handle) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { // exception thrown: OutOfMemoryError diff 
--git a/java/rocksjni/sst_file_reader_iterator.cc b/java/rocksjni/sst_file_reader_iterator.cc index ac92285b6cb..68fa4c37c8b 100644 --- a/java/rocksjni/sst_file_reader_iterator.cc +++ b/java/rocksjni/sst_file_reader_iterator.cc @@ -176,7 +176,8 @@ jbyteArray Java_org_rocksdb_SstFileReaderIterator_key0(JNIEnv* env, * Method: value0 * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, jobject /*jobj*/, +jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, + jobject /*jobj*/, jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Slice value_slice = it->value(); @@ -186,10 +187,11 @@ jbyteArray Java_org_rocksdb_SstFileReaderIterator_value0(JNIEnv* env, jobject /* if (jkeyValue == nullptr) { // exception thrown: OutOfMemoryError return nullptr; - } - env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), - const_cast(reinterpret_cast(value_slice.data()))); - return jkeyValue; + } + env->SetByteArrayRegion( + jkeyValue, 0, static_cast(value_slice.size()), + const_cast(reinterpret_cast(value_slice.data()))); + return jkeyValue; } /* @@ -357,8 +359,9 @@ void Java_org_rocksdb_SstFileReaderIterator_seekForPrevByteArray0( * Method: refresh0 * Signature: (J)V */ -void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env, jobject /*jobj*/, - jlong handle) { +void Java_org_rocksdb_SstFileReaderIterator_refresh0(JNIEnv* env, + jobject /*jobj*/, + jlong handle) { auto* it = reinterpret_cast(handle); ROCKSDB_NAMESPACE::Status s = it->Refresh(); diff --git a/java/rocksjni/statistics.cc b/java/rocksjni/statistics.cc index 6672cbdcfb9..bd405afa119 100644 --- a/java/rocksjni/statistics.cc +++ b/java/rocksjni/statistics.cc @@ -23,10 +23,8 @@ * Method: newStatistics * Signature: ()J */ -jlong Java_org_rocksdb_Statistics_newStatistics__( - JNIEnv* env, jclass jcls) { - return Java_org_rocksdb_Statistics_newStatistics___3BJ( - env, jcls, nullptr, 0); +jlong Java_org_rocksdb_Statistics_newStatistics__(JNIEnv* env, jclass jcls) { + return Java_org_rocksdb_Statistics_newStatistics___3BJ(env, jcls, nullptr, 0); } /* @@ -45,10 +43,10 @@ jlong Java_org_rocksdb_Statistics_newStatistics__J( * Method: newStatistics * Signature: ([B)J */ -jlong Java_org_rocksdb_Statistics_newStatistics___3B( - JNIEnv* env, jclass jcls, jbyteArray jhistograms) { - return Java_org_rocksdb_Statistics_newStatistics___3BJ( - env, jcls, jhistograms, 0); +jlong Java_org_rocksdb_Statistics_newStatistics___3B(JNIEnv* env, jclass jcls, + jbyteArray jhistograms) { + return Java_org_rocksdb_Statistics_newStatistics___3BJ(env, jcls, jhistograms, + 0); } /* @@ -57,7 +55,8 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3B( * Signature: ([BJ)J */ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( - JNIEnv* env, jclass, jbyteArray jhistograms, jlong jother_statistics_handle) { + JNIEnv* env, jclass, jbyteArray jhistograms, + jlong jother_statistics_handle) { std::shared_ptr* pSptr_other_statistics = nullptr; if (jother_statistics_handle > 0) { @@ -105,8 +104,8 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_Statistics_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_Statistics_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { if (jhandle > 0) { auto* pSptr_statistics = reinterpret_cast*>( @@ -120,8 +119,7 @@ void Java_org_rocksdb_Statistics_disposeInternal( * Method: statsLevel * Signature: (J)B */ -jbyte 
Java_org_rocksdb_Statistics_statsLevel( - JNIEnv*, jobject, jlong jhandle) { +jbyte Java_org_rocksdb_Statistics_statsLevel(JNIEnv*, jobject, jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -135,8 +133,8 @@ jbyte Java_org_rocksdb_Statistics_statsLevel( * Method: setStatsLevel * Signature: (JB)V */ -void Java_org_rocksdb_Statistics_setStatsLevel( - JNIEnv*, jobject, jlong jhandle, jbyte jstats_level) { +void Java_org_rocksdb_Statistics_setStatsLevel(JNIEnv*, jobject, jlong jhandle, + jbyte jstats_level) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -151,8 +149,9 @@ void Java_org_rocksdb_Statistics_setStatsLevel( * Method: getTickerCount * Signature: (JB)J */ -jlong Java_org_rocksdb_Statistics_getTickerCount( - JNIEnv*, jobject, jlong jhandle, jbyte jticker_type) { +jlong Java_org_rocksdb_Statistics_getTickerCount(JNIEnv*, jobject, + jlong jhandle, + jbyte jticker_type) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -167,8 +166,9 @@ jlong Java_org_rocksdb_Statistics_getTickerCount( * Method: getAndResetTickerCount * Signature: (JB)J */ -jlong Java_org_rocksdb_Statistics_getAndResetTickerCount( - JNIEnv*, jobject, jlong jhandle, jbyte jticker_type) { +jlong Java_org_rocksdb_Statistics_getAndResetTickerCount(JNIEnv*, jobject, + jlong jhandle, + jbyte jticker_type) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -182,8 +182,9 @@ jlong Java_org_rocksdb_Statistics_getAndResetTickerCount( * Method: getHistogramData * Signature: (JB)Lorg/rocksdb/HistogramData; */ -jobject Java_org_rocksdb_Statistics_getHistogramData( - JNIEnv* env, jobject, jlong jhandle, jbyte jhistogram_type) { +jobject Java_org_rocksdb_Statistics_getHistogramData(JNIEnv* env, jobject, + jlong jhandle, + jbyte jhistogram_type) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -213,8 +214,8 @@ jobject Java_org_rocksdb_Statistics_getHistogramData( return env->NewObject(jclazz, mid, data.median, data.percentile95, data.percentile99, data.average, - data.standard_deviation, data.max, data.count, - data.sum, data.min); + data.standard_deviation, data.max, data.count, data.sum, + data.min); } /* @@ -222,8 +223,9 @@ jobject Java_org_rocksdb_Statistics_getHistogramData( * Method: getHistogramString * Signature: (JB)Ljava/lang/String; */ -jstring Java_org_rocksdb_Statistics_getHistogramString( - JNIEnv* env, jobject, jlong jhandle, jbyte jhistogram_type) { +jstring Java_org_rocksdb_Statistics_getHistogramString(JNIEnv* env, jobject, + jlong jhandle, + jbyte jhistogram_type) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -239,8 +241,7 @@ jstring Java_org_rocksdb_Statistics_getHistogramString( * Method: reset * Signature: (J)V */ -void Java_org_rocksdb_Statistics_reset( - JNIEnv* env, jobject, jlong jhandle) { +void Java_org_rocksdb_Statistics_reset(JNIEnv* env, jobject, jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); @@ -256,8 +257,8 @@ void Java_org_rocksdb_Statistics_reset( * Method: toString * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_Statistics_toString( - JNIEnv* env, jobject, jlong jhandle) { +jstring Java_org_rocksdb_Statistics_toString(JNIEnv* env, jobject, + jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/statisticsjni.h b/java/rocksjni/statisticsjni.h index db3985582d4..ce823f9b128 100644 --- a/java/rocksjni/statisticsjni.h +++ b/java/rocksjni/statisticsjni.h @@ -12,8 +12,9 @@ #include #include #include -#include "rocksdb/statistics.h" + #include 
"monitoring/statistics.h" +#include "rocksdb/statistics.h" namespace ROCKSDB_NAMESPACE { @@ -26,8 +27,8 @@ class StatisticsJni : public StatisticsImpl { private: const std::set m_ignore_histograms; - }; +}; - } // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_STATISTICSJNI_H_ diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index f85905bc692..0054e5c1fb6 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -72,9 +72,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.cache_index_and_filter_blocks_with_high_priority = static_cast(jcache_index_and_filter_blocks_with_high_priority); options.pin_l0_filter_and_index_blocks_in_cache = - static_cast(jpin_l0_filter_and_index_blocks_in_cache); + static_cast(jpin_l0_filter_and_index_blocks_in_cache); options.pin_top_level_index_and_filter = - static_cast(jpin_top_level_index_and_filter); + static_cast(jpin_top_level_index_and_filter); options.index_type = ROCKSDB_NAMESPACE::IndexTypeJni::toCppIndexType(jindex_type_value); options.data_block_index_type = @@ -131,7 +131,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_size = static_cast(jblock_size); options.block_size_deviation = static_cast(jblock_size_deviation); options.block_restart_interval = static_cast(jblock_restart_interval); - options.index_block_restart_interval = static_cast(jindex_block_restart_interval); + options.index_block_restart_interval = + static_cast(jindex_block_restart_interval); options.metadata_block_size = static_cast(jmetadata_block_size); options.partition_filters = static_cast(jpartition_filters); options.optimize_filters_for_memory = @@ -145,9 +146,11 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( } options.whole_key_filtering = static_cast(jwhole_key_filtering); options.verify_compression = static_cast(jverify_compression); - options.read_amp_bytes_per_bit = static_cast(jread_amp_bytes_per_bit); + options.read_amp_bytes_per_bit = + static_cast(jread_amp_bytes_per_bit); options.format_version = static_cast(jformat_version); - options.enable_index_compression = static_cast(jenable_index_compression); + options.enable_index_compression = + static_cast(jenable_index_compression); options.block_align = static_cast(jblock_align); options.index_shortening = ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode( diff --git a/java/rocksjni/table_filter_jnicallback.cc b/java/rocksjni/table_filter_jnicallback.cc index d1699548d0e..5350c5ceee2 100644 --- a/java/rocksjni/table_filter_jnicallback.cc +++ b/java/rocksjni/table_filter_jnicallback.cc @@ -7,15 +7,15 @@ // ROCKSDB_NAMESPACE::TableFilter. 
#include "rocksjni/table_filter_jnicallback.h" + #include "rocksjni/portal.h" namespace ROCKSDB_NAMESPACE { -TableFilterJniCallback::TableFilterJniCallback( - JNIEnv* env, jobject jtable_filter) +TableFilterJniCallback::TableFilterJniCallback(JNIEnv* env, + jobject jtable_filter) : JniCallback(env, jtable_filter) { - m_jfilter_methodid = - AbstractTableFilterJni::getFilterMethod(env); - if(m_jfilter_methodid == nullptr) { + m_jfilter_methodid = AbstractTableFilterJni::getFilterMethod(env); + if (m_jfilter_methodid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } diff --git a/java/rocksjni/table_filter_jnicallback.h b/java/rocksjni/table_filter_jnicallback.h index b5804737aeb..0ef404ca22d 100644 --- a/java/rocksjni/table_filter_jnicallback.h +++ b/java/rocksjni/table_filter_jnicallback.h @@ -10,6 +10,7 @@ #define JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_ #include + #include #include @@ -20,15 +21,14 @@ namespace ROCKSDB_NAMESPACE { class TableFilterJniCallback : public JniCallback { public: - TableFilterJniCallback( - JNIEnv* env, jobject jtable_filter); - std::function - GetTableFilterFunction(); - - private: - jmethodID m_jfilter_methodid; - std::function - m_table_filter_function; + TableFilterJniCallback(JNIEnv* env, jobject jtable_filter); + std::function + GetTableFilterFunction(); + + private: + jmethodID m_jfilter_methodid; + std::function + m_table_filter_function; }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/thread_status.cc b/java/rocksjni/thread_status.cc index a5811ec177f..c600f6cd565 100644 --- a/java/rocksjni/thread_status.cc +++ b/java/rocksjni/thread_status.cc @@ -6,11 +6,12 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ ROCKSDB_NAMESPACE::ThreadStatus methods from Java side. 
+#include "rocksdb/thread_status.h" + #include -#include "portal.h" #include "include/org_rocksdb_ThreadStatus.h" -#include "rocksdb/thread_status.h" +#include "portal.h" /* * Class: org_rocksdb_ThreadStatus @@ -42,8 +43,8 @@ jstring Java_org_rocksdb_ThreadStatus_getOperationName( * Method: microsToStringNative * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_ThreadStatus_microsToStringNative( - JNIEnv* env, jclass, jlong jmicros) { +jstring Java_org_rocksdb_ThreadStatus_microsToStringNative(JNIEnv* env, jclass, + jlong jmicros) { auto str = ROCKSDB_NAMESPACE::ThreadStatus::MicrosToString( static_cast(jmicros)); return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &str, true); @@ -84,8 +85,7 @@ jstring Java_org_rocksdb_ThreadStatus_getOperationPropertyName( jobject Java_org_rocksdb_ThreadStatus_interpretOperationProperties( JNIEnv* env, jclass, jbyte joperation_type_value, jlongArray joperation_properties) { - - //convert joperation_properties + // convert joperation_properties const jsize len = env->GetArrayLength(joperation_properties); const std::unique_ptr op_properties(new uint64_t[len]); jlong* jop = env->GetLongArrayElements(joperation_properties, nullptr); @@ -117,8 +117,8 @@ jobject Java_org_rocksdb_ThreadStatus_interpretOperationProperties( * Method: getStateName * Signature: (B)Ljava/lang/String; */ -jstring Java_org_rocksdb_ThreadStatus_getStateName( - JNIEnv* env, jclass, jbyte jstate_type_value) { +jstring Java_org_rocksdb_ThreadStatus_getStateName(JNIEnv* env, jclass, + jbyte jstate_type_value) { auto name = ROCKSDB_NAMESPACE::ThreadStatus::GetStateName( ROCKSDB_NAMESPACE::StateTypeJni::toCppStateType(jstate_type_value)); return ROCKSDB_NAMESPACE::JniUtil::toJavaString(env, &name, true); diff --git a/java/rocksjni/trace_writer.cc b/java/rocksjni/trace_writer.cc index 0fc3afd9959..d5827639948 100644 --- a/java/rocksjni/trace_writer.cc +++ b/java/rocksjni/trace_writer.cc @@ -17,8 +17,8 @@ * Method: createNewTraceWriter * Signature: ()J */ -jlong Java_org_rocksdb_AbstractTraceWriter_createNewTraceWriter( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_AbstractTraceWriter_createNewTraceWriter(JNIEnv* env, + jobject jobj) { auto* trace_writer = new ROCKSDB_NAMESPACE::TraceWriterJniCallback(env, jobj); return GET_CPLUSPLUS_POINTER(trace_writer); } diff --git a/java/rocksjni/trace_writer_jnicallback.cc b/java/rocksjni/trace_writer_jnicallback.cc index b6566d335b2..d1ed32038d2 100644 --- a/java/rocksjni/trace_writer_jnicallback.cc +++ b/java/rocksjni/trace_writer_jnicallback.cc @@ -7,29 +7,29 @@ // ROCKSDB_NAMESPACE::TraceWriter. 
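A convention worth calling out before the trace-writer hunks: rather than constructing a Java `Status` object per callback, the Java proxy methods return a single `jshort` with the status code in the high byte and the sub-code in the low byte, which the C++ side then unpacks. A hedged sketch of both directions of that packing (the helper names are illustrative):

```cpp
#include <jni.h>

// Pack a status code and sub-code into one jshort, mirroring what the Java
// proxy methods return: code in the high byte, sub-code in the low byte.
inline jshort PackStatus(jbyte code, jbyte sub_code) {
  return static_cast<jshort>((code << 8) | (sub_code & 0xFF));
}

// Unpack on the C++ side, as TraceWriterJniCallback::Write()/Close() below
// do before handing the two bytes to StatusJni::toCppStatus.
inline void UnpackStatus(jshort jstatus, jbyte* code, jbyte* sub_code) {
  *code = static_cast<jbyte>((jstatus >> 8) & 0xFF);
  *sub_code = static_cast<jbyte>(jstatus & 0xFF);
}
```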
#include "rocksjni/trace_writer_jnicallback.h" + #include "rocksjni/portal.h" namespace ROCKSDB_NAMESPACE { -TraceWriterJniCallback::TraceWriterJniCallback( - JNIEnv* env, jobject jtrace_writer) +TraceWriterJniCallback::TraceWriterJniCallback(JNIEnv* env, + jobject jtrace_writer) : JniCallback(env, jtrace_writer) { - m_jwrite_proxy_methodid = - AbstractTraceWriterJni::getWriteProxyMethodId(env); - if(m_jwrite_proxy_methodid == nullptr) { + m_jwrite_proxy_methodid = AbstractTraceWriterJni::getWriteProxyMethodId(env); + if (m_jwrite_proxy_methodid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } m_jclose_writer_proxy_methodid = AbstractTraceWriterJni::getCloseWriterProxyMethodId(env); - if(m_jclose_writer_proxy_methodid == nullptr) { + if (m_jclose_writer_proxy_methodid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } m_jget_file_size_methodid = AbstractTraceWriterJni::getGetFileSizeMethodId(env); - if(m_jget_file_size_methodid == nullptr) { + if (m_jget_file_size_methodid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } @@ -42,21 +42,22 @@ Status TraceWriterJniCallback::Write(const Slice& data) { return Status::IOError("Unable to attach JNI Environment"); } - jshort jstatus = env->CallShortMethod(m_jcallback_obj, - m_jwrite_proxy_methodid, - &data); + jshort jstatus = + env->CallShortMethod(m_jcallback_obj, m_jwrite_proxy_methodid, &data); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown from CallShortMethod env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); - return Status::IOError("Unable to call AbstractTraceWriter#writeProxy(long)"); + return Status::IOError( + "Unable to call AbstractTraceWriter#writeProxy(long)"); } // unpack status code and status sub-code from jstatus jbyte jcode_value = (jstatus >> 8) & 0xFF; jbyte jsub_code_value = jstatus & 0xFF; - std::unique_ptr s = StatusJni::toCppStatus(jcode_value, jsub_code_value); + std::unique_ptr s = + StatusJni::toCppStatus(jcode_value, jsub_code_value); releaseJniEnv(attached_thread); @@ -70,20 +71,22 @@ Status TraceWriterJniCallback::Close() { return Status::IOError("Unable to attach JNI Environment"); } - jshort jstatus = env->CallShortMethod(m_jcallback_obj, - m_jclose_writer_proxy_methodid); + jshort jstatus = + env->CallShortMethod(m_jcallback_obj, m_jclose_writer_proxy_methodid); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown from CallShortMethod env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); - return Status::IOError("Unable to call AbstractTraceWriter#closeWriterProxy()"); + return Status::IOError( + "Unable to call AbstractTraceWriter#closeWriterProxy()"); } // unpack status code and status sub-code from jstatus jbyte code_value = (jstatus >> 8) & 0xFF; jbyte sub_code_value = jstatus & 0xFF; - std::unique_ptr s = StatusJni::toCppStatus(code_value, sub_code_value); + std::unique_ptr s = + StatusJni::toCppStatus(code_value, sub_code_value); releaseJniEnv(attached_thread); @@ -97,10 +100,10 @@ uint64_t TraceWriterJniCallback::GetFileSize() { return 0; } - jlong jfile_size = env->CallLongMethod(m_jcallback_obj, - m_jget_file_size_methodid); + jlong jfile_size = + env->CallLongMethod(m_jcallback_obj, m_jget_file_size_methodid); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown from CallLongMethod env->ExceptionDescribe(); // print out 
exception to stderr releaseJniEnv(attached_thread); diff --git a/java/rocksjni/trace_writer_jnicallback.h b/java/rocksjni/trace_writer_jnicallback.h index eb2a8b0f806..c82a3a72cf3 100644 --- a/java/rocksjni/trace_writer_jnicallback.h +++ b/java/rocksjni/trace_writer_jnicallback.h @@ -10,6 +10,7 @@ #define JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_ #include + #include #include "rocksdb/trace_reader_writer.h" @@ -19,16 +20,15 @@ namespace ROCKSDB_NAMESPACE { class TraceWriterJniCallback : public JniCallback, public TraceWriter { public: - TraceWriterJniCallback( - JNIEnv* env, jobject jtrace_writer); - virtual Status Write(const Slice& data); - virtual Status Close(); - virtual uint64_t GetFileSize(); + TraceWriterJniCallback(JNIEnv* env, jobject jtrace_writer); + virtual Status Write(const Slice& data); + virtual Status Close(); + virtual uint64_t GetFileSize(); private: - jmethodID m_jwrite_proxy_methodid; - jmethodID m_jclose_writer_proxy_methodid; - jmethodID m_jget_file_size_methodid; + jmethodID m_jwrite_proxy_methodid; + jmethodID m_jclose_writer_proxy_methodid; + jmethodID m_jget_file_size_methodid; }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc index 5c01436f163..1a0a64fc7f0 100644 --- a/java/rocksjni/transaction.cc +++ b/java/rocksjni/transaction.cc @@ -190,8 +190,9 @@ jbyteArray txn_get_helper(JNIEnv* env, const FnGet& fn_get, // exception thrown: OutOfMemoryError return nullptr; } - env->SetByteArrayRegion(jret_value, 0, static_cast(value.size()), - const_cast(reinterpret_cast(value.c_str()))); + env->SetByteArrayRegion( + jret_value, 0, static_cast(value.size()), + const_cast(reinterpret_cast(value.c_str()))); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException return nullptr; @@ -251,12 +252,6 @@ std::vector txn_column_families_helper( if (jcolumn_family_handles != nullptr) { const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); if (len_cols > 0) { - if (env->EnsureLocalCapacity(len_cols) != 0) { - // out of memory - *has_exception = JNI_TRUE; - return std::vector(); - } - jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); if (jcfh == nullptr) { // exception thrown: OutOfMemoryError @@ -292,47 +287,48 @@ void free_parts( } } +void free_key_values(std::vector& keys_to_free) { + for (auto& key : keys_to_free) { + delete[] key; + } +} + // TODO(AR) consider refactoring to share this between here and rocksjni.cc // cf multi get jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet& fn_multi_get, const jlong& jread_options_handle, const jobjectArray& jkey_parts) { const jsize len_key_parts = env->GetArrayLength(jkey_parts); - if (env->EnsureLocalCapacity(len_key_parts) != 0) { - // out of memory - return nullptr; - } std::vector key_parts; - std::vector> key_parts_to_free; + std::vector keys_to_free; for (int i = 0; i < len_key_parts; i++) { const jobject jk = env->GetObjectArrayElement(jkey_parts, i); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - free_parts(env, key_parts_to_free); + free_key_values(keys_to_free); return nullptr; } jbyteArray jk_ba = reinterpret_cast(jk); const jsize len_key = env->GetArrayLength(jk_ba); - if (env->EnsureLocalCapacity(len_key) != 0) { - // out of memory - env->DeleteLocalRef(jk); - free_parts(env, key_parts_to_free); - return nullptr; - } - jbyte* jk_val = env->GetByteArrayElements(jk_ba, nullptr); + jbyte* jk_val = new jbyte[len_key]; if (jk_val == nullptr) { // exception 
thrown: OutOfMemoryError env->DeleteLocalRef(jk); - free_parts(env, key_parts_to_free); + free_key_values(keys_to_free); + + jclass exception_cls = (env)->FindClass("java/lang/OutOfMemoryError"); + (env)->ThrowNew(exception_cls, + "Insufficient Memory for CF handle array."); return nullptr; } + env->GetByteArrayRegion(jk_ba, 0, len_key, jk_val); ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(jk_val), len_key); key_parts.push_back(key_slice); - - key_parts_to_free.push_back(std::make_tuple(jk_ba, jk_val, jk)); + keys_to_free.push_back(jk_val); + env->DeleteLocalRef(jk); } auto* read_options = @@ -342,7 +338,7 @@ jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet& fn_multi_get, fn_multi_get(*read_options, key_parts, &value_parts); // free up allocated byte arrays - free_parts(env, key_parts_to_free); + free_key_values(keys_to_free); // prepare the results const jclass jcls_ba = env->FindClass("[B"); @@ -366,7 +362,8 @@ jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet& fn_multi_get, env->SetByteArrayRegion( jentry_value, 0, static_cast<jsize>(value_parts[i].size()), - const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_parts[i].c_str()))); + const_cast<jbyte*>( + reinterpret_cast<const jbyte*>(value_parts[i].c_str()))); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); @@ -647,7 +644,7 @@ void txn_write_kv_parts_helper(JNIEnv* env, const jobjectArray& jvalue_parts, const jint& jvalue_parts_len) { #ifndef DEBUG - (void) jvalue_parts_len; + (void)jvalue_parts_len; #else assert(jkey_parts_len == jvalue_parts_len); #endif @@ -656,6 +653,20 @@ void txn_write_kv_parts_helper(JNIEnv* env, auto value_parts = std::vector<ROCKSDB_NAMESPACE::Slice>(); auto jparts_to_free = std::vector<std::tuple<jbyteArray, jbyte*, jobject>>(); + // Since this is fundamentally a gather write at the RocksDB level, + // it seems wrong to refactor it by copying (gathering) keys and data here, + // in order to avoid the local reference limit. + // The user needs to be aware that there is a limit to the number of parts + // which can be gathered. 
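The comment above marks the second of two strategies this patch uses against JNI's limited local-reference table. `txn_multi_get_helper` (earlier hunks) copies each key onto the C++ heap with `GetByteArrayRegion` and drops its local reference immediately, while `txn_write_kv_parts_helper` (next hunk) keeps the references alive and instead reserves capacity up front with `EnsureLocalCapacity`. A hedged sketch of the copy-and-release strategy, using `std::vector` buffers rather than the patch's raw `new jbyte[]` allocations:

```cpp
#include <jni.h>

#include <vector>

// Copy every byte[] element of a Java byte[][] onto the C++ heap, releasing
// each local reference as soon as its bytes are copied, so the number of
// live local references stays constant instead of growing with the input.
bool CopyKeysOut(JNIEnv* env, jobjectArray jkeys,
                 std::vector<std::vector<jbyte>>* out) {
  const jsize n = env->GetArrayLength(jkeys);
  for (jsize i = 0; i < n; ++i) {
    jobject jk = env->GetObjectArrayElement(jkeys, i);
    if (env->ExceptionCheck()) {
      return false;  // ArrayIndexOutOfBoundsException is pending
    }
    jbyteArray jk_ba = reinterpret_cast<jbyteArray>(jk);
    const jsize len = env->GetArrayLength(jk_ba);
    std::vector<jbyte> buf(len);
    // GetByteArrayRegion copies into our buffer; unlike GetByteArrayElements
    // there is nothing to release and no Java array left pinned.
    env->GetByteArrayRegion(jk_ba, 0, len, buf.data());
    env->DeleteLocalRef(jk);  // free the reference slot before iterating on
    out->push_back(std::move(buf));
  }
  return true;
}
```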
+ if (env->EnsureLocalCapacity(jkey_parts_len + jvalue_parts_len) != 0) { + // no space for all the jobjects we store up + env->ExceptionClear(); + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, "Insufficient JNI local references for " + + std::to_string(jkey_parts_len) + " key/value parts"); + return; + } + // convert java key_parts/value_parts byte[][] to Slice(s) for (jsize i = 0; i < jkey_parts_len; ++i) { const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i); @@ -674,13 +685,6 @@ void txn_write_kv_parts_helper(JNIEnv* env, const jbyteArray jba_key_part = reinterpret_cast(jobj_key_part); const jsize jkey_part_len = env->GetArrayLength(jba_key_part); - if (env->EnsureLocalCapacity(jkey_part_len) != 0) { - // out of memory - env->DeleteLocalRef(jobj_value_part); - env->DeleteLocalRef(jobj_key_part); - free_parts(env, jparts_to_free); - return; - } jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr); if (jkey_part == nullptr) { // exception thrown: OutOfMemoryError @@ -693,18 +697,9 @@ void txn_write_kv_parts_helper(JNIEnv* env, const jbyteArray jba_value_part = reinterpret_cast(jobj_value_part); const jsize jvalue_part_len = env->GetArrayLength(jba_value_part); - if (env->EnsureLocalCapacity(jvalue_part_len) != 0) { - // out of memory - env->DeleteLocalRef(jobj_value_part); - env->DeleteLocalRef(jobj_key_part); - env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); - free_parts(env, jparts_to_free); - return; - } jbyte* jvalue_part = env->GetByteArrayElements(jba_value_part, nullptr); if (jvalue_part == nullptr) { // exception thrown: OutOfMemoryError - env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); @@ -909,12 +904,6 @@ void txn_write_k_parts_helper(JNIEnv* env, const jbyteArray jba_key_part = reinterpret_cast(jobj_key_part); const jsize jkey_part_len = env->GetArrayLength(jba_key_part); - if (env->EnsureLocalCapacity(jkey_part_len) != 0) { - // out of memory - env->DeleteLocalRef(jobj_key_part); - free_parts(env, jkey_parts_to_free); - return; - } jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr); if (jkey_part == nullptr) { // exception thrown: OutOfMemoryError diff --git a/java/rocksjni/transaction_db.cc b/java/rocksjni/transaction_db.cc index 2066d6d7ea9..0adf856065e 100644 --- a/java/rocksjni/transaction_db.cc +++ b/java/rocksjni/transaction_db.cc @@ -26,8 +26,8 @@ * Signature: (JJLjava/lang/String;)J */ jlong Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2( - JNIEnv* env, jclass, jlong joptions_handle, - jlong jtxn_db_options_handle, jstring jdb_path) { + JNIEnv* env, jclass, jlong joptions_handle, jlong jtxn_db_options_handle, + jstring jdb_path) { auto* options = reinterpret_cast(joptions_handle); auto* txn_db_options = @@ -57,8 +57,8 @@ jlong Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2( * Signature: (JJLjava/lang/String;[[B[J)[J */ jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J( - JNIEnv* env, jclass, jlong jdb_options_handle, - jlong jtxn_db_options_handle, jstring jdb_path, jobjectArray jcolumn_names, + JNIEnv* env, jclass, jlong jdb_options_handle, jlong jtxn_db_options_handle, + jstring jdb_path, jobjectArray jcolumn_names, jlongArray jcolumn_options_handles) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { @@ -67,12 +67,6 @@ jlongArray 
Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J( } const jsize len_cols = env->GetArrayLength(jcolumn_names); - if (env->EnsureLocalCapacity(len_cols) != 0) { - // out of memory - env->ReleaseStringUTFChars(jdb_path, db_path); - return nullptr; - } - jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr); if (jco == nullptr) { // exception thrown: OutOfMemoryError @@ -99,14 +93,6 @@ jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J( } const int jcf_name_len = env->GetArrayLength(jcn_ba); - if (env->EnsureLocalCapacity(jcf_name_len) != 0) { - // out of memory - env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT); - env->DeleteLocalRef(jcn); - env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT); - env->ReleaseStringUTFChars(jdb_path, db_path); - return nullptr; - } const std::string cf_name(reinterpret_cast(jcf_name), jcf_name_len); const ROCKSDB_NAMESPACE::ColumnFamilyOptions* cf_options = reinterpret_cast(jco[i]); @@ -161,8 +147,8 @@ jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TransactionDB_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void Java_org_rocksdb_TransactionDB_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); assert(txn_db != nullptr); delete txn_db; @@ -173,8 +159,8 @@ void Java_org_rocksdb_TransactionDB_disposeInternal( * Method: closeDatabase * Signature: (J)V */ -void Java_org_rocksdb_TransactionDB_closeDatabase( - JNIEnv* env, jclass, jlong jhandle) { +void Java_org_rocksdb_TransactionDB_closeDatabase(JNIEnv* env, jclass, + jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); assert(txn_db != nullptr); ROCKSDB_NAMESPACE::Status s = txn_db->Close(); @@ -270,8 +256,9 @@ jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJJ( * Method: getTransactionByName * Signature: (JLjava/lang/String;)J */ -jlong Java_org_rocksdb_TransactionDB_getTransactionByName( - JNIEnv* env, jobject, jlong jhandle, jstring jname) { +jlong Java_org_rocksdb_TransactionDB_getTransactionByName(JNIEnv* env, jobject, + jlong jhandle, + jstring jname) { auto* txn_db = reinterpret_cast(jhandle); const char* name = env->GetStringUTFChars(jname, nullptr); if (name == nullptr) { @@ -323,8 +310,8 @@ jlongArray Java_org_rocksdb_TransactionDB_getAllPreparedTransactions( * Method: getLockStatusData * Signature: (J)Ljava/util/Map; */ -jobject Java_org_rocksdb_TransactionDB_getLockStatusData( - JNIEnv* env, jobject, jlong jhandle) { +jobject Java_org_rocksdb_TransactionDB_getLockStatusData(JNIEnv* env, jobject, + jlong jhandle) { auto* txn_db = reinterpret_cast(jhandle); const std::unordered_multimap lock_status_data = txn_db->GetLockStatusData(); diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc index aa57211eb6d..97c3bb30122 100644 --- a/java/rocksjni/transaction_log.cc +++ b/java/rocksjni/transaction_log.cc @@ -6,12 +6,13 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ ROCKSDB_NAMESPACE::Iterator methods from Java side. 
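The hunk below repeats an include reshuffle applied across the whole series: clang-format moves the header a file primarily implements to the top, isolated by a blank line, and keeps the remaining groups sorted. For a hypothetical bridge file (placeholder names only) the resulting layout looks like this:

```cpp
// foo.cc -- illustrative include layout after clang-format
#include "rocksdb/foo.h"  // the file's own (primary) header comes first

#include <jni.h>  // then system headers

#include "include/org_rocksdb_Foo.h"  // then project headers, sorted
#include "rocksjni/portal.h"
```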
+#include "rocksdb/transaction_log.h" + #include #include #include #include "include/org_rocksdb_TransactionLogIterator.h" -#include "rocksdb/transaction_log.h" #include "rocksjni/portal.h" /* diff --git a/java/rocksjni/transaction_notifier_jnicallback.cc b/java/rocksjni/transaction_notifier_jnicallback.cc index cd77c60ae30..26761cabda3 100644 --- a/java/rocksjni/transaction_notifier_jnicallback.cc +++ b/java/rocksjni/transaction_notifier_jnicallback.cc @@ -13,8 +13,9 @@ namespace ROCKSDB_NAMESPACE { -TransactionNotifierJniCallback::TransactionNotifierJniCallback(JNIEnv* env, - jobject jtransaction_notifier) : JniCallback(env, jtransaction_notifier) { +TransactionNotifierJniCallback::TransactionNotifierJniCallback( + JNIEnv* env, jobject jtransaction_notifier) + : JniCallback(env, jtransaction_notifier) { // we cache the method id for the JNI callback m_jsnapshot_created_methodID = AbstractTransactionNotifierJni::getSnapshotCreatedMethodId(env); @@ -29,7 +30,7 @@ void TransactionNotifierJniCallback::SnapshotCreated( env->CallVoidMethod(m_jcallback_obj, m_jsnapshot_created_methodID, GET_CPLUSPLUS_POINTER(newSnapshot)); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown from CallVoidMethod env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); diff --git a/java/rocksjni/transaction_notifier_jnicallback.h b/java/rocksjni/transaction_notifier_jnicallback.h index b3155b5a33e..089a5ee4a51 100644 --- a/java/rocksjni/transaction_notifier_jnicallback.h +++ b/java/rocksjni/transaction_notifier_jnicallback.h @@ -28,8 +28,8 @@ namespace ROCKSDB_NAMESPACE { * presented to the callback. This could be revisited in future * if performance is lacking. */ -class TransactionNotifierJniCallback: public JniCallback, - public TransactionNotifier { +class TransactionNotifierJniCallback : public JniCallback, + public TransactionNotifier { public: TransactionNotifierJniCallback(JNIEnv* env, jobject jtransaction_notifier); virtual void SnapshotCreated(const Snapshot* newSnapshot); diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 1b51954cfa4..1fe2083d994 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -25,9 +25,9 @@ * Method: open * Signature: (JLjava/lang/String;IZ)J */ -jlong Java_org_rocksdb_TtlDB_open( - JNIEnv* env, jclass, jlong joptions_handle, jstring jdb_path, jint jttl, - jboolean jread_only) { +jlong Java_org_rocksdb_TtlDB_open(JNIEnv* env, jclass, jlong joptions_handle, + jstring jdb_path, jint jttl, + jboolean jread_only) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { // exception thrown: OutOfMemoryError @@ -55,10 +55,11 @@ jlong Java_org_rocksdb_TtlDB_open( * Method: openCF * Signature: (JLjava/lang/String;[[B[J[IZ)[J */ -jlongArray Java_org_rocksdb_TtlDB_openCF( - JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, - jobjectArray jcolumn_names, jlongArray jcolumn_options, - jintArray jttls, jboolean jread_only) { +jlongArray Java_org_rocksdb_TtlDB_openCF(JNIEnv* env, jclass, jlong jopt_handle, + jstring jdb_path, + jobjectArray jcolumn_names, + jlongArray jcolumn_options, + jintArray jttls, jboolean jread_only) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); if (db_path == nullptr) { // exception thrown: OutOfMemoryError @@ -153,8 +154,7 @@ jlongArray Java_org_rocksdb_TtlDB_openCF( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_TtlDB_disposeInternal( - JNIEnv*, jobject, jlong jhandle) { +void 
Java_org_rocksdb_TtlDB_disposeInternal(JNIEnv*, jobject, jlong jhandle) { auto* ttl_db = reinterpret_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(jhandle); assert(ttl_db != nullptr); delete ttl_db; @@ -165,14 +165,15 @@ void Java_org_rocksdb_TtlDB_disposeInternal( * Method: closeDatabase * Signature: (J)V */ -void Java_org_rocksdb_TtlDB_closeDatabase( - JNIEnv* /* env */, jclass, jlong /* jhandle */) { +void Java_org_rocksdb_TtlDB_closeDatabase(JNIEnv* /* env */, jclass, + jlong /* jhandle */) { // auto* ttl_db = reinterpret_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(jhandle); // assert(ttl_db != nullptr); // ROCKSDB_NAMESPACE::Status s = ttl_db->Close(); // ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - //TODO(AR) this is disabled until https://github.com/facebook/rocksdb/issues/4818 is resolved! + // TODO(AR) this is disabled until + // https://github.com/facebook/rocksdb/issues/4818 is resolved! } /* @@ -180,9 +181,11 @@ void Java_org_rocksdb_TtlDB_closeDatabase( * Method: createColumnFamilyWithTtl * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;[BJI)J; */ -jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( - JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jcolumn_name, - jlong jcolumn_options, jint jttl) { +jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jcolumn_name, + jlong jcolumn_options, + jint jttl) { jbyte* cfname = env->GetByteArrayElements(jcolumn_name, nullptr); if (cfname == nullptr) { // exception thrown: OutOfMemoryError diff --git a/java/rocksjni/wal_filter.cc b/java/rocksjni/wal_filter.cc index 5d9ef32fb4c..24b88afed3f 100644 --- a/java/rocksjni/wal_filter.cc +++ b/java/rocksjni/wal_filter.cc @@ -17,8 +17,8 @@ * Method: createNewWalFilter * Signature: ()J */ -jlong Java_org_rocksdb_AbstractWalFilter_createNewWalFilter( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_AbstractWalFilter_createNewWalFilter(JNIEnv* env, + jobject jobj) { auto* wal_filter = new ROCKSDB_NAMESPACE::WalFilterJniCallback(env, jobj); return GET_CPLUSPLUS_POINTER(wal_filter); } diff --git a/java/rocksjni/wal_filter_jnicallback.cc b/java/rocksjni/wal_filter_jnicallback.cc index 656cc592e5d..d2e3c9076d8 100644 --- a/java/rocksjni/wal_filter_jnicallback.cc +++ b/java/rocksjni/wal_filter_jnicallback.cc @@ -12,24 +12,23 @@ #include "rocksjni/portal.h" namespace ROCKSDB_NAMESPACE { -WalFilterJniCallback::WalFilterJniCallback( - JNIEnv* env, jobject jwal_filter) +WalFilterJniCallback::WalFilterJniCallback(JNIEnv* env, jobject jwal_filter) : JniCallback(env, jwal_filter) { // Note: The name of a WalFilter will not change during its lifetime, // so we cache it in a global var jmethodID jname_mid = AbstractWalFilterJni::getNameMethodId(env); - if(jname_mid == nullptr) { + if (jname_mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } jstring jname = (jstring)env->CallObjectMethod(m_jcallback_obj, jname_mid); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown return; } jboolean has_exception = JNI_FALSE; m_name = JniUtil::copyString(env, jname, - &has_exception); // also releases jname + &has_exception); // also releases jname if (has_exception == JNI_TRUE) { // exception thrown return; @@ -37,14 +36,14 @@ WalFilterJniCallback::WalFilterJniCallback( m_column_family_log_number_map_mid = AbstractWalFilterJni::getColumnFamilyLogNumberMapMethodId(env); - if(m_column_family_log_number_map_mid == nullptr) { + if (m_column_family_log_number_map_mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } 
m_log_record_found_proxy_mid = AbstractWalFilterJni::getLogRecordFoundProxyMethodId(env); - if(m_log_record_found_proxy_mid == nullptr) { + if (m_log_record_found_proxy_mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return; } @@ -63,7 +62,7 @@ void WalFilterJniCallback::ColumnFamilyLogNumberMap( ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &cf_lognumber_map); if (jcf_lognumber_map == nullptr) { // exception occurred - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); return; } @@ -72,21 +71,19 @@ void WalFilterJniCallback::ColumnFamilyLogNumberMap( ROCKSDB_NAMESPACE::HashMapJni::fromCppMap(env, &cf_name_id_map); if (jcf_name_id_map == nullptr) { // exception occurred - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr env->DeleteLocalRef(jcf_lognumber_map); releaseJniEnv(attached_thread); return; } - env->CallVoidMethod(m_jcallback_obj, - m_column_family_log_number_map_mid, - jcf_lognumber_map, - jcf_name_id_map); + env->CallVoidMethod(m_jcallback_obj, m_column_family_log_number_map_mid, + jcf_lognumber_map, jcf_name_id_map); env->DeleteLocalRef(jcf_lognumber_map); env->DeleteLocalRef(jcf_name_id_map); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown from CallVoidMethod env->ExceptionDescribe(); // print out exception to stderr } @@ -94,21 +91,21 @@ void WalFilterJniCallback::ColumnFamilyLogNumberMap( releaseJniEnv(attached_thread); } - WalFilter::WalProcessingOption WalFilterJniCallback::LogRecordFound( +WalFilter::WalProcessingOption WalFilterJniCallback::LogRecordFound( unsigned long long log_number, const std::string& log_file_name, const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { jboolean attached_thread = JNI_FALSE; JNIEnv* env = getJniEnv(&attached_thread); if (env == nullptr) { - return WalFilter::WalProcessingOption::kCorruptedRecord; + return WalFilter::WalProcessingOption::kCorruptedRecord; } - + jstring jlog_file_name = JniUtil::toJavaString(env, &log_file_name); if (jlog_file_name == nullptr) { // exception occurred - env->ExceptionDescribe(); // print out exception to stderr + env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); - return WalFilter::WalProcessingOption::kCorruptedRecord; + return WalFilter::WalProcessingOption::kCorruptedRecord; } jshort jlog_record_found_result = env->CallShortMethod( @@ -122,7 +119,7 @@ void WalFilterJniCallback::ColumnFamilyLogNumberMap( // exception thrown from CallShortMethod env->ExceptionDescribe(); // print out exception to stderr releaseJniEnv(attached_thread); - return WalFilter::WalProcessingOption::kCorruptedRecord; + return WalFilter::WalProcessingOption::kCorruptedRecord; } // unpack WalProcessingOption and batch_changed from jlog_record_found_result @@ -137,8 +134,6 @@ void WalFilterJniCallback::ColumnFamilyLogNumberMap( jwal_processing_option_value); } -const char* WalFilterJniCallback::Name() const { - return m_name.get(); -} +const char* WalFilterJniCallback::Name() const { return m_name.get(); } } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/wal_filter_jnicallback.h b/java/rocksjni/wal_filter_jnicallback.h index b575ddc62f5..5cdc6597866 100644 --- a/java/rocksjni/wal_filter_jnicallback.h +++ b/java/rocksjni/wal_filter_jnicallback.h @@ -10,6 +10,7 @@ #define JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_ #include + #include 
#include #include @@ -21,20 +22,19 @@ namespace ROCKSDB_NAMESPACE { class WalFilterJniCallback : public JniCallback, public WalFilter { public: - WalFilterJniCallback( - JNIEnv* env, jobject jwal_filter); - virtual void ColumnFamilyLogNumberMap( - const std::map& cf_lognumber_map, - const std::map& cf_name_id_map); - virtual WalFilter::WalProcessingOption LogRecordFound( - unsigned long long log_number, const std::string& log_file_name, - const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed); - virtual const char* Name() const; + WalFilterJniCallback(JNIEnv* env, jobject jwal_filter); + virtual void ColumnFamilyLogNumberMap( + const std::map& cf_lognumber_map, + const std::map& cf_name_id_map); + virtual WalFilter::WalProcessingOption LogRecordFound( + unsigned long long log_number, const std::string& log_file_name, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed); + virtual const char* Name() const; private: - std::unique_ptr m_name; - jmethodID m_column_family_log_number_map_mid; - jmethodID m_log_record_found_proxy_mid; + std::unique_ptr m_name; + jmethodID m_column_family_log_number_map_mid; + jmethodID m_log_record_found_proxy_mid; }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 9404294178a..30b9a722979 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -5,6 +5,8 @@ // // This file implements the "bridge" between Java and C++ and enables // calling c++ ROCKSDB_NAMESPACE::WriteBatch methods testing from Java side. +#include "rocksdb/write_batch.h" + #include #include "db/memtable.h" @@ -18,7 +20,6 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" -#include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" #include "rocksjni/portal.h" #include "table/scoped_arena_iterator.h" diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc index b02ddb672b2..a5c3216cb32 100644 --- a/java/rocksjni/write_batch_with_index.cc +++ b/java/rocksjni/write_batch_with_index.cc @@ -947,6 +947,7 @@ jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1(JNIEnv* env, * Signature: (J)V */ void Java_org_rocksdb_WBWIRocksIterator_refresh0(JNIEnv* env) { - ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::Status::NotSupported("Refresh() is not supported"); + ROCKSDB_NAMESPACE::Status s = + ROCKSDB_NAMESPACE::Status::NotSupported("Refresh() is not supported"); ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } diff --git a/java/rocksjni/write_buffer_manager.cc b/java/rocksjni/write_buffer_manager.cc index 85a81061555..b5b7d193b57 100644 --- a/java/rocksjni/write_buffer_manager.cc +++ b/java/rocksjni/write_buffer_manager.cc @@ -34,8 +34,9 @@ jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WriteBufferManager_disposeInternal( - JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { +void Java_org_rocksdb_WriteBufferManager_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* write_buffer_manager = reinterpret_cast*>( jhandle); diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc index b9f42f904ff..66ceabe9aea 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -7,58 +7,58 @@ // ROCKSDB_NAMESPACE::Comparator. 
#include "rocksjni/writebatchhandlerjnicallback.h" + #include "rocksjni/portal.h" namespace ROCKSDB_NAMESPACE { WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( JNIEnv* env, jobject jWriteBatchHandler) : JniCallback(env, jWriteBatchHandler), m_env(env) { - m_jPutCfMethodId = WriteBatchHandlerJni::getPutCfMethodId(env); - if(m_jPutCfMethodId == nullptr) { + if (m_jPutCfMethodId == nullptr) { // exception thrown return; } m_jPutMethodId = WriteBatchHandlerJni::getPutMethodId(env); - if(m_jPutMethodId == nullptr) { + if (m_jPutMethodId == nullptr) { // exception thrown return; } m_jMergeCfMethodId = WriteBatchHandlerJni::getMergeCfMethodId(env); - if(m_jMergeCfMethodId == nullptr) { + if (m_jMergeCfMethodId == nullptr) { // exception thrown return; } m_jMergeMethodId = WriteBatchHandlerJni::getMergeMethodId(env); - if(m_jMergeMethodId == nullptr) { + if (m_jMergeMethodId == nullptr) { // exception thrown return; } m_jDeleteCfMethodId = WriteBatchHandlerJni::getDeleteCfMethodId(env); - if(m_jDeleteCfMethodId == nullptr) { + if (m_jDeleteCfMethodId == nullptr) { // exception thrown return; } m_jDeleteMethodId = WriteBatchHandlerJni::getDeleteMethodId(env); - if(m_jDeleteMethodId == nullptr) { + if (m_jDeleteMethodId == nullptr) { // exception thrown return; } m_jSingleDeleteCfMethodId = WriteBatchHandlerJni::getSingleDeleteCfMethodId(env); - if(m_jSingleDeleteCfMethodId == nullptr) { + if (m_jSingleDeleteCfMethodId == nullptr) { // exception thrown return; } m_jSingleDeleteMethodId = WriteBatchHandlerJni::getSingleDeleteMethodId(env); - if(m_jSingleDeleteMethodId == nullptr) { + if (m_jSingleDeleteMethodId == nullptr) { // exception thrown return; } @@ -77,46 +77,46 @@ WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( } m_jLogDataMethodId = WriteBatchHandlerJni::getLogDataMethodId(env); - if(m_jLogDataMethodId == nullptr) { + if (m_jLogDataMethodId == nullptr) { // exception thrown return; } m_jPutBlobIndexCfMethodId = WriteBatchHandlerJni::getPutBlobIndexCfMethodId(env); - if(m_jPutBlobIndexCfMethodId == nullptr) { + if (m_jPutBlobIndexCfMethodId == nullptr) { // exception thrown return; } m_jMarkBeginPrepareMethodId = WriteBatchHandlerJni::getMarkBeginPrepareMethodId(env); - if(m_jMarkBeginPrepareMethodId == nullptr) { + if (m_jMarkBeginPrepareMethodId == nullptr) { // exception thrown return; } m_jMarkEndPrepareMethodId = WriteBatchHandlerJni::getMarkEndPrepareMethodId(env); - if(m_jMarkEndPrepareMethodId == nullptr) { + if (m_jMarkEndPrepareMethodId == nullptr) { // exception thrown return; } m_jMarkNoopMethodId = WriteBatchHandlerJni::getMarkNoopMethodId(env); - if(m_jMarkNoopMethodId == nullptr) { + if (m_jMarkNoopMethodId == nullptr) { // exception thrown return; } m_jMarkRollbackMethodId = WriteBatchHandlerJni::getMarkRollbackMethodId(env); - if(m_jMarkRollbackMethodId == nullptr) { + if (m_jMarkRollbackMethodId == nullptr) { // exception thrown return; } m_jMarkCommitMethodId = WriteBatchHandlerJni::getMarkCommitMethodId(env); - if(m_jMarkCommitMethodId == nullptr) { + if (m_jMarkCommitMethodId == nullptr) { // exception thrown return; } @@ -129,7 +129,7 @@ WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( } m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); - if(m_jContinueMethodId == nullptr) { + if (m_jContinueMethodId == nullptr) { // exception thrown return; } @@ -137,17 +137,12 @@ WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::PutCF( uint32_t column_family_id, 
const Slice& key, const Slice& value) { - auto put = [this, column_family_id] ( - jbyteArray j_key, jbyteArray j_value) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jPutCfMethodId, - static_cast(column_family_id), - j_key, - j_value); + auto put = [this, column_family_id](jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod(m_jcallback_obj, m_jPutCfMethodId, + static_cast(column_family_id), j_key, j_value); }; auto status = WriteBatchHandlerJniCallback::kv_op(key, value, put); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? @@ -157,30 +152,20 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::PutCF( } void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { - auto put = [this] ( - jbyteArray j_key, jbyteArray j_value) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jPutMethodId, - j_key, - j_value); + auto put = [this](jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod(m_jcallback_obj, m_jPutMethodId, j_key, j_value); }; WriteBatchHandlerJniCallback::kv_op(key, value, put); } ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MergeCF( uint32_t column_family_id, const Slice& key, const Slice& value) { - auto merge = [this, column_family_id] ( - jbyteArray j_key, jbyteArray j_value) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jMergeCfMethodId, - static_cast(column_family_id), - j_key, - j_value); + auto merge = [this, column_family_id](jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMergeCfMethodId, + static_cast(column_family_id), j_key, j_value); }; auto status = WriteBatchHandlerJniCallback::kv_op(key, value, merge); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? @@ -190,28 +175,20 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MergeCF( } void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { - auto merge = [this] ( - jbyteArray j_key, jbyteArray j_value) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jMergeMethodId, - j_key, - j_value); + auto merge = [this](jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMergeMethodId, j_key, j_value); }; WriteBatchHandlerJniCallback::kv_op(key, value, merge); } ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::DeleteCF( uint32_t column_family_id, const Slice& key) { - auto remove = [this, column_family_id] (jbyteArray j_key) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jDeleteCfMethodId, - static_cast(column_family_id), - j_key); + auto remove = [this, column_family_id](jbyteArray j_key) { + m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteCfMethodId, + static_cast(column_family_id), j_key); }; auto status = WriteBatchHandlerJniCallback::k_op(key, remove); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? 
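Each handler above shrinks to a small lambda plus a call to `kv_op` (or `k_op` for key-only operations). The helper owns all of the JNI plumbing: copying the slices into `byte[]`s, invoking the lambda, and reporting failure as a null `Status` pointer. A simplified sketch of the key/value variant; the real helper additionally recovers the `Status` carried by a thrown `RocksDBException`, which is elided here:

```cpp
#include <jni.h>

#include <functional>
#include <memory>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"

// Convert both slices to Java byte[], run the supplied callback lambda, and
// return OK unless a Java exception is pending (nullptr = unknown failure).
std::unique_ptr<ROCKSDB_NAMESPACE::Status> kv_op_sketch(
    JNIEnv* env, const ROCKSDB_NAMESPACE::Slice& key,
    const ROCKSDB_NAMESPACE::Slice& value,
    std::function<void(jbyteArray, jbyteArray)> kvFn) {
  jbyteArray j_key = env->NewByteArray(static_cast<jsize>(key.size()));
  if (j_key == nullptr) {
    return nullptr;  // OutOfMemoryError pending
  }
  env->SetByteArrayRegion(j_key, 0, static_cast<jsize>(key.size()),
                          reinterpret_cast<const jbyte*>(key.data()));
  jbyteArray j_value = env->NewByteArray(static_cast<jsize>(value.size()));
  if (j_value == nullptr) {
    env->DeleteLocalRef(j_key);
    return nullptr;
  }
  env->SetByteArrayRegion(j_value, 0, static_cast<jsize>(value.size()),
                          reinterpret_cast<const jbyte*>(value.data()));

  kvFn(j_key, j_value);  // e.g. CallVoidMethod on the cached jmethodID

  std::unique_ptr<ROCKSDB_NAMESPACE::Status> result;
  if (!env->ExceptionCheck()) {
    result = std::make_unique<ROCKSDB_NAMESPACE::Status>(
        ROCKSDB_NAMESPACE::Status::OK());
  }
  env->DeleteLocalRef(j_value);
  env->DeleteLocalRef(j_key);
  return result;
}
```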
@@ -221,26 +198,20 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::DeleteCF( } void WriteBatchHandlerJniCallback::Delete(const Slice& key) { - auto remove = [this] (jbyteArray j_key) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jDeleteMethodId, - j_key); + auto remove = [this](jbyteArray j_key) { + m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteMethodId, j_key); }; WriteBatchHandlerJniCallback::k_op(key, remove); } ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::SingleDeleteCF( uint32_t column_family_id, const Slice& key) { - auto singleDelete = [this, column_family_id] (jbyteArray j_key) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jSingleDeleteCfMethodId, - static_cast(column_family_id), - j_key); + auto singleDelete = [this, column_family_id](jbyteArray j_key) { + m_env->CallVoidMethod(m_jcallback_obj, m_jSingleDeleteCfMethodId, + static_cast(column_family_id), j_key); }; auto status = WriteBatchHandlerJniCallback::k_op(key, singleDelete); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? @@ -250,28 +221,23 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::SingleDeleteCF( } void WriteBatchHandlerJniCallback::SingleDelete(const Slice& key) { - auto singleDelete = [this] (jbyteArray j_key) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jSingleDeleteMethodId, - j_key); + auto singleDelete = [this](jbyteArray j_key) { + m_env->CallVoidMethod(m_jcallback_obj, m_jSingleDeleteMethodId, j_key); }; WriteBatchHandlerJniCallback::k_op(key, singleDelete); } ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::DeleteRangeCF( uint32_t column_family_id, const Slice& beginKey, const Slice& endKey) { - auto deleteRange = [this, column_family_id] ( - jbyteArray j_beginKey, jbyteArray j_endKey) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jDeleteRangeCfMethodId, - static_cast(column_family_id), - j_beginKey, - j_endKey); + auto deleteRange = [this, column_family_id](jbyteArray j_beginKey, + jbyteArray j_endKey) { + m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteRangeCfMethodId, + static_cast(column_family_id), j_beginKey, + j_endKey); }; - auto status = WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); - if(status == nullptr) { + auto status = + WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? 
@@ -281,41 +247,30 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::DeleteRangeCF( } void WriteBatchHandlerJniCallback::DeleteRange(const Slice& beginKey, - const Slice& endKey) { - auto deleteRange = [this] ( - jbyteArray j_beginKey, jbyteArray j_endKey) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jDeleteRangeMethodId, - j_beginKey, - j_endKey); + const Slice& endKey) { + auto deleteRange = [this](jbyteArray j_beginKey, jbyteArray j_endKey) { + m_env->CallVoidMethod(m_jcallback_obj, m_jDeleteRangeMethodId, j_beginKey, + j_endKey); }; WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); } void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { - auto logData = [this] (jbyteArray j_blob) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jLogDataMethodId, - j_blob); + auto logData = [this](jbyteArray j_blob) { + m_env->CallVoidMethod(m_jcallback_obj, m_jLogDataMethodId, j_blob); }; WriteBatchHandlerJniCallback::k_op(blob, logData); } ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::PutBlobIndexCF( uint32_t column_family_id, const Slice& key, const Slice& value) { - auto putBlobIndex = [this, column_family_id] ( - jbyteArray j_key, jbyteArray j_value) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jPutBlobIndexCfMethodId, - static_cast(column_family_id), - j_key, - j_value); + auto putBlobIndex = [this, column_family_id](jbyteArray j_key, + jbyteArray j_value) { + m_env->CallVoidMethod(m_jcallback_obj, m_jPutBlobIndexCfMethodId, + static_cast(column_family_id), j_key, j_value); }; auto status = WriteBatchHandlerJniCallback::kv_op(key, value, putBlobIndex); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? @@ -327,7 +282,7 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::PutBlobIndexCF( ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkBeginPrepare( bool unprepare) { #ifndef DEBUG - (void) unprepare; + (void)unprepare; #else assert(!unprepare); #endif @@ -346,7 +301,8 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkBeginPrepare( // better error code here } else { - m_env->ExceptionClear(); // clear the exception, as we have extracted the status + m_env->ExceptionClear(); // clear the exception, as we have extracted the + // status return ROCKSDB_NAMESPACE::Status(*status); } } @@ -356,15 +312,11 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkBeginPrepare( ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkEndPrepare( const Slice& xid) { - auto markEndPrepare = [this] ( - jbyteArray j_xid) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jMarkEndPrepareMethodId, - j_xid); + auto markEndPrepare = [this](jbyteArray j_xid) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkEndPrepareMethodId, j_xid); }; auto status = WriteBatchHandlerJniCallback::k_op(xid, markEndPrepare); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? 
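The `MarkBeginPrepare`/`MarkNoop` hunks around here share one error path: if the Java callback threw, try to extract a `Status` from the throwable, and only clear the pending exception once a `Status` was actually recovered. A hedged sketch of that path; `ExtractStatusFromThrowable` is a stand-in for the `portal.h` `StatusJni` helpers, and the `Aborted` fallback is illustrative (the TODO comments note the real code still wants a better error code):

```cpp
#include <jni.h>

#include <memory>

#include "rocksdb/status.h"

// Stand-in for StatusJni::toCppStatus(env, throwable); assumed to return
// nullptr when the throwable is not a RocksDBException carrying a Status.
std::unique_ptr<ROCKSDB_NAMESPACE::Status> ExtractStatusFromThrowable(
    JNIEnv* env, jthrowable jthrow);

ROCKSDB_NAMESPACE::Status AfterCallback(JNIEnv* env) {
  if (!env->ExceptionCheck()) {
    return ROCKSDB_NAMESPACE::Status::OK();
  }
  jthrowable jthrow = env->ExceptionOccurred();
  std::unique_ptr<ROCKSDB_NAMESPACE::Status> status =
      ExtractStatusFromThrowable(env, jthrow);
  if (status == nullptr) {
    // Unknown exception: describe it for the log and fall back to a
    // generic error status.
    env->ExceptionDescribe();
    return ROCKSDB_NAMESPACE::Status::Aborted("callback raised an exception");
  }
  env->ExceptionClear();  // safe: the Status has been extracted
  return ROCKSDB_NAMESPACE::Status(*status);
}
```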
@@ -375,7 +327,8 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkEndPrepare( ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkNoop( bool empty_batch) { - m_env->CallVoidMethod(m_jcallback_obj, m_jMarkNoopMethodId, static_cast(empty_batch)); + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkNoopMethodId, + static_cast(empty_batch)); // check for Exception, in-particular RocksDBException if (m_env->ExceptionCheck()) { @@ -390,7 +343,8 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkNoop( // better error code here } else { - m_env->ExceptionClear(); // clear the exception, as we have extracted the status + m_env->ExceptionClear(); // clear the exception, as we have extracted the + // status return ROCKSDB_NAMESPACE::Status(*status); } } @@ -400,15 +354,11 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkNoop( ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkRollback( const Slice& xid) { - auto markRollback = [this] ( - jbyteArray j_xid) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jMarkRollbackMethodId, - j_xid); + auto markRollback = [this](jbyteArray j_xid) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkRollbackMethodId, j_xid); }; auto status = WriteBatchHandlerJniCallback::k_op(xid, markRollback); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? @@ -419,15 +369,11 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkRollback( ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkCommit( const Slice& xid) { - auto markCommit = [this] ( - jbyteArray j_xid) { - m_env->CallVoidMethod( - m_jcallback_obj, - m_jMarkCommitMethodId, - j_xid); + auto markCommit = [this](jbyteArray j_xid) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkCommitMethodId, j_xid); }; auto status = WriteBatchHandlerJniCallback::k_op(xid, markCommit); - if(status == nullptr) { + if (status == nullptr) { return ROCKSDB_NAMESPACE::Status::OK(); // TODO(AR) what to do if there is // an Exception but we don't know // the ROCKSDB_NAMESPACE::Status? 
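Unlike this write-batch handler, which runs on the thread that invoked it and can therefore cache `m_env`, callbacks such as `TraceWriterJniCallback` and `WalFilterJniCallback` earlier in the patch may fire from arbitrary RocksDB background threads, so each call attaches to the JVM on entry and detaches on exit. A sketch of that pairing, with a minimal stand-in for the `JniCallback` base seen above:

```cpp
#include <jni.h>

#include <cstdint>

// Minimal stand-in for the JniCallback base class used in this patch.
struct CallbackState {
  jobject jcallback_obj;   // global ref to the Java callback
  jmethodID get_size_mid;  // cached in the constructor
  JNIEnv* (*getJniEnv)(jboolean* attached);
  void (*releaseJniEnv)(jboolean attached);
};

// Every native->Java call from a potentially foreign (background) thread
// brackets its work between getJniEnv() and releaseJniEnv(), so the thread
// is attached to the JVM only for the duration of the call.
uint64_t GetFileSizeSketch(CallbackState* cb) {
  jboolean attached_thread = JNI_FALSE;
  JNIEnv* env = cb->getJniEnv(&attached_thread);  // attaches if necessary
  if (env == nullptr) {
    return 0;  // could not attach; fail conservatively, as the patch does
  }
  jlong jfile_size = env->CallLongMethod(cb->jcallback_obj, cb->get_size_mid);
  if (env->ExceptionCheck()) {
    env->ExceptionDescribe();  // print the Java exception to stderr
    jfile_size = 0;
  }
  cb->releaseJniEnv(attached_thread);  // detaches only if we attached above
  return static_cast<uint64_t>(jfile_size);
}
```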
@@ -454,10 +400,9 @@ ROCKSDB_NAMESPACE::Status WriteBatchHandlerJniCallback::MarkCommitWithTimestamp( } bool WriteBatchHandlerJniCallback::Continue() { - jboolean jContinue = m_env->CallBooleanMethod( - m_jcallback_obj, - m_jContinueMethodId); - if(m_env->ExceptionCheck()) { + jboolean jContinue = + m_env->CallBooleanMethod(m_jcallback_obj, m_jContinueMethodId); + if (m_env->ExceptionCheck()) { // exception thrown m_env->ExceptionDescribe(); } @@ -510,7 +455,8 @@ std::unique_ptr WriteBatchHandlerJniCallback::kv_op( return nullptr; } else { - m_env->ExceptionClear(); // clear the exception, as we have extracted the status + m_env->ExceptionClear(); // clear the exception, as we have extracted the + // status return status; } } @@ -556,7 +502,8 @@ std::unique_ptr WriteBatchHandlerJniCallback::k_op( return nullptr; } else { - m_env->ExceptionClear(); // clear the exception, as we have extracted the status + m_env->ExceptionClear(); // clear the exception, as we have extracted the + // status return status; } } diff --git a/java/rocksjni/writebatchhandlerjnicallback.h b/java/rocksjni/writebatchhandlerjnicallback.h index c12ffe0d9e9..9629797ca73 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.h +++ b/java/rocksjni/writebatchhandlerjnicallback.h @@ -9,11 +9,13 @@ #ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ #define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ -#include #include + +#include #include -#include "rocksjni/jnicallback.h" + #include "rocksdb/write_batch.h" +#include "rocksjni/jnicallback.h" namespace ROCKSDB_NAMESPACE { /** @@ -23,68 +25,67 @@ namespace ROCKSDB_NAMESPACE { * which calls the appropriate Java method. * This enables Write Batch Handlers to be implemented in Java. */ -class WriteBatchHandlerJniCallback : public JniCallback, public WriteBatch::Handler { +class WriteBatchHandlerJniCallback : public JniCallback, + public WriteBatch::Handler { public: - WriteBatchHandlerJniCallback( - JNIEnv* env, jobject jWriteBackHandler); - Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value); - void Put(const Slice& key, const Slice& value); - Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value); - void Merge(const Slice& key, const Slice& value); - Status DeleteCF(uint32_t column_family_id, const Slice& key); - void Delete(const Slice& key); - Status SingleDeleteCF(uint32_t column_family_id, const Slice& key); - void SingleDelete(const Slice& key); - Status DeleteRangeCF(uint32_t column_family_id, const Slice& beginKey, - const Slice& endKey); - void DeleteRange(const Slice& beginKey, const Slice& endKey); - void LogData(const Slice& blob); - Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, - const Slice& value); - Status MarkBeginPrepare(bool); - Status MarkEndPrepare(const Slice& xid); - Status MarkNoop(bool empty_batch); - Status MarkRollback(const Slice& xid); - Status MarkCommit(const Slice& xid); - Status MarkCommitWithTimestamp(const Slice& xid, const Slice& commit_ts); - bool Continue(); + WriteBatchHandlerJniCallback(JNIEnv* env, jobject jWriteBackHandler); + Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value); + void Put(const Slice& key, const Slice& value); + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value); + void Merge(const Slice& key, const Slice& value); + Status DeleteCF(uint32_t column_family_id, const Slice& key); + void Delete(const Slice& key); + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key); + 
void SingleDelete(const Slice& key); + Status DeleteRangeCF(uint32_t column_family_id, const Slice& beginKey, + const Slice& endKey); + void DeleteRange(const Slice& beginKey, const Slice& endKey); + void LogData(const Slice& blob); + Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value); + Status MarkBeginPrepare(bool); + Status MarkEndPrepare(const Slice& xid); + Status MarkNoop(bool empty_batch); + Status MarkRollback(const Slice& xid); + Status MarkCommit(const Slice& xid); + Status MarkCommitWithTimestamp(const Slice& xid, const Slice& commit_ts); + bool Continue(); private: - JNIEnv* m_env; - jmethodID m_jPutCfMethodId; - jmethodID m_jPutMethodId; - jmethodID m_jMergeCfMethodId; - jmethodID m_jMergeMethodId; - jmethodID m_jDeleteCfMethodId; - jmethodID m_jDeleteMethodId; - jmethodID m_jSingleDeleteCfMethodId; - jmethodID m_jSingleDeleteMethodId; - jmethodID m_jDeleteRangeCfMethodId; - jmethodID m_jDeleteRangeMethodId; - jmethodID m_jLogDataMethodId; - jmethodID m_jPutBlobIndexCfMethodId; - jmethodID m_jMarkBeginPrepareMethodId; - jmethodID m_jMarkEndPrepareMethodId; - jmethodID m_jMarkNoopMethodId; - jmethodID m_jMarkRollbackMethodId; - jmethodID m_jMarkCommitMethodId; - jmethodID m_jMarkCommitWithTimestampMethodId; - jmethodID m_jContinueMethodId; - /** - * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an - * unexpected exception occurred - */ - std::unique_ptr<ROCKSDB_NAMESPACE::Status> kv_op( - const Slice& key, const Slice& value, - std::function<void(jbyteArray, jbyteArray)> kvFn); - /** - * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an - * unexpected exception occurred - */ - std::unique_ptr<ROCKSDB_NAMESPACE::Status> k_op( - const Slice& key, std::function<void(jbyteArray)> kFn); + JNIEnv* m_env; + jmethodID m_jPutCfMethodId; + jmethodID m_jPutMethodId; + jmethodID m_jMergeCfMethodId; + jmethodID m_jMergeMethodId; + jmethodID m_jDeleteCfMethodId; + jmethodID m_jDeleteMethodId; + jmethodID m_jSingleDeleteCfMethodId; + jmethodID m_jSingleDeleteMethodId; + jmethodID m_jDeleteRangeCfMethodId; + jmethodID m_jDeleteRangeMethodId; + jmethodID m_jLogDataMethodId; + jmethodID m_jPutBlobIndexCfMethodId; + jmethodID m_jMarkBeginPrepareMethodId; + jmethodID m_jMarkEndPrepareMethodId; + jmethodID m_jMarkNoopMethodId; + jmethodID m_jMarkRollbackMethodId; + jmethodID m_jMarkCommitMethodId; + jmethodID m_jMarkCommitWithTimestampMethodId; + jmethodID m_jContinueMethodId; + /** + * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an + * unexpected exception occurred + */ + std::unique_ptr<ROCKSDB_NAMESPACE::Status> kv_op( + const Slice& key, const Slice& value, + std::function<void(jbyteArray, jbyteArray)> kvFn); + /** + * @return A pointer to a ROCKSDB_NAMESPACE::Status or nullptr if an + * unexpected exception occurred + */ + std::unique_ptr<ROCKSDB_NAMESPACE::Status> k_op( + const Slice& key, std::function<void(jbyteArray)> kFn); }; } // namespace ROCKSDB_NAMESPACE diff --git a/java/src/main/java/org/rocksdb/CompactionReason.java b/java/src/main/java/org/rocksdb/CompactionReason.java index 24e23445041..46ec33f3f14 100644 --- a/java/src/main/java/org/rocksdb/CompactionReason.java +++ b/java/src/main/java/org/rocksdb/CompactionReason.java @@ -88,7 +88,23 @@ public enum CompactionReason { /** * Compaction in order to move files to temperature */ - kChangeTemperature((byte) 0x0F); + kChangeTemperature((byte) 0x0F), + + /** + * Compaction scheduled to force garbage collection of blob files + */ + kForcedBlobGC((byte) 0x11), + + /** + * A special TTL compaction for RoundRobin policy, which is basically the same as + * kLevelMaxLevelSize, but the goal is to compact TTLed files. 
+   */
+  kRoundRobinTtl((byte) 0x12),
+
+  /**
+   * Compaction by calling DBImpl::ReFitLevel
+   */
+  kRefitLevel((byte) 0x13);

   private final byte value;
diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java
index 54322226203..9eb5ca8738e 100644
--- a/java/src/main/java/org/rocksdb/DBOptions.java
+++ b/java/src/main/java/org/rocksdb/DBOptions.java
@@ -31,6 +31,7 @@ public class DBOptions extends RocksObject
   public DBOptions() {
     super(newDBOptions());
     numShardBits_ = DEFAULT_NUM_SHARD_BITS;
+    env_ = Env.getDefault();
   }

   /**
diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java
index 1f1e5507a56..54f88262bd9 100644
--- a/java/src/main/java/org/rocksdb/Options.java
+++ b/java/src/main/java/org/rocksdb/Options.java
@@ -66,7 +66,7 @@ public Options(final DBOptions dbOptions,
       final ColumnFamilyOptions columnFamilyOptions) {
     super(newOptions(dbOptions.nativeHandle_,
         columnFamilyOptions.nativeHandle_));
-    env_ = Env.getDefault();
+    env_ = dbOptions.getEnv() != null ? dbOptions.getEnv() : Env.getDefault();
   }

   /**
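With these two hunks, a DBOptions now records its Env at construction and an Options built from a DBOptions inherits whatever Env that DBOptions carries, instead of silently reverting to the default. A minimal sketch of the resulting behaviour (hypothetical usage, not part of the patch; the API calls themselves are existing RocksJava):

    // The Env set on DBOptions now survives the DBOptions -> Options copy.
    try (final Env memEnv = new RocksMemEnv(Env.getDefault());
         final DBOptions dbOptions = new DBOptions().setEnv(memEnv);
         final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions();
         final Options options = new Options(dbOptions, cfOptions)) {
      // Before this change, options.getEnv() fell back to Env.getDefault().
      assert options.getEnv() == memEnv;
    }

The mixedOptionsEnvTest added further below exercises exactly this contract.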
diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java
index 5da471f18f9..9ad51c7c736 100644
--- a/java/src/main/java/org/rocksdb/util/Environment.java
+++ b/java/src/main/java/org/rocksdb/util/Environment.java
@@ -1,21 +1,20 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 package org.rocksdb.util;

+import java.io.File;
 import java.io.IOException;

 public class Environment {
   private static String OS = System.getProperty("os.name").toLowerCase();
   private static String ARCH = System.getProperty("os.arch").toLowerCase();
-  private static boolean MUSL_LIBC;
+  private static String MUSL_ENVIRONMENT = System.getenv("ROCKSDB_MUSL_LIBC");

-  static {
-    try {
-      final Process p = new ProcessBuilder("/usr/bin/env", "sh", "-c", "ldd /usr/bin/env | grep -q musl").start();
-      MUSL_LIBC = p.waitFor() == 0;
-    } catch (final IOException | InterruptedException e) {
-      MUSL_LIBC = false;
-    }
-  }
+  /**
+   * Will be lazily initialised by {@link #isMuslLibc()} instead of the previous static
+   * initialisation. The lazy initialisation prevents Windows from reporting suspicious behaviour of
+   * the JVM attempting IO on Unix paths.
+   */
+  private static Boolean MUSL_LIBC = null;

   public static boolean isAarch64() {
     return ARCH.contains("aarch64");
@@ -50,10 +49,80 @@ public static boolean isUnix() {
         OS.contains("nux");
   }

+  /**
+   * Determine if the environment has a musl libc.
+   *
+   * @return true if the environment has a musl libc, false otherwise.
+   */
   public static boolean isMuslLibc() {
+    if (MUSL_LIBC == null) {
+      MUSL_LIBC = initIsMuslLibc();
+    }
     return MUSL_LIBC;
   }

+  /**
+   * Determine if the environment has a musl libc.
+   *
+   * The initialisation counterpart of {@link #isMuslLibc()}.
+   *
+   * Intentionally package-private for testing.
+   *
+   * @return true if the environment has a musl libc, false otherwise.
+   */
+  static boolean initIsMuslLibc() {
+    // consider an explicit user setting from the environment first
+    if ("true".equalsIgnoreCase(MUSL_ENVIRONMENT)) {
+      return true;
+    }
+    if ("false".equalsIgnoreCase(MUSL_ENVIRONMENT)) {
+      return false;
+    }
+
+    // check if ldd indicates a musl libc
+    try {
+      final Process p =
+          new ProcessBuilder("/usr/bin/env", "sh", "-c", "ldd /usr/bin/env | grep -q musl").start();
+      if (p.waitFor() == 0) {
+        return true;
+      }
+    } catch (final IOException | InterruptedException e) {
+      // do nothing, and move on to the next check
+    }
+
+    final File lib = new File("/lib");
+    if (lib.exists() && lib.isDirectory() && lib.canRead()) {
+      // attempt the most likely musl libc name first
+      final String possibleMuslcLibName;
+      if (isPowerPC()) {
+        possibleMuslcLibName = "libc.musl-ppc64le.so.1";
+      } else if (isAarch64()) {
+        possibleMuslcLibName = "libc.musl-aarch64.so.1";
+      } else if (isS390x()) {
+        possibleMuslcLibName = "libc.musl-s390x.so.1";
+      } else {
+        possibleMuslcLibName = "libc.musl-x86_64.so.1";
+      }
+      final File possibleMuslcLib = new File(lib, possibleMuslcLibName);
+      if (possibleMuslcLib.exists() && possibleMuslcLib.canRead()) {
+        return true;
+      }
+
+      // fallback to scanning for a musl libc
+      final File[] libFiles = lib.listFiles();
+      if (libFiles == null) {
+        return false;
+      }
+      for (final File f : libFiles) {
+        if (f.getName().startsWith("libc.musl")) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  }
+
   public static boolean isSolaris() {
     return OS.contains("sunos");
   }
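The detection above resolves in a fixed order: an explicit ROCKSDB_MUSL_LIBC environment setting wins, then an ldd probe, then a lookup of the arch-specific libc.musl-*.so.1, then a scan of /lib. A condensed sketch of that precedence (the helper name here is ours for illustration, not RocksJava API):

    // Order of precedence implemented by Environment.initIsMuslLibc():
    static boolean resolveMuslLibc(String env, boolean lddSaysMusl, boolean muslLibPresent) {
      if ("true".equalsIgnoreCase(env)) return true;    // 1. explicit user override
      if ("false".equalsIgnoreCase(env)) return false;  // 1. explicit user override
      return lddSaysMusl || muslLibPresent;             // 2. ldd probe, 3. /lib lookup/scan
    }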
diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
index defffa6c7bb..330881764df 100644
--- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
+++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
@@ -9,6 +9,10 @@
 import static org.junit.Assert.fail;

 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.stream.Stream;
 import org.junit.ClassRule;
 import org.junit.Ignore;
 import org.junit.Rule;
@@ -29,7 +33,6 @@ public void cacheIndexAndFilterBlocks() {
     blockBasedTableConfig.setCacheIndexAndFilterBlocks(true);
     assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()).
         isTrue();
-
   }

   @Test
@@ -95,6 +98,76 @@ public void checksumType() {
     assertThat(blockBasedTableConfig.checksumType()).isEqualTo(ChecksumType.kXXH3);
   }

+  @Test
+  public void jniPortal() throws Exception {
+    // Verifies that the JNI layer is correctly translating options.
+    // Since introspecting the options requires creating a database, the checks
+    // cover multiple options at the same time.
+
+    final BlockBasedTableConfig tableConfig = new BlockBasedTableConfig();
+
+    tableConfig.setIndexType(IndexType.kBinarySearch);
+    tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch);
+    tableConfig.setChecksumType(ChecksumType.kNoChecksum);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kBinarySearch");
+      assertThat(opts).contains("data_block_index_type=kDataBlockBinarySearch");
+      assertThat(opts).contains("checksum=kNoChecksum");
+    }
+
+    tableConfig.setIndexType(IndexType.kHashSearch);
+    tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinaryAndHash);
+    tableConfig.setChecksumType(ChecksumType.kCRC32c);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      options.useCappedPrefixExtractor(1); // Needed to use kHashSearch
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kHashSearch");
+      assertThat(opts).contains("data_block_index_type=kDataBlockBinaryAndHash");
+      assertThat(opts).contains("checksum=kCRC32c");
+    }
+
+    tableConfig.setIndexType(IndexType.kTwoLevelIndexSearch);
+    tableConfig.setChecksumType(ChecksumType.kxxHash);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kTwoLevelIndexSearch");
+      assertThat(opts).contains("checksum=kxxHash");
+    }
+
+    tableConfig.setIndexType(IndexType.kBinarySearchWithFirstKey);
+    tableConfig.setChecksumType(ChecksumType.kxxHash64);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("index_type=kBinarySearchWithFirstKey");
+      assertThat(opts).contains("checksum=kxxHash64");
+    }
+
+    tableConfig.setChecksumType(ChecksumType.kXXH3);
+    try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
+      String opts = getOptionAsString(options);
+      assertThat(opts).contains("checksum=kXXH3");
+    }
+  }
+
+  private String getOptionAsString(Options options) throws Exception {
+    options.setCreateIfMissing(true);
+    String dbPath = dbFolder.getRoot().getAbsolutePath();
+    String result;
+    try (final RocksDB db = RocksDB.open(options, dbPath);
+         final Stream<Path> pathStream = Files.walk(Paths.get(dbPath))) {
+      Path optionsPath =
+          pathStream
+              .filter(p -> p.getFileName().toString().startsWith("OPTIONS"))
+              .findAny()
+              .orElseThrow(() -> new AssertionError("Missing options file"));
+      byte[] optionsData = Files.readAllBytes(optionsPath);
+      result = new String(optionsData, StandardCharsets.UTF_8);
+    }
+    RocksDB.destroyDB(dbPath, options);
+    return result;
+  }
+
   @Test
   public void noBlockCache() {
     final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
diff --git a/java/src/test/java/org/rocksdb/MixedOptionsTest.java b/java/src/test/java/org/rocksdb/MixedOptionsTest.java
index 10c92d49dd8..4e17d04ef38 100644
--- a/java/src/test/java/org/rocksdb/MixedOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/MixedOptionsTest.java
@@ -52,4 +52,34 @@ public void mixedOptionsTest(){
       }
     }
   }
+
+  @Test
+  public void mixedOptionsEnvTest() {
+    try (final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions();
+         final DBOptions dbOptions = new DBOptions()) {
+      assertThat(dbOptions.getEnv()).isNotNull();
+      assertThat(dbOptions.getEnv()).isSameAs(Env.getDefault());
+      final Env memEnv = new RocksMemEnv(Env.getDefault());
+
+      try (final Options options = new Options(dbOptions, cfOptions)) {
+        assertThat(options.getEnv()).isSameAs(Env.getDefault());
+      }
+
+      dbOptions.setEnv(memEnv);
+      memEnv.setBackgroundThreads(4, Priority.LOW);
+      Env.getDefault().setBackgroundThreads(2, Priority.HIGH);
+      assertThat(dbOptions.getEnv().getBackgroundThreads(Priority.LOW)).isEqualTo(4);
+      assertThat(dbOptions.getEnv().getBackgroundThreads(Priority.HIGH)).isEqualTo(2);
+      assertThat(Env.getDefault().getBackgroundThreads(Priority.LOW)).isEqualTo(4);
+      assertThat(Env.getDefault().getBackgroundThreads(Priority.HIGH)).isEqualTo(2);
+
+      try (final Options options = new Options(dbOptions, cfOptions)) {
+        assertThat(options.getEnv().getBackgroundThreads(Priority.LOW)).isEqualTo(4);
+        assertThat(options.getEnv().getBackgroundThreads(Priority.HIGH)).isEqualTo(2);
+
+        assertThat(options.getEnv()).isNotSameAs(Env.getDefault());
+        assertThat(options.getEnv()).isSameAs(memEnv);
+      }
+    }
+  }
 }
diff --git a/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java b/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java
new file mode 100644
index 00000000000..cdfd9d3a9f1
--- /dev/null
+++ b/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java
@@ -0,0 +1,146 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+/**
+ * Test for changes made by the fix for the transactional multiGet problem;
+ * the tests here were previously broken by the code removed by that change.
+ */
+@RunWith(Parameterized.class)
+public class MultiColumnRegressionTest {
+  @Parameterized.Parameters
+  public static List<Params> data() {
+    return Arrays.asList(new Params(3, 100), new Params(3, 1000000));
+  }
+
+  public static class Params {
+    final int numColumns;
+    final int keySize;
+
+    public Params(final int numColumns, final int keySize) {
+      this.numColumns = numColumns;
+      this.keySize = keySize;
+    }
+  }
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private final Params params;
+
+  public MultiColumnRegressionTest(final Params params) {
+    this.params = params;
+  }
+
+  @Test
+  public void transactionDB() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    for (int i = 0; i < params.numColumns; i++) {
+      StringBuilder sb = new StringBuilder();
+      sb.append("cf" + i);
+      for (int j = 0; j < params.keySize; j++) sb.append("_cf");
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(sb.toString().getBytes()));
+    }
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<ColumnFamilyHandle> columnFamilyHandles =
+          db.createColumnFamilies(columnFamilyDescriptors);
+    }
+
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final TransactionDB tdb = TransactionDB.open(new DBOptions().setCreateIfMissing(true),
+             new TransactionDBOptions(), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      final WriteOptions writeOptions = new WriteOptions();
+      try (Transaction transaction = tdb.beginTransaction(writeOptions)) {
+        for (int i = 0; i < params.numColumns; i++) {
+          transaction.put(
+              columnFamilyHandles.get(i), ("key" + i).getBytes(), ("value" + (i - 7)).getBytes());
+        }
+        transaction.put("key".getBytes(), "value".getBytes());
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+
+    final List<ColumnFamilyHandle> columnFamilyHandles2 = new ArrayList<>();
+    try (final TransactionDB tdb = TransactionDB.open(new DBOptions().setCreateIfMissing(true),
+             new TransactionDBOptions(), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles2)) {
+      try (Transaction transaction = tdb.beginTransaction(new WriteOptions())) {
+        final ReadOptions readOptions = new ReadOptions();
+        for (int i = 0; i < params.numColumns; i++) {
+          final byte[] value =
+              transaction.get(columnFamilyHandles2.get(i), readOptions, ("key" + i).getBytes());
+          assertThat(value).isEqualTo(("value" + (i - 7)).getBytes());
+        }
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
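The call shape being regression-tested here and in the multiGet tests further below is the column-family-aware transactional read: one ColumnFamilyHandle per key, passed in a list parallel to the keys, with values returned in key order. A minimal sketch, assuming a TransactionDB txnDb, a List<ColumnFamilyHandle> columnFamilyHandles and a List<byte[]> keys already set up as in the test above:

    try (final Transaction txn = txnDb.beginTransaction(new WriteOptions());
         final ReadOptions readOptions = new ReadOptions()) {
      // values.get(i) is keys.get(i) looked up in columnFamilyHandles.get(i)
      final List<byte[]> values = txn.multiGetAsList(readOptions, columnFamilyHandles, keys);
      txn.commit();
    }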
+
+  @Test
+  public void optimisticDB() throws RocksDBException {
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    for (int i = 0; i < params.numColumns; i++) {
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    }
+
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             new DBOptions().setCreateIfMissing(true), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles)) {
+      try (Transaction transaction = otdb.beginTransaction(new WriteOptions())) {
+        for (int i = 0; i < params.numColumns; i++) {
+          transaction.put(
+              columnFamilyHandles.get(i), ("key" + i).getBytes(), ("value" + (i - 7)).getBytes());
+        }
+        transaction.put("key".getBytes(), "value".getBytes());
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+
+    final List<ColumnFamilyHandle> columnFamilyHandles2 = new ArrayList<>();
+    try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(
+             new DBOptions().setCreateIfMissing(true), dbFolder.getRoot().getAbsolutePath(),
+             columnFamilyDescriptors, columnFamilyHandles2)) {
+      try (Transaction transaction = otdb.beginTransaction(new WriteOptions())) {
+        final ReadOptions readOptions = new ReadOptions();
+        for (int i = 0; i < params.numColumns; i++) {
+          final byte[] value =
+              transaction.get(columnFamilyHandles2.get(i), readOptions, ("key" + i).getBytes());
+          assertThat(value).isEqualTo(("value" + (i - 7)).getBytes());
+        }
+        transaction.commit();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+}
diff --git a/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java b/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java
index 9a23be788ba..90a13e1da05 100644
--- a/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java
+++ b/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java
@@ -6,7 +6,6 @@

 import static org.assertj.core.api.Assertions.assertThat;

-import java.nio.charset.StandardCharsets;
 import java.util.*;
 import org.junit.Rule;
 import org.junit.Test;
@@ -18,53 +17,225 @@ public class MultiGetManyKeysTest {
   @Parameterized.Parameters
   public static List<Integer> data() {
-    return Arrays.asList(3, 250, 60000, 70000, 150000, 750000);
+    return Arrays.asList(2, 3, 250, 60000, 70000, 150000, 750000);
   }

   @Rule public TemporaryFolder dbFolder = new TemporaryFolder();

-  private final int keySize;
+  private final int numKeys;

-  public MultiGetManyKeysTest(final Integer keySize) {
-    this.keySize = keySize;
+  public MultiGetManyKeysTest(final Integer numKeys) {
+    this.numKeys = numKeys;
   }

   /**
-   * Test for https://github.com/facebook/rocksdb/issues/8039
+   * Test for the multiGet problem
    */
   @Test
   public void multiGetAsListLarge() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    putKeysAndValues(keyValues);
+
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final List<byte[]> values = db.multiGetAsList(keys);
+      assertKeysAndValues(keys, keyValues, values);
+    }
+  }
+
+  /**
+   * Test for the transactional multiGet problem
+   */
+  @Test
+  public void multiGetAsListLargeTransactional() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    putKeysAndValues(keyValues);
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values = transaction.multiGetAsList(new ReadOptions(), keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+    }
+  }
+
+  /**
+   * Test for the transactional multiGet problem
+   */
+  @Test
+  public void multiGetForUpdateAsListLargeTransactional() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    putKeysAndValues(keyValues);
+
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values = transaction.multiGetForUpdateAsList(new ReadOptions(), keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+    }
+  }
+
+  /**
+   * Test for the transactional multiGet problem
+   */
+  @Test
+  public void multiGetAsListLargeTransactionalCF() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    final ColumnFamilyDescriptor columnFamilyDescriptor =
+        new ColumnFamilyDescriptor("cfTest".getBytes());
+    putKeysAndValues(columnFamilyDescriptor, keyValues);
+
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    columnFamilyDescriptors.add(columnFamilyDescriptor);
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB = TransactionDB.open(new DBOptions(options), txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) {
+      final List<ColumnFamilyHandle> columnFamilyHandlesForMultiGet = new ArrayList<>(numKeys);
+      for (int i = 0; i < numKeys; i++)
+        columnFamilyHandlesForMultiGet.add(columnFamilyHandles.get(0));
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values =
+            transaction.multiGetAsList(new ReadOptions(), columnFamilyHandlesForMultiGet, keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+
+  /**
+   * Test for the transactional multiGet problem
+   */
+  @Test
+  public void multiGetForUpdateAsListLargeTransactionalCF() throws RocksDBException {
+    final List<byte[]> keys = generateRandomKeys(numKeys);
+    final Map<Key, byte[]> keyValues = generateRandomKeyValues(keys, 10);
+    final ColumnFamilyDescriptor columnFamilyDescriptor =
+        new ColumnFamilyDescriptor("cfTest".getBytes());
+    putKeysAndValues(columnFamilyDescriptor, keyValues);
+
+    final List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    columnFamilyDescriptors.add(columnFamilyDescriptor);
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes()));
+    final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB = TransactionDB.open(new DBOptions(options), txnDbOptions,
+             dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) {
+      final List<ColumnFamilyHandle> columnFamilyHandlesForMultiGet = new ArrayList<>(numKeys);
+      for (int i = 0; i < numKeys; i++)
+        columnFamilyHandlesForMultiGet.add(columnFamilyHandles.get(0));
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final List<byte[]> values = transaction.multiGetForUpdateAsList(
+            new ReadOptions(), columnFamilyHandlesForMultiGet, keys);
+        assertKeysAndValues(keys, keyValues, values);
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) {
+        columnFamilyHandle.close();
+      }
+    }
+  }
+
+  private List<byte[]> generateRandomKeys(final int numKeys) {
     final Random rand = new Random();
     final List<byte[]> keys = new ArrayList<>();
-    for (int i = 0; i < keySize; i++) {
+    for (int i = 0; i < numKeys; i++) {
       final byte[] key = new byte[4];
       rand.nextBytes(key);
       keys.add(key);
     }
+    return keys;
+  }

-    try (final Options opt = new Options().setCreateIfMissing(true);
-         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
-      final List<byte[]> values = db.multiGetAsList(keys);
-      assertThat(values.size()).isEqualTo(keys.size());
+  private Map<Key, byte[]> generateRandomKeyValues(final List<byte[]> keys, final int percent) {
+    final Random rand = new Random();
+    final Map<Key, byte[]> keyValues = new HashMap<>();
+    for (int i = 0; i < numKeys; i++) {
+      if (rand.nextInt(100) < percent) {
+        final byte[] value = new byte[1024];
+        rand.nextBytes(value);
+        keyValues.put(new Key(keys.get(i)), value);
+      }
     }
+    return keyValues;
   }

-  @Test
-  public void multiGetAsListCheckResults() throws RocksDBException {
-    try (final Options opt = new Options().setCreateIfMissing(true);
-         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
-      final List<byte[]> keys = new ArrayList<>();
-      for (int i = 0; i < keySize; i++) {
-        byte[] key = ("key" + i + ":").getBytes();
-        keys.add(key);
-        db.put(key, ("value" + i + ":").getBytes());
+  private void putKeysAndValues(Map<Key, byte[]> keyValues) throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+      for (Map.Entry<Key, byte[]> keyValue : keyValues.entrySet()) {
+        db.put(keyValue.getKey().get(), keyValue.getValue());
       }
+    }
+  }

-      final List<byte[]> values = db.multiGetAsList(keys);
-      assertThat(values.size()).isEqualTo(keys.size());
-      for (int i = 0; i < keySize; i++) {
-        assertThat(values.get(i)).isEqualTo(("value" + i + ":").getBytes());
+  private void putKeysAndValues(ColumnFamilyDescriptor columnFamilyDescriptor,
+      Map<Key, byte[]> keyValues) throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             db.createColumnFamily(columnFamilyDescriptor)) {
+      for (Map.Entry<Key, byte[]> keyValue : keyValues.entrySet()) {
+        db.put(columnFamilyHandle, keyValue.getKey().get(), keyValue.getValue());
+      }
+    }
+  }
+
+  private void assertKeysAndValues(
+      final List<byte[]> keys, final Map<Key, byte[]> keyValues, final List<byte[]> values) {
+    assertThat(values.size()).isEqualTo(keys.size());
+    for (int i = 0; i < numKeys; i++) {
+      final Key key = new Key(keys.get(i));
+      final byte[] value = values.get(i);
+      if (keyValues.containsKey(key)) {
+        assertThat(value).isEqualTo(keyValues.get(key));
+      } else {
+        assertThat(value).isNull();
       }
     }
   }
+
+  static private class Key {
+    private final byte[] bytes;
+    public Key(byte[] bytes) {
+      this.bytes = bytes;
+    }
+
+    public byte[] get() {
+      return this.bytes;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+      if (this == o)
+        return true;
+      if (o == null || getClass() != o.getClass())
+        return false;
+      Key key = (Key) o;
+      return Arrays.equals(bytes, key.bytes);
+    }
+
+    @Override
+    public int hashCode() {
+      return Arrays.hashCode(bytes);
+    }
+  }
 }
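The private Key wrapper above exists because Java arrays use identity-based equals() and hashCode(), so a HashMap keyed directly on byte[] can never be hit by an equal-content key. A short illustration of the pitfall the wrapper avoids:

    Map<byte[], byte[]> broken = new HashMap<>();
    broken.put("k".getBytes(), "v".getBytes());
    // misses: a fresh byte[] is a different object, so its identity hash differs
    assert broken.get("k".getBytes()) == null;

    Map<Key, byte[]> fixed = new HashMap<>();
    fixed.put(new Key("k".getBytes()), "v".getBytes());
    // hits: Key delegates to Arrays.equals / Arrays.hashCode
    assert fixed.get(new Key("k".getBytes())) != null;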
diff --git a/java/src/test/java/org/rocksdb/MultiGetTest.java b/java/src/test/java/org/rocksdb/MultiGetTest.java
index 323a6b1f405..c391d81f631 100644
--- a/java/src/test/java/org/rocksdb/MultiGetTest.java
+++ b/java/src/test/java/org/rocksdb/MultiGetTest.java
@@ -11,12 +11,17 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import org.junit.ClassRule;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 import org.rocksdb.util.TestUtil;

 public class MultiGetTest {
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
   @Rule public TemporaryFolder dbFolder = new TemporaryFolder();

   @Test
diff --git a/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java b/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java
new file mode 100644
index 00000000000..471ef07287d
--- /dev/null
+++ b/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java
@@ -0,0 +1,164 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class PutMultiplePartsTest {
+  @Parameterized.Parameters
+  public static List<Integer> data() {
+    return Arrays.asList(2, 3, 250, 20000);
+  }
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private final int numParts;
+
+  public PutMultiplePartsTest(final Integer numParts) {
+    this.numParts = numParts;
+  }
+
+  @Test
+  public void putUntracked() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final byte[][] keys = generateItems("key", ":", numParts);
+        final byte[][] values = generateItems("value", "", numParts);
+        transaction.putUntracked(keys, values);
+        transaction.commit();
+      }
+      txnDB.syncWal();
+    }
+
+    validateResults();
+  }
+
+  @Test
+  public void put() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath())) {
+      try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) {
+        final byte[][] keys = generateItems("key", ":", numParts);
+        final byte[][] values = generateItems("value", "", numParts);
+        transaction.put(keys, values);
+        transaction.commit();
+      }
+      txnDB.syncWal();
+    }
+
+    validateResults();
+  }
+
+  @Test
+  public void putUntrackedCF() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true);
+         final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+         final TransactionDB txnDB =
+             TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath());
+         final ColumnFamilyHandle columnFamilyHandle =
+             txnDB.createColumnFamily(new ColumnFamilyDescriptor("cfTest".getBytes()))) {
ColumnFamilyDescriptor("cfTest".getBytes()))) { + try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) { + final byte[][] keys = generateItems("key", ":", numParts); + final byte[][] values = generateItems("value", "", numParts); + transaction.putUntracked(columnFamilyHandle, keys, values); + transaction.commit(); + } + txnDB.syncWal(); + } + + validateResultsCF(); + } + @Test + public void putCF() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB txnDB = + TransactionDB.open(options, txnDbOptions, dbFolder.getRoot().getAbsolutePath()); + final ColumnFamilyHandle columnFamilyHandle = + txnDB.createColumnFamily(new ColumnFamilyDescriptor("cfTest".getBytes()))) { + try (final Transaction transaction = txnDB.beginTransaction(new WriteOptions())) { + final byte[][] keys = generateItems("key", ":", numParts); + final byte[][] values = generateItems("value", "", numParts); + transaction.put(columnFamilyHandle, keys, values); + transaction.commit(); + } + txnDB.syncWal(); + } + + validateResultsCF(); + } + + private void validateResults() throws RocksDBException { + try (final RocksDB db = RocksDB.open(new Options(), dbFolder.getRoot().getAbsolutePath())) { + final List keys = generateItemsAsList("key", ":", numParts); + final byte[][] values = generateItems("value", "", numParts); + + StringBuilder singleKey = new StringBuilder(); + for (int i = 0; i < numParts; i++) { + singleKey.append(new String(keys.get(i), StandardCharsets.UTF_8)); + } + final byte[] result = db.get(singleKey.toString().getBytes()); + StringBuilder singleValue = new StringBuilder(); + for (int i = 0; i < numParts; i++) { + singleValue.append(new String(values[i], StandardCharsets.UTF_8)); + } + assertThat(result).isEqualTo(singleValue.toString().getBytes()); + } + } + + private void validateResultsCF() throws RocksDBException { + final List columnFamilyDescriptors = new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor("cfTest".getBytes())); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes())); + final List columnFamilyHandles = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(new DBOptions(), dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + final List keys = generateItemsAsList("key", ":", numParts); + final byte[][] values = generateItems("value", "", numParts); + + StringBuilder singleKey = new StringBuilder(); + for (int i = 0; i < numParts; i++) { + singleKey.append(new String(keys.get(i), StandardCharsets.UTF_8)); + } + final byte[] result = db.get(columnFamilyHandles.get(0), singleKey.toString().getBytes()); + StringBuilder singleValue = new StringBuilder(); + for (int i = 0; i < numParts; i++) { + singleValue.append(new String(values[i], StandardCharsets.UTF_8)); + } + assertThat(result).isEqualTo(singleValue.toString().getBytes()); + } + } + + private byte[][] generateItems(final String prefix, final String suffix, final int numItems) { + return generateItemsAsList(prefix, suffix, numItems).toArray(new byte[0][0]); + } + + private List generateItemsAsList( + final String prefix, final String suffix, final int numItems) { + final List items = new ArrayList<>(); + for (int i = 0; i < numItems; i++) { + items.add((prefix + i + suffix).getBytes()); + } + return items; + } +} diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java 
diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java
index 422bed40c6d..488dbafe802 100644
--- a/java/src/test/java/org/rocksdb/RocksDBTest.java
+++ b/java/src/test/java/org/rocksdb/RocksDBTest.java
@@ -1425,7 +1425,7 @@ public void getLiveFiles() throws RocksDBException {
     try (final RocksDB db = RocksDB.open(options, dbPath)) {
       final RocksDB.LiveFiles livefiles = db.getLiveFiles(true);
       assertThat(livefiles).isNotNull();
-      assertThat(livefiles.manifestFileSize).isEqualTo(59);
+      assertThat(livefiles.manifestFileSize).isEqualTo(66);
       assertThat(livefiles.files.size()).isEqualTo(3);
       assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT");
       assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000005");
diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
index 301dec22f07..ae340e06d5d 100644
--- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
+++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
@@ -4,28 +4,32 @@
 // (found in the LICENSE.Apache file in the root directory).
 package org.rocksdb.util;

+import static org.assertj.core.api.Assertions.assertThat;
+import static org.hamcrest.Matchers.is;
+
+import java.lang.reflect.Field;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;

-import java.lang.reflect.Field;
-
-import static org.assertj.core.api.Assertions.assertThat;
-
 public class EnvironmentTest {
   private final static String ARCH_FIELD_NAME = "ARCH";
   private final static String OS_FIELD_NAME = "OS";
+
+  private final static String MUSL_ENVIRONMENT_FIELD_NAME = "MUSL_ENVIRONMENT";
   private final static String MUSL_LIBC_FIELD_NAME = "MUSL_LIBC";

   private static String INITIAL_OS;
   private static String INITIAL_ARCH;
-  private static boolean INITIAL_MUSL_LIBC;
+  private static String INITIAL_MUSL_ENVIRONMENT;
+  private static Boolean INITIAL_MUSL_LIBC;

   @BeforeClass
   public static void saveState() {
     INITIAL_ARCH = getEnvironmentClassField(ARCH_FIELD_NAME);
     INITIAL_OS = getEnvironmentClassField(OS_FIELD_NAME);
     INITIAL_MUSL_LIBC = getEnvironmentClassField(MUSL_LIBC_FIELD_NAME);
+    INITIAL_MUSL_ENVIRONMENT = getEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME);
   }

   @Test
@@ -236,6 +240,21 @@ public void linuxArch64() {
     setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false);
   }

+  @Test
+  public void resolveIsMuslLibc() {
+    setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, null);
+    setEnvironmentClassFields("win", "anyarch");
+    assertThat(Environment.isUnix()).isFalse();
+
+    // With user input, this resolves to true whenever it is set to "true",
+    // even on OSs where musl appears absurd: the user's choice wins.
+    assertThat(Environment.initIsMuslLibc()).isFalse();
+    setEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME, "true");
+    assertThat(Environment.initIsMuslLibc()).isTrue();
+    setEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME, "false");
+    assertThat(Environment.initIsMuslLibc()).isFalse();
+  }
+
   private void setEnvironmentClassFields(String osName,
       String osArch) {
     setEnvironmentClassField(OS_FIELD_NAME, osName);
@@ -246,6 +265,7 @@
   public static void restoreState() {
     setEnvironmentClassField(OS_FIELD_NAME, INITIAL_OS);
     setEnvironmentClassField(ARCH_FIELD_NAME, INITIAL_ARCH);
+    setEnvironmentClassField(MUSL_ENVIRONMENT_FIELD_NAME, INITIAL_MUSL_ENVIRONMENT);
     setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, INITIAL_MUSL_LIBC);
   }
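saveState()/restoreState() swap Environment's private static fields through reflection so each test can fake an OS/arch/libc combination without touching the real host. The essence of the test's existing setEnvironmentClassField helper (a sketch of what the test relies on, not new API) is:

    static void setEnvironmentClassField(String fieldName, Object value) throws Exception {
      final Field field = Environment.class.getDeclaredField(fieldName);
      field.setAccessible(true);
      field.set(null, value);  // null receiver: the field is static
    }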
diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc
index 66f34651833..fe095847912 100644
--- a/logging/auto_roll_logger.cc
+++ b/logging/auto_roll_logger.cc
@@ -90,8 +90,8 @@ void AutoRollLogger::RollLogFile() {
   uint64_t now = clock_->NowMicros();
   std::string old_fname;
   do {
-    old_fname = OldInfoLogFileName(
-      dbname_, now, db_absolute_path_, db_log_dir_);
+    old_fname =
+        OldInfoLogFileName(dbname_, now, db_absolute_path_, db_log_dir_);
     now++;
   } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok());
   // Wait for logger_ reference count to turn to 1 as it might be pinned by
@@ -173,7 +173,7 @@ std::string AutoRollLogger::ValistToString(const char* format,
   char buffer[MAXBUFFERSIZE];

   int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args);
-  (void) count;
+  (void)count;
   assert(count >= 0);

   return buffer;
diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h
index ccbce1d9940..805925e5a8a 100644
--- a/logging/auto_roll_logger.h
+++ b/logging/auto_roll_logger.h
@@ -40,9 +40,7 @@ class AutoRollLogger : public Logger {
   virtual void LogHeader(const char* format, va_list ap) override;

   // check if the logger has encountered any problem.
-  Status GetStatus() {
-    return status_;
-  }
+  Status GetStatus() { return status_; }

   size_t GetLogFileSize() const override {
     if (!logger_) {
@@ -101,9 +99,7 @@ class AutoRollLogger : public Logger {
   }

   // Expose the log file path for testing purpose
-  std::string TEST_log_fname() const {
-    return log_fname_;
-  }
+  std::string TEST_log_fname() const { return log_fname_; }

   uint64_t TEST_ctime() const { return ctime_; }

@@ -134,7 +130,7 @@ class AutoRollLogger : public Logger {
   std::string ValistToString(const char* format, va_list args) const;
   // Write the logs marked as headers to the new log file
   void WriteHeaderInfo();
-  std::string log_fname_; // Current active info log's file name.
+  std::string log_fname_;  // Current active info log's file name.
   std::string dbname_;
   std::string db_log_dir_;
   std::string db_absolute_path_;
diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc
index e9578369a13..8e94a78c824 100644
--- a/logging/auto_roll_logger_test.cc
+++ b/logging/auto_roll_logger_test.cc
@@ -64,8 +64,9 @@ class AutoRollLoggerTest : public testing::Test {
     ASSERT_TRUE(system(deleteDbDirCmd.c_str()) == 0);

     std::string testDir(kTestDir);
-    std::replace_if(testDir.begin(), testDir.end(),
-                    [](char ch) { return ch == '/'; }, '\\');
+    std::replace_if(
+        testDir.begin(), testDir.end(), [](char ch) { return ch == '/'; },
+        '\\');
     std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir;
 #else
     std::string deleteCmd = "rm -rf " + kTestDir + " " + kTestDbDir;
@@ -203,15 +204,15 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(
 }

 TEST_F(AutoRollLoggerTest, RollLogFileBySize) {
-    InitTestDb();
-    size_t log_max_size = 1024 * 5;
-    size_t keep_log_file_num = 10;
+  InitTestDb();
+  size_t log_max_size = 1024 * 5;
+  size_t keep_log_file_num = 10;

-    AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
-                          kTestDir, "", log_max_size, 0, keep_log_file_num);
+  AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir,
+                        "", log_max_size, 0, keep_log_file_num);

-    RollLogFileBySizeTest(&logger, log_max_size,
-                          kSampleMessage + ":RollLogFileBySize");
+  RollLogFileBySizeTest(&logger, log_max_size,
+                        kSampleMessage + ":RollLogFileBySize");
 }

 TEST_F(AutoRollLoggerTest, RollLogFileByTime) {
@@ -319,11 +320,10 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
   options.max_log_file_size = 1024;
   ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
   AutoRollLogger* auto_roll_logger =
-    dynamic_cast<AutoRollLogger*>(logger.get());
+      dynamic_cast<AutoRollLogger*>(logger.get());
   ASSERT_TRUE(auto_roll_logger);
-  RollLogFileBySizeTest(
-      auto_roll_logger, options.max_log_file_size,
-      kSampleMessage + ":CreateLoggerFromOptions - size");
+  RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size,
+                        kSampleMessage + ":CreateLoggerFromOptions - size");

   // Only roll by Time
   options.env = nse.get();
@@ -331,8 +331,7 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
   options.max_log_file_size = 0;
   options.log_file_time_to_roll = 2;
   ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
-  auto_roll_logger =
-    dynamic_cast<AutoRollLogger*>(logger.get());
+  auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
   RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger,
                         options.log_file_time_to_roll,
                         kSampleMessage + ":CreateLoggerFromOptions - time");
@@ -342,8 +341,7 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
   options.max_log_file_size = 1024 * 5;
   options.log_file_time_to_roll = 2;
   ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger));
-  auto_roll_logger =
-    dynamic_cast<AutoRollLogger*>(logger.get());
+  auto_roll_logger = dynamic_cast<AutoRollLogger*>(logger.get());
   RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size,
                         kSampleMessage + ":CreateLoggerFromOptions - both");
   RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger,
@@ -527,7 +525,7 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) {
   }
   std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
   size_t lines = std::count(std::istreambuf_iterator<char>(inFile),
-                         std::istreambuf_iterator<char>(), '\n');
+                            std::istreambuf_iterator<char>(), '\n');
   ASSERT_EQ(log_lines, lines);
   inFile.close();
 }
@@ -567,7 +565,7 @@ TEST_F(AutoRollLoggerTest, Close) {
   std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
   size_t lines =
       std::count(std::istreambuf_iterator<char>(inFile),
-                 std::istreambuf_iterator<char>(), '\n');
+                  std::istreambuf_iterator<char>(), '\n');
   ASSERT_EQ(log_lines, lines);
   inFile.close();
 }
@@ -602,7 +600,6 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) {
   // test_num == 0 -> standard call to Header()
   // test_num == 1 -> call to Log() with InfoLogLevel::HEADER_LEVEL
   for (int test_num = 0; test_num < 2; test_num++) {
-
     InitTestDb();

     AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
@@ -640,7 +637,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) {

     const auto oldfiles = GetOldFileNames(newfname);

-    ASSERT_EQ(oldfiles.size(), (size_t) 2);
+    ASSERT_EQ(oldfiles.size(), (size_t)2);

     for (auto& oldfname : oldfiles) {
       // verify that the files rolled over
@@ -658,8 +655,8 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) {
   // Replace all slashes in the path so windows CompSpec does not
   // become confused
   std::string testDir(kTestDir);
-  std::replace_if(testDir.begin(), testDir.end(),
-                  [](char ch) { return ch == '/'; }, '\\');
+  std::replace_if(
+      testDir.begin(), testDir.end(), [](char ch) { return ch == '/'; }, '\\');
   std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir;
 #else
   std::string deleteCmd = "rm -rf " + kTestDir;
diff --git a/logging/env_logger_test.cc b/logging/env_logger_test.cc
index 0406ac0f45a..467ab064f4c 100644
--- a/logging/env_logger_test.cc
+++ b/logging/env_logger_test.cc
@@ -5,6 +5,7 @@
 //

 #include "logging/env_logger.h"
+
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
diff --git a/logging/event_logger.cc b/logging/event_logger.cc
index 78bf4f8ff5b..cb9eca68716 100644
--- a/logging/event_logger.cc
+++ b/logging/event_logger.cc
@@ -44,9 +44,7 @@ EventLoggerStream::~EventLoggerStream() {
   }
 }

-void EventLogger::Log(const JSONWriter& jwriter) {
-  Log(logger_, jwriter);
-}
+void EventLogger::Log(const JSONWriter& jwriter) { Log(logger_, jwriter); }

 void EventLogger::Log(Logger* logger, const JSONWriter& jwriter) {
 #ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
diff --git a/logging/event_logger.h b/logging/event_logger.h
index 596eb0f5170..9ce982f50ed 100644
--- a/logging/event_logger.h
+++ b/logging/event_logger.h
@@ -5,10 +5,10 @@

 #pragma once

+#include <chrono>
 #include <memory>
 #include <sstream>
 #include <string>
-#include <chrono>

 #include "logging/log_buffer.h"
 #include "rocksdb/env.h"
@@ -157,7 +157,8 @@ class EventLoggerStream {
       json_writer_ = new JSONWriter();
       *this << "time_micros"
             << std::chrono::duration_cast<std::chrono::microseconds>(
-                   std::chrono::system_clock::now().time_since_epoch()).count();
+                   std::chrono::system_clock::now().time_since_epoch())
+                   .count();
     }
   }
   friend class EventLogger;
@@ -177,9 +178,7 @@ class EventLoggerStream {
 // "file_size": 1909699}
 class EventLogger {
  public:
-  static const char* Prefix() {
-    return "EVENT_LOG_v1";
-  }
+  static const char* Prefix() { return "EVENT_LOG_v1"; }

   explicit EventLogger(Logger* logger) : logger_(logger) {}
   EventLoggerStream Log() { return EventLoggerStream(logger_); }
diff --git a/logging/event_logger_test.cc b/logging/event_logger_test.cc
index a48bcdc0ce7..582f56ceb4d 100644
--- a/logging/event_logger_test.cc
+++ b/logging/event_logger_test.cc
@@ -3,9 +3,10 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
+#include "logging/event_logger.h" + #include -#include "logging/event_logger.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { diff --git a/logging/log_buffer.cc b/logging/log_buffer.cc index 378fcbb5293..2763e617f4c 100644 --- a/logging/log_buffer.cc +++ b/logging/log_buffer.cc @@ -5,13 +5,12 @@ #include "logging/log_buffer.h" -#include "port/sys_time.h" #include "port/port.h" +#include "port/sys_time.h" namespace ROCKSDB_NAMESPACE { -LogBuffer::LogBuffer(const InfoLogLevel log_level, - Logger*info_log) +LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger* info_log) : log_level_(log_level), info_log_(info_log) {} void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format, diff --git a/logging/log_buffer.h b/logging/log_buffer.h index 61d0be7df1a..92d38d10d14 100644 --- a/logging/log_buffer.h +++ b/logging/log_buffer.h @@ -6,6 +6,7 @@ #pragma once #include + #include "memory/arena.h" #include "port/sys_time.h" #include "rocksdb/env.h" @@ -35,8 +36,8 @@ class LogBuffer { private: // One log entry with its timestamp struct BufferedLog { - port::TimeVal now_tv; // Timestamp of the log - char message[1]; // Beginning of log message + port::TimeVal now_tv; // Timestamp of the log + char message[1]; // Beginning of log message }; const InfoLogLevel log_level_; diff --git a/logging/logging.h b/logging/logging.h index 5851115695f..0fa882a7867 100644 --- a/logging/logging.h +++ b/logging/logging.h @@ -15,10 +15,10 @@ // Helper macros that include information about file name and line number #define ROCKS_LOG_STRINGIFY(x) #x #define ROCKS_LOG_TOSTRING(x) ROCKS_LOG_STRINGIFY(x) -#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT) +#define ROCKS_LOG_PREPEND_FILE_LINE(FMT) \ + ("[%s:" ROCKS_LOG_TOSTRING(__LINE__) "] " FMT) -inline const char* RocksLogShorterFileName(const char* file) -{ +inline const char* RocksLogShorterFileName(const char* file) { // 18 is the length of "logging/logging.h". // If the name of this file changed, please change this number, too. return file + (sizeof(__FILE__) > 18 ? sizeof(__FILE__) - 18 : 0); @@ -28,30 +28,24 @@ inline const char* RocksLogShorterFileName(const char* file) #define ROCKS_LOG_HEADER(LGR, FMT, ...) \ ROCKSDB_NAMESPACE::Log(InfoLogLevel::HEADER_LEVEL, LGR, FMT, ##__VA_ARGS__) -#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \ - ROCKSDB_NAMESPACE::Log(InfoLogLevel::DEBUG_LEVEL, LGR, \ - ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ +#define ROCKS_LOG_AT_LEVEL(LGR, LVL, FMT, ...) \ + ROCKSDB_NAMESPACE::Log((LVL), (LGR), ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) -#define ROCKS_LOG_INFO(LGR, FMT, ...) \ - ROCKSDB_NAMESPACE::Log(InfoLogLevel::INFO_LEVEL, LGR, \ - ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ - RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) +#define ROCKS_LOG_DEBUG(LGR, FMT, ...) \ + ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::DEBUG_LEVEL, FMT, ##__VA_ARGS__) -#define ROCKS_LOG_WARN(LGR, FMT, ...) \ - ROCKSDB_NAMESPACE::Log(InfoLogLevel::WARN_LEVEL, LGR, \ - ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ - RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) +#define ROCKS_LOG_INFO(LGR, FMT, ...) \ + ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::INFO_LEVEL, FMT, ##__VA_ARGS__) -#define ROCKS_LOG_ERROR(LGR, FMT, ...) \ - ROCKSDB_NAMESPACE::Log(InfoLogLevel::ERROR_LEVEL, LGR, \ - ROCKS_LOG_PREPEND_FILE_LINE(FMT), \ - RocksLogShorterFileName(__FILE__), ##__VA_ARGS__) +#define ROCKS_LOG_WARN(LGR, FMT, ...) 
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::WARN_LEVEL, FMT, ##__VA_ARGS__)

-#define ROCKS_LOG_FATAL(LGR, FMT, ...)                             \
-  ROCKSDB_NAMESPACE::Log(InfoLogLevel::FATAL_LEVEL, LGR,           \
-                         ROCKS_LOG_PREPEND_FILE_LINE(FMT),         \
-                         RocksLogShorterFileName(__FILE__), ##__VA_ARGS__)
+#define ROCKS_LOG_ERROR(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::ERROR_LEVEL, FMT, ##__VA_ARGS__)
+
+#define ROCKS_LOG_FATAL(LGR, FMT, ...) \
+  ROCKS_LOG_AT_LEVEL((LGR), InfoLogLevel::FATAL_LEVEL, FMT, ##__VA_ARGS__)

 #define ROCKS_LOG_BUFFER(LOG_BUF, FMT, ...)                                \
   ROCKSDB_NAMESPACE::LogToBuffer(LOG_BUF, ROCKS_LOG_PREPEND_FILE_LINE(FMT), \
diff --git a/memory/allocator.h b/memory/allocator.h
index 002ad5f1d8f..0d7cd60a990 100644
--- a/memory/allocator.h
+++ b/memory/allocator.h
@@ -13,6 +13,7 @@
 #pragma once
 #include <cerrno>
 #include <cstddef>
+
 #include "rocksdb/write_buffer_manager.h"

 namespace ROCKSDB_NAMESPACE {
diff --git a/memory/arena.cc b/memory/arena.cc
index 10b8969b4d7..0a920203dcf 100644
--- a/memory/arena.cc
+++ b/memory/arena.cc
@@ -8,9 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "memory/arena.h"
-#ifndef OS_WIN
-#include <sys/mman.h>
-#endif
+
 #include <algorithm>

 #include "logging/logging.h"
@@ -22,16 +20,7 @@

 namespace ROCKSDB_NAMESPACE {

-// MSVC complains that it is already defined since it is static in the header.
-#ifndef _MSC_VER
-const size_t Arena::kInlineSize;
-#endif
-
-const size_t Arena::kMinBlockSize = 4096;
-const size_t Arena::kMaxBlockSize = 2u << 30;
-static const int kAlignUnit = alignof(max_align_t);
-
-size_t OptimizeBlockSize(size_t block_size) {
+size_t Arena::OptimizeBlockSize(size_t block_size) {
   // Make sure block_size is in optimal range
   block_size = std::max(Arena::kMinBlockSize, block_size);
   block_size = std::min(Arena::kMaxBlockSize, block_size);
@@ -53,14 +42,12 @@ Arena::Arena(size_t block_size, AllocTracker* tracker, size_t huge_page_size)
   blocks_memory_ += alloc_bytes_remaining_;
   aligned_alloc_ptr_ = inline_block_;
   unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_;
-#ifdef MAP_HUGETLB
-  hugetlb_size_ = huge_page_size;
-  if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
-    hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+  if (MemMapping::kHugePageSupported) {
+    hugetlb_size_ = huge_page_size;
+    if (hugetlb_size_ && kBlockSize > hugetlb_size_) {
+      hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_;
+    }
   }
-#else
-  (void)huge_page_size;
-#endif
   if (tracker_ != nullptr) {
     tracker_->Allocate(kInlineSize);
   }
@@ -71,21 +58,6 @@ Arena::~Arena() {
     assert(tracker_->is_freed());
     tracker_->FreeMem();
   }
-  for (const auto& block : blocks_) {
-    delete[] block;
-  }
-
-#ifdef MAP_HUGETLB
-  for (const auto& mmap_info : huge_blocks_) {
-    if (mmap_info.addr_ == nullptr) {
-      continue;
-    }
-    auto ret = munmap(mmap_info.addr_, mmap_info.length_);
-    if (ret != 0) {
-      // TODO(sdong): Better handling
-    }
-  }
-#endif
 }

 char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@@ -99,12 +71,10 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   // We waste the remaining space in the current block.
   size_t size = 0;
   char* block_head = nullptr;
-#ifdef MAP_HUGETLB
-  if (hugetlb_size_) {
+  if (MemMapping::kHugePageSupported && hugetlb_size_ > 0) {
     size = hugetlb_size_;
     block_head = AllocateFromHugePage(size);
   }
-#endif
   if (!block_head) {
     size = kBlockSize;
     block_head = AllocateNewBlock(size);
@@ -123,45 +93,22 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
 }

 char* Arena::AllocateFromHugePage(size_t bytes) {
-#ifdef MAP_HUGETLB
-  if (hugetlb_size_ == 0) {
-    return nullptr;
-  }
-  // Reserve space in `huge_blocks_` before calling `mmap`.
-  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
-  // own memory and do fewer reallocations.
-  //
-  // - If `emplace_back` throws, no memory leaks because we haven't called
-  //   `mmap` yet.
-  // - If `mmap` throws, no memory leaks because the vector will be cleaned up
-  //   via RAII.
-  huge_blocks_.emplace_back(nullptr /* addr */, 0 /* length */);
-
-  void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
-                    (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), -1, 0);
-
-  if (addr == MAP_FAILED) {
-    return nullptr;
-  }
-  huge_blocks_.back() = MmapInfo(addr, bytes);
-  blocks_memory_ += bytes;
-  if (tracker_ != nullptr) {
-    tracker_->Allocate(bytes);
+  MemMapping mm = MemMapping::AllocateHuge(bytes);
+  auto addr = static_cast<char*>(mm.Get());
+  if (addr) {
+    huge_blocks_.push_back(std::move(mm));
+    blocks_memory_ += bytes;
+    if (tracker_ != nullptr) {
+      tracker_->Allocate(bytes);
+    }
   }
-  return reinterpret_cast<char*>(addr);
-#else
-  (void)bytes;
-  return nullptr;
-#endif
+  return addr;
 }

 char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
                              Logger* logger) {
-  assert((kAlignUnit & (kAlignUnit - 1)) ==
-         0);  // Pointer size should be a power of 2
-
-#ifdef MAP_HUGETLB
-  if (huge_page_size > 0 && bytes > 0) {
+  if (MemMapping::kHugePageSupported && hugetlb_size_ > 0 &&
+      huge_page_size > 0 && bytes > 0) {
     // Allocate from a huge page TLB table.
     size_t reserved_size =
         ((bytes - 1U) / huge_page_size + 1U) * huge_page_size;
@@ -177,10 +124,6 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
       return addr;
     }
   }
-#else
-  (void)huge_page_size;
-  (void)logger;
-#endif

   size_t current_mod =
       reinterpret_cast<uintptr_t>(aligned_alloc_ptr_) & (kAlignUnit - 1);
@@ -200,17 +143,11 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
 }

 char* Arena::AllocateNewBlock(size_t block_bytes) {
-  // Reserve space in `blocks_` before allocating memory via new.
-  // Use `emplace_back()` instead of `reserve()` to let std::vector manage its
-  // own memory and do fewer reallocations.
-  //
-  // - If `emplace_back` throws, no memory leaks because we haven't called `new`
-  //   yet.
-  // - If `new` throws, no memory leaks because the vector will be cleaned up
-  //   via RAII.
-  blocks_.emplace_back(nullptr);
-
+  // NOTE: std::make_unique zero-initializes the block, so it is not
+  // appropriate here
   char* block = new char[block_bytes];
+  blocks_.push_back(std::unique_ptr<char[]>(block));
+
   size_t allocated_size;
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
   allocated_size = malloc_usable_size(block);
@@ -227,7 +164,6 @@ char* Arena::AllocateNewBlock(size_t block_bytes) {
   if (tracker_ != nullptr) {
     tracker_->Allocate(allocated_size);
   }
-  blocks_.back() = block;
   return block;
 }
diff --git a/memory/arena.h b/memory/arena.h
index 1de04c4770e..39399aa71b4 100644
--- a/memory/arena.h
+++ b/memory/arena.h
@@ -12,16 +12,13 @@
 // size, it uses malloc to directly get the requested size.
 #pragma once
-#ifndef OS_WIN
-#include <sys/mman.h>
-#endif
-#include <assert.h>
-#include <stdint.h>
-#include <cerrno>
+
 #include <cstddef>
-#include <vector>
+#include <deque>
+
 #include "memory/allocator.h"
-#include "util/mutexlock.h"
+#include "port/mmap.h"
+#include "rocksdb/env.h"

 namespace ROCKSDB_NAMESPACE {

@@ -31,9 +28,13 @@ class Arena : public Allocator {
   Arena(const Arena&) = delete;
   void operator=(const Arena&) = delete;

-  static const size_t kInlineSize = 2048;
-  static const size_t kMinBlockSize;
-  static const size_t kMaxBlockSize;
+  static constexpr size_t kInlineSize = 2048;
+  static constexpr size_t kMinBlockSize = 4096;
+  static constexpr size_t kMaxBlockSize = 2u << 30;
+
+  static constexpr unsigned kAlignUnit = alignof(std::max_align_t);
+  static_assert((kAlignUnit & (kAlignUnit - 1)) == 0,
+                "Pointer size should be power of 2");

   // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the
   // supported hugepage size of the system), block allocation will try huge
@@ -63,7 +64,7 @@ class Arena : public Allocator {
   // by the arena (exclude the space allocated but not yet used for future
   // allocations).
   size_t ApproximateMemoryUsage() const {
-    return blocks_memory_ + blocks_.capacity() * sizeof(char*) -
+    return blocks_memory_ + blocks_.size() * sizeof(char*) -
            alloc_bytes_remaining_;
   }

@@ -81,21 +82,19 @@ class Arena : public Allocator {
     return blocks_.empty() && huge_blocks_.empty();
   }

+  // check and adjust the block_size so that the return value is
+  // 1. in the range of [kMinBlockSize, kMaxBlockSize].
+  // 2. the multiple of align unit.
+  static size_t OptimizeBlockSize(size_t block_size);
+
  private:
-  char inline_block_[kInlineSize] __attribute__((__aligned__(alignof(max_align_t))));
+  alignas(std::max_align_t) char inline_block_[kInlineSize];
   // Number of bytes allocated in one block
   const size_t kBlockSize;
-  // Array of new[] allocated memory blocks
-  using Blocks = std::vector<char*>;
-  Blocks blocks_;
-
-  struct MmapInfo {
-    void* addr_;
-    size_t length_;
-
-    MmapInfo(void* addr, size_t length) : addr_(addr), length_(length) {}
-  };
-  std::vector<MmapInfo> huge_blocks_;
+  // Allocated memory blocks
+  std::deque<std::unique_ptr<char[]>> blocks_;
+  // Huge page allocations
+  std::deque<MemMapping> huge_blocks_;
   size_t irregular_block_num = 0;

   // Stats for current active block.
@@ -108,15 +107,15 @@ class Arena : public Allocator {
   // How many bytes left in currently active block?
   size_t alloc_bytes_remaining_ = 0;

-#ifdef MAP_HUGETLB
   size_t hugetlb_size_ = 0;
-#endif  // MAP_HUGETLB
+
   char* AllocateFromHugePage(size_t bytes);
   char* AllocateFallback(size_t bytes, bool aligned);
   char* AllocateNewBlock(size_t block_bytes);

   // Bytes of memory in blocks allocated so far
   size_t blocks_memory_ = 0;
+  // Non-owned
   AllocTracker* tracker_;
 };

@@ -133,9 +132,4 @@ inline char* Arena::Allocate(size_t bytes) {
   return AllocateFallback(bytes, false /* unaligned */);
 }

-// check and adjust the block_size so that the return value is
-// 1. in the range of [kMinBlockSize, kMaxBlockSize].
-// 2. the multiple of align unit.
-extern size_t OptimizeBlockSize(size_t block_size);
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/memory/arena_test.cc b/memory/arena_test.cc
index 1a0b7be83df..21bf7ed6282 100644
--- a/memory/arena_test.cc
+++ b/memory/arena_test.cc
@@ -8,6 +8,11 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "memory/arena.h" + +#ifndef OS_WIN +#include +#endif +#include "port/port.h" #include "test_util/testharness.h" #include "util/random.h" @@ -31,7 +36,7 @@ bool CheckMemoryAllocated(size_t allocated, size_t expected) { void MemoryAllocatedBytesTest(size_t huge_page_size) { const int N = 17; - size_t req_sz; // requested size + size_t req_sz; // requested size size_t bsz = 32 * 1024; // block size size_t expected_memory_allocated; @@ -196,6 +201,91 @@ TEST_F(ArenaTest, Simple) { SimpleTest(0); SimpleTest(kHugePageSize); } + +// Number of minor page faults since last call +size_t PopMinorPageFaultCount() { +#ifdef RUSAGE_SELF + static long prev = 0; + struct rusage usage; + EXPECT_EQ(getrusage(RUSAGE_SELF, &usage), 0); + size_t rv = usage.ru_minflt - prev; + prev = usage.ru_minflt; + return rv; +#else + // Conservative + return SIZE_MAX; +#endif // RUSAGE_SELF +} + +TEST(MmapTest, AllocateLazyZeroed) { + // Doesn't have to be page aligned + constexpr size_t len = 1234567; + MemMapping m = MemMapping::AllocateLazyZeroed(len); + auto arr = static_cast(m.Get()); + + // Should generally work + ASSERT_NE(arr, nullptr); + + // Start counting page faults + PopMinorPageFaultCount(); + + // Access half of the allocation + size_t i = 0; + for (; i < len / 2; ++i) { + ASSERT_EQ(arr[i], 0); + arr[i] = static_cast(i & 255); + } + + // Appropriate page faults (maybe more) + size_t faults = PopMinorPageFaultCount(); + ASSERT_GE(faults, len / 2 / port::kPageSize); + + // Access rest of the allocation + for (; i < len; ++i) { + ASSERT_EQ(arr[i], 0); + arr[i] = static_cast(i & 255); + } + + // Appropriate page faults (maybe more) + faults = PopMinorPageFaultCount(); + ASSERT_GE(faults, len / 2 / port::kPageSize); + + // Verify data + for (i = 0; i < len; ++i) { + ASSERT_EQ(arr[i], static_cast(i & 255)); + } +} + +TEST_F(ArenaTest, UnmappedAllocation) { + // Verify that it's possible to get unmapped pages in large allocations, + // for memory efficiency and to ensure we don't accidentally waste time & + // space initializing the memory. + constexpr size_t kBlockSize = 2U << 20; + Arena arena(kBlockSize); + + // The allocator might give us back recycled memory for a while, but + // shouldn't last forever. + for (int i = 0;; ++i) { + char* p = arena.Allocate(kBlockSize); + + // Start counting page faults + PopMinorPageFaultCount(); + + // Overwrite the whole allocation + for (size_t j = 0; j < kBlockSize; ++j) { + p[j] = static_cast(j & 255); + } + + size_t faults = PopMinorPageFaultCount(); + if (faults >= kBlockSize * 3 / 4 / port::kPageSize) { + // Most of the access generated page faults => GOOD + break; + } + // Should have succeeded after enough tries + ASSERT_LT(i, 1000); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/memory/concurrent_arena.cc b/memory/concurrent_arena.cc index 3d45ca94939..1619bd93b00 100644 --- a/memory/concurrent_arena.cc +++ b/memory/concurrent_arena.cc @@ -8,7 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "memory/concurrent_arena.h" + #include + #include "port/port.h" #include "util/random.h" diff --git a/memory/concurrent_arena.h b/memory/concurrent_arena.h index d2fbc2c931f..f14507d302e 100644 --- a/memory/concurrent_arena.h +++ b/memory/concurrent_arena.h @@ -11,6 +11,7 @@ #include #include #include + #include "memory/allocator.h" #include "memory/arena.h" #include "port/lang.h" diff --git a/memory/memory_allocator.h b/memory/memory_allocator.h index f1a548659b3..68aa35beb86 100644 --- a/memory/memory_allocator.h +++ b/memory/memory_allocator.h @@ -6,6 +6,8 @@ #pragma once +#include + #include "rocksdb/memory_allocator.h" namespace ROCKSDB_NAMESPACE { @@ -35,4 +37,11 @@ inline CacheAllocationPtr AllocateBlock(size_t size, return CacheAllocationPtr(new char[size]); } +inline CacheAllocationPtr AllocateAndCopyBlock(const Slice& data, + MemoryAllocator* allocator) { + CacheAllocationPtr cap = AllocateBlock(data.size(), allocator); + std::copy_n(data.data(), data.size(), cap.get()); + return cap; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/alloc_tracker.cc b/memtable/alloc_tracker.cc index fe213434714..4c6d3543193 100644 --- a/memtable/alloc_tracker.cc +++ b/memtable/alloc_tracker.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include + #include "memory/allocator.h" #include "memory/arena.h" #include "rocksdb/write_buffer_manager.h" diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index f990e89f713..a717683048c 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -77,9 +77,7 @@ struct Node { next_.store(x, std::memory_order_release); } // No-barrier variants that can be safely used in a few locations. - Node* NoBarrier_Next() { - return next_.load(std::memory_order_relaxed); - } + Node* NoBarrier_Next() { return next_.load(std::memory_order_relaxed); } void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); } @@ -296,9 +294,9 @@ class HashLinkListRep : public MemTableRep { // Advance to the first entry with a key >= target void Seek(const Slice& internal_key, const char* memtable_key) override { - const char* encoded_key = - (memtable_key != nullptr) ? - memtable_key : EncodeKey(&tmp_, internal_key); + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); iter_.Seek(encoded_key); } @@ -324,7 +322,7 @@ class HashLinkListRep : public MemTableRep { // To destruct with the iterator. 
std::unique_ptr full_list_; std::unique_ptr allocator_; - std::string tmp_; // For passing to EncodeKey + std::string tmp_; // For passing to EncodeKey }; class LinkListIterator : public MemTableRep::Iterator { @@ -365,8 +363,8 @@ class HashLinkListRep : public MemTableRep { // Advance to the first entry with a key >= target void Seek(const Slice& internal_key, const char* /*memtable_key*/) override { - node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, - internal_key); + node_ = + hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key); } // Retreat to the last entry with a key <= target @@ -398,15 +396,14 @@ class HashLinkListRep : public MemTableRep { head_ = head; node_ = nullptr; } + private: friend class HashLinkListRep; const HashLinkListRep* const hash_link_list_rep_; Node* head_; Node* node_; - virtual void SeekToHead() { - node_ = head_; - } + virtual void SeekToHead() { node_ = head_; } }; class DynamicIterator : public HashLinkListRep::LinkListIterator { @@ -486,7 +483,7 @@ class HashLinkListRep : public MemTableRep { // This is used when there wasn't a bucket. It is cheaper than // instantiating an empty bucket over which to iterate. public: - EmptyIterator() { } + EmptyIterator() {} bool Valid() const override { return false; } const char* key() const override { assert(false); @@ -521,7 +518,7 @@ HashLinkListRep::HashLinkListRep( bucket_entries_logging_threshold_(bucket_entries_logging_threshold), if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, - huge_page_tlb_size, logger); + huge_page_tlb_size, logger); buckets_ = new (mem) Pointer[bucket_size]; @@ -530,8 +527,7 @@ HashLinkListRep::HashLinkListRep( } } -HashLinkListRep::~HashLinkListRep() { -} +HashLinkListRep::~HashLinkListRep() {} KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { char* mem = allocator_->AllocateAligned(sizeof(Node) + len); @@ -633,9 +629,10 @@ void HashLinkListRep::Insert(KeyHandle handle) { if (bucket_entries_logging_threshold_ > 0 && header->GetNumEntries() == static_cast(bucket_entries_logging_threshold_)) { - Info(logger_, "HashLinkedList bucket %" ROCKSDB_PRIszt - " has more than %d " - "entries. Key to insert: %s", + Info(logger_, + "HashLinkedList bucket %" ROCKSDB_PRIszt + " has more than %d " + "entries. Key to insert: %s", GetHash(transformed), header->GetNumEntries(), GetLengthPrefixedSlice(x->key).ToString(true).c_str()); } @@ -786,7 +783,7 @@ MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { for (itr.SeekToFirst(); itr.Valid(); itr.Next()) { list->Insert(itr.key()); count++; - } + } } } if (if_log_bucket_dist_when_flash_) { diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index dc58046a451..9d093829ba8 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -118,9 +118,9 @@ class HashSkipListRep : public MemTableRep { // Advance to the first entry with a key >= target void Seek(const Slice& internal_key, const char* memtable_key) override { if (list_ != nullptr) { - const char* encoded_key = - (memtable_key != nullptr) ? - memtable_key : EncodeKey(&tmp_, internal_key); + const char* encoded_key = (memtable_key != nullptr) + ? 
memtable_key + : EncodeKey(&tmp_, internal_key); iter_.Seek(encoded_key); } } @@ -158,6 +158,7 @@ class HashSkipListRep : public MemTableRep { iter_.SetList(list); own_list_ = false; } + private: // if list_ is nullptr, we should NEVER call any methods on iter_ // if list_ is nullptr, this Iterator is not Valid() @@ -167,14 +168,14 @@ class HashSkipListRep : public MemTableRep { // responsible for it's cleaning. This is a poor man's std::shared_ptr bool own_list_; std::unique_ptr arena_; - std::string tmp_; // For passing to EncodeKey + std::string tmp_; // For passing to EncodeKey }; class DynamicIterator : public HashSkipListRep::Iterator { public: explicit DynamicIterator(const HashSkipListRep& memtable_rep) - : HashSkipListRep::Iterator(nullptr, false), - memtable_rep_(memtable_rep) {} + : HashSkipListRep::Iterator(nullptr, false), + memtable_rep_(memtable_rep) {} // Advance to the first entry with a key >= target void Seek(const Slice& k, const char* memtable_key) override { @@ -208,7 +209,7 @@ class HashSkipListRep : public MemTableRep { // This is used when there wasn't a bucket. It is cheaper than // instantiating an empty bucket over which to iterate. public: - EmptyIterator() { } + EmptyIterator() {} bool Valid() const override { return false; } const char* key() const override { assert(false); @@ -239,8 +240,8 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, transform_(transform), compare_(compare), allocator_(allocator) { - auto mem = allocator->AllocateAligned( - sizeof(std::atomic) * bucket_size); + auto mem = + allocator->AllocateAligned(sizeof(std::atomic) * bucket_size); buckets_ = new (mem) std::atomic[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { @@ -248,8 +249,7 @@ HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, } } -HashSkipListRep::~HashSkipListRep() { -} +HashSkipListRep::~HashSkipListRep() {} HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( const Slice& transformed) { @@ -281,9 +281,7 @@ bool HashSkipListRep::Contains(const char* key) const { return bucket->Contains(key); } -size_t HashSkipListRep::ApproximateMemoryUsage() { - return 0; -} +size_t HashSkipListRep::ApproximateMemoryUsage() { return 0; } void HashSkipListRep::Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)) { @@ -388,7 +386,7 @@ MemTableRepFactory* NewHashSkipListRepFactory( size_t bucket_count, int32_t skiplist_height, int32_t skiplist_branching_factor) { return new HashSkipListRepFactory(bucket_count, skiplist_height, - skiplist_branching_factor); + skiplist_branching_factor); } } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 4a4e63df00b..abb3c3ddb7f 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -43,9 +43,11 @@ #pragma once #include #include + #include #include #include + #include "memory/allocator.h" #include "port/likely.h" #include "port/port.h" @@ -62,8 +64,8 @@ class InlineSkipList { struct Splice; public: - using DecodedKey = \ - typename std::remove_reference::type::DecodedType; + using DecodedKey = + typename std::remove_reference::type::DecodedType; static const uint16_t kMaxPossibleHeight = 32; @@ -264,9 +266,9 @@ class InlineSkipList { // point to a node that is before the key, and after should point to // a node that is after the key. after should be nullptr if a good after // node isn't conveniently available. 
- template - void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, int level, - Node** out_prev, Node** out_next); + template + void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, + int level, Node** out_prev, Node** out_next); // Recomputes Splice levels from highest_level (inclusive) down to // lowest_level (inclusive). @@ -766,8 +768,8 @@ void InlineSkipList::FindSpliceForLevel(const DecodedKey& key, PREFETCH(next->Next(level), 0, 1); } if (prefetch_before == true) { - if (next != nullptr && level>0) { - PREFETCH(next->Next(level-1), 0, 1); + if (next != nullptr && level > 0) { + PREFETCH(next->Next(level - 1), 0, 1); } } assert(before == head_ || next == nullptr || @@ -791,7 +793,7 @@ void InlineSkipList::RecomputeSpliceLevels(const DecodedKey& key, assert(recompute_level <= splice->height_); for (int i = recompute_level - 1; i >= 0; --i) { FindSpliceForLevel(key, splice->prev_[i + 1], splice->next_[i + 1], i, - &splice->prev_[i], &splice->next_[i]); + &splice->prev_[i], &splice->next_[i]); } } @@ -881,8 +883,7 @@ bool InlineSkipList::Insert(const char* key, Splice* splice, // we're pessimistic, recompute everything recompute_height = max_height; } - } else if (KeyIsAfterNode(key_decoded, - splice->next_[recompute_height])) { + } else if (KeyIsAfterNode(key_decoded, splice->next_[recompute_height])) { // key is from after splice if (allow_partial_splice_fix) { Node* bad = splice->next_[recompute_height]; diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index 1f3c6a69172..f856440649b 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -8,8 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "memtable/inlineskiplist.h" + #include #include + #include "memory/concurrent_arena.h" #include "rocksdb/env.h" #include "test_util/testharness.h" @@ -34,9 +36,7 @@ static Key Decode(const char* key) { struct TestComparator { using DecodedType = Key; - static DecodedType decode_key(const char* b) { - return Decode(b); - } + static DecodedType decode_key(const char* b) { return Decode(b); } int operator()(const char* a, const char* b) const { if (Decode(a) < Decode(b)) { diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 1eaa7658f0d..a915abed786 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -467,8 +467,8 @@ class FillBenchmark : public Benchmark { num_write_ops_per_thread_ = FLAGS_num_operations; } - void RunThreads(std::vector* /*threads*/, uint64_t* bytes_written, - uint64_t* bytes_read, bool /*write*/, + void RunThreads(std::vector* /*threads*/, + uint64_t* bytes_written, uint64_t* bytes_read, bool /*write*/, uint64_t* read_hits) override { FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_, num_write_ops_per_thread_, read_hits)(); diff --git a/memtable/skiplist.h b/memtable/skiplist.h index 52818e3020e..e3cecd30c1f 100644 --- a/memtable/skiplist.h +++ b/memtable/skiplist.h @@ -33,14 +33,16 @@ #pragma once #include #include + #include + #include "memory/allocator.h" #include "port/port.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { -template +template class SkipList { private: struct Node; @@ -119,7 +121,7 @@ class SkipList { // Immutable after construction Comparator const compare_; - Allocator* const allocator_; // Allocator used for allocations of nodes + Allocator* const allocator_; // Allocator used for allocations of nodes Node* const head_; @@ 
-164,9 +166,9 @@ class SkipList { }; // Implementation details follow -template +template struct SkipList::Node { - explicit Node(const Key& k) : key(k) { } + explicit Node(const Key& k) : key(k) {} Key const key; @@ -200,43 +202,43 @@ struct SkipList::Node { std::atomic next_[1]; }; -template -typename SkipList::Node* -SkipList::NewNode(const Key& key, int height) { +template +typename SkipList::Node* SkipList::NewNode( + const Key& key, int height) { char* mem = allocator_->AllocateAligned( sizeof(Node) + sizeof(std::atomic) * (height - 1)); return new (mem) Node(key); } -template +template inline SkipList::Iterator::Iterator(const SkipList* list) { SetList(list); } -template +template inline void SkipList::Iterator::SetList(const SkipList* list) { list_ = list; node_ = nullptr; } -template +template inline bool SkipList::Iterator::Valid() const { return node_ != nullptr; } -template +template inline const Key& SkipList::Iterator::key() const { assert(Valid()); return node_->key; } -template +template inline void SkipList::Iterator::Next() { assert(Valid()); node_ = node_->Next(0); } -template +template inline void SkipList::Iterator::Prev() { // Instead of using explicit "prev" links, we just search for the // last node that falls before key. @@ -247,7 +249,7 @@ inline void SkipList::Iterator::Prev() { } } -template +template inline void SkipList::Iterator::Seek(const Key& target) { node_ = list_->FindGreaterOrEqual(target); } @@ -269,7 +271,7 @@ inline void SkipList::Iterator::SeekToFirst() { node_ = list_->head_->Next(0); } -template +template inline void SkipList::Iterator::SeekToLast() { node_ = list_->FindLast(); if (node_ == list_->head_) { @@ -277,7 +279,7 @@ inline void SkipList::Iterator::SeekToLast() { } } -template +template int SkipList::RandomHeight() { auto rnd = Random::GetTLSInstance(); @@ -291,15 +293,15 @@ int SkipList::RandomHeight() { return height; } -template +template bool SkipList::KeyIsAfterNode(const Key& key, Node* n) const { // nullptr n is considered infinite return (n != nullptr) && (compare_(n->key, key) < 0); } -template -typename SkipList::Node* SkipList:: - FindGreaterOrEqual(const Key& key) const { +template +typename SkipList::Node* +SkipList::FindGreaterOrEqual(const Key& key) const { // Note: It looks like we could reduce duplication by implementing // this function as FindLessThan(key)->Next(0), but we wouldn't be able // to exit early on equality and the result wouldn't even be correct. @@ -315,8 +317,8 @@ typename SkipList::Node* SkipList:: assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); // Make sure we haven't overshot during our search assert(x == head_ || KeyIsAfterNode(key, x)); - int cmp = (next == nullptr || next == last_bigger) - ? 1 : compare_(next->key, key); + int cmp = + (next == nullptr || next == last_bigger) ? 1 : compare_(next->key, key); if (cmp == 0 || (cmp > 0 && level == 0)) { return next; } else if (cmp < 0) { @@ -330,7 +332,7 @@ typename SkipList::Node* SkipList:: } } -template +template typename SkipList::Node* SkipList::FindLessThan(const Key& key, Node** prev) const { Node* x = head_; @@ -360,7 +362,7 @@ SkipList::FindLessThan(const Key& key, Node** prev) const { } } -template +template typename SkipList::Node* SkipList::FindLast() const { Node* x = head_; @@ -424,14 +426,14 @@ SkipList::SkipList(const Comparator cmp, Allocator* allocator, // prev_ does not need to be freed, as its life cycle is tied up with // the allocator as a whole. 
prev_ = reinterpret_cast( - allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); + allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); for (int i = 0; i < kMaxHeight_; i++) { head_->SetNext(i, nullptr); prev_[i] = head_; } } -template +template void SkipList::Insert(const Key& key) { // fast path for sequential insertion if (!KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) && @@ -460,7 +462,7 @@ void SkipList::Insert(const Key& key) { for (int i = GetMaxHeight(); i < height; i++) { prev_[i] = head_; } - //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); + // fprintf(stderr, "Change height from %d to %d\n", max_height_, height); // It is ok to mutate max_height_ without any synchronization // with concurrent readers. A concurrent reader that observes @@ -483,7 +485,7 @@ void SkipList::Insert(const Key& key) { prev_height_ = height; } -template +template bool SkipList::Contains(const Key& key) const { Node* x = FindGreaterOrEqual(key); if (x != nullptr && Equal(key, x->key)) { diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 1d43d734b4e..a070885110f 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -8,7 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "memtable/skiplist.h" + #include + #include "memory/arena.h" #include "rocksdb/env.h" #include "test_util/testharness.h" @@ -169,7 +171,7 @@ class ConcurrentTest { static uint64_t hash(Key key) { return key & 0xff; } static uint64_t HashNumbers(uint64_t k, uint64_t g) { - uint64_t data[2] = { k, g }; + uint64_t data[2] = {k, g}; return Hash(reinterpret_cast(data), sizeof(data), 0); } @@ -311,11 +313,7 @@ class TestState { int seed_; std::atomic quit_flag_; - enum ReaderState { - STARTING, - RUNNING, - DONE - }; + enum ReaderState { STARTING, RUNNING, DONE }; explicit TestState(int s) : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {} diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 5b8577e8792..40f13a2c17d 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -21,74 +21,76 @@ class SkipListRep : public MemTableRep { const size_t lookahead_; friend class LookaheadIterator; -public: - explicit SkipListRep(const MemTableRep::KeyComparator& compare, - Allocator* allocator, const SliceTransform* transform, - const size_t lookahead) - : MemTableRep(allocator), - skip_list_(compare, allocator), - cmp_(compare), - transform_(transform), - lookahead_(lookahead) {} - - KeyHandle Allocate(const size_t len, char** buf) override { - *buf = skip_list_.AllocateKey(len); - return static_cast(*buf); - } + + public: + explicit SkipListRep(const MemTableRep::KeyComparator& compare, + Allocator* allocator, const SliceTransform* transform, + const size_t lookahead) + : MemTableRep(allocator), + skip_list_(compare, allocator), + cmp_(compare), + transform_(transform), + lookahead_(lookahead) {} + + KeyHandle Allocate(const size_t len, char** buf) override { + *buf = skip_list_.AllocateKey(len); + return static_cast(*buf); + } // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. 
- void Insert(KeyHandle handle) override { - skip_list_.Insert(static_cast(handle)); - } + void Insert(KeyHandle handle) override { + skip_list_.Insert(static_cast(handle)); + } - bool InsertKey(KeyHandle handle) override { - return skip_list_.Insert(static_cast(handle)); - } + bool InsertKey(KeyHandle handle) override { + return skip_list_.Insert(static_cast(handle)); + } - void InsertWithHint(KeyHandle handle, void** hint) override { - skip_list_.InsertWithHint(static_cast(handle), hint); - } + void InsertWithHint(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHint(static_cast(handle), hint); + } - bool InsertKeyWithHint(KeyHandle handle, void** hint) override { - return skip_list_.InsertWithHint(static_cast(handle), hint); - } + bool InsertKeyWithHint(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHint(static_cast(handle), hint); + } - void InsertWithHintConcurrently(KeyHandle handle, void** hint) override { - skip_list_.InsertWithHintConcurrently(static_cast(handle), hint); - } + void InsertWithHintConcurrently(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHintConcurrently(static_cast(handle), hint); + } - bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override { - return skip_list_.InsertWithHintConcurrently(static_cast(handle), - hint); - } + bool InsertKeyWithHintConcurrently(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHintConcurrently(static_cast(handle), + hint); + } - void InsertConcurrently(KeyHandle handle) override { - skip_list_.InsertConcurrently(static_cast(handle)); - } + void InsertConcurrently(KeyHandle handle) override { + skip_list_.InsertConcurrently(static_cast(handle)); + } - bool InsertKeyConcurrently(KeyHandle handle) override { - return skip_list_.InsertConcurrently(static_cast(handle)); - } + bool InsertKeyConcurrently(KeyHandle handle) override { + return skip_list_.InsertConcurrently(static_cast(handle)); + } // Returns true iff an entry that compares equal to key is in the list. 
- bool Contains(const char* key) const override { - return skip_list_.Contains(key); - } - - size_t ApproximateMemoryUsage() override { - // All memory is allocated through allocator; nothing to report here - return 0; - } - - void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, const char* entry)) override { - SkipListRep::Iterator iter(&skip_list_); - Slice dummy_slice; - for (iter.Seek(dummy_slice, k.memtable_key().data()); - iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { - } - } + bool Contains(const char* key) const override { + return skip_list_.Contains(key); + } + + size_t ApproximateMemoryUsage() override { + // All memory is allocated through allocator; nothing to report here + return 0; + } + + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override { + SkipListRep::Iterator iter(&skip_list_); + Slice dummy_slice; + for (iter.Seek(dummy_slice, k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); + iter.Next()) { + } + } uint64_t ApproximateNumEntries(const Slice& start_ikey, const Slice& end_ikey) override { @@ -218,7 +220,7 @@ class SkipListRep : public MemTableRep { void SeekToLast() override { iter_.SeekToLast(); } protected: - std::string tmp_; // For passing to EncodeKey + std::string tmp_; // For passing to EncodeKey }; // Iterator over the contents of a skip list which also keeps track of the @@ -227,8 +229,8 @@ class SkipListRep : public MemTableRep { // the target key hasn't been found. class LookaheadIterator : public MemTableRep::Iterator { public: - explicit LookaheadIterator(const SkipListRep& rep) : - rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} + explicit LookaheadIterator(const SkipListRep& rep) + : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} ~LookaheadIterator() override {} @@ -271,9 +273,9 @@ class SkipListRep : public MemTableRep { } void Seek(const Slice& internal_key, const char* memtable_key) override { - const char *encoded_key = - (memtable_key != nullptr) ? - memtable_key : EncodeKey(&tmp_, internal_key); + const char* encoded_key = (memtable_key != nullptr) + ? memtable_key + : EncodeKey(&tmp_, internal_key); if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) { // prev_.key() is smaller or equal to our target key; do a quick @@ -313,7 +315,7 @@ class SkipListRep : public MemTableRep { } protected: - std::string tmp_; // For passing to EncodeKey + std::string tmp_; // For passing to EncodeKey private: const SkipListRep& rep_; @@ -323,19 +325,20 @@ class SkipListRep : public MemTableRep { MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { if (lookahead_ > 0) { - void *mem = - arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator)) - : operator new(sizeof(SkipListRep::LookaheadIterator)); + void* mem = + arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator)) + : + operator new(sizeof(SkipListRep::LookaheadIterator)); return new (mem) SkipListRep::LookaheadIterator(*this); } else { - void *mem = - arena ? arena->AllocateAligned(sizeof(SkipListRep::Iterator)) - : operator new(sizeof(SkipListRep::Iterator)); + void* mem = arena ? 
arena->AllocateAligned(sizeof(SkipListRep::Iterator)) + : + operator new(sizeof(SkipListRep::Iterator)); return new (mem) SkipListRep::Iterator(&skip_list_); } } }; -} +} // namespace static std::unordered_map skiplist_factory_info = { #ifndef ROCKSDB_LITE diff --git a/memtable/stl_wrappers.h b/memtable/stl_wrappers.h index e9f8f214ce1..783a8088d02 100644 --- a/memtable/stl_wrappers.h +++ b/memtable/stl_wrappers.h @@ -29,5 +29,5 @@ struct Compare : private Base { } }; -} +} // namespace stl_wrappers } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc index 26c699ca63c..29316334999 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -48,13 +48,14 @@ class VectorRep : public MemTableRep { std::shared_ptr> bucket_; std::vector::const_iterator mutable cit_; const KeyComparator& compare_; - std::string tmp_; // For passing to EncodeKey + std::string tmp_; // For passing to EncodeKey bool mutable sorted_; void DoSort() const; + public: explicit Iterator(class VectorRep* vrep, - std::shared_ptr> bucket, - const KeyComparator& compare); + std::shared_ptr> bucket, + const KeyComparator& compare); // Initialize an iterator over the specified collection. // The returned iterator is not valid. @@ -123,12 +124,10 @@ void VectorRep::MarkReadOnly() { } size_t VectorRep::ApproximateMemoryUsage() { - return - sizeof(bucket_) + sizeof(*bucket_) + - bucket_->size() * - sizeof( - std::remove_reference::type::value_type - ); + return sizeof(bucket_) + sizeof(*bucket_) + + bucket_->size() * + sizeof( + std::remove_reference::type::value_type); } VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator, @@ -142,13 +141,13 @@ VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator, } VectorRep::Iterator::Iterator(class VectorRep* vrep, - std::shared_ptr> bucket, - const KeyComparator& compare) -: vrep_(vrep), - bucket_(bucket), - cit_(bucket_->end()), - compare_(compare), - sorted_(false) { } + std::shared_ptr> bucket, + const KeyComparator& compare) + : vrep_(vrep), + bucket_(bucket), + cit_(bucket_->end()), + compare_(compare), + sorted_(false) {} void VectorRep::Iterator::DoSort() const { // vrep is non-null means that we are working on an immutable memtable @@ -216,12 +215,11 @@ void VectorRep::Iterator::Seek(const Slice& user_key, // Do binary search to find first value not less than the target const char* encoded_key = (memtable_key != nullptr) ? 
memtable_key : EncodeKey(&tmp_, user_key); - cit_ = std::equal_range(bucket_->begin(), - bucket_->end(), - encoded_key, - [this] (const char* a, const char* b) { + cit_ = std::equal_range(bucket_->begin(), bucket_->end(), encoded_key, + [this](const char* a, const char* b) { return compare_(a, b) < 0; - }).first; + }) + .first; } // Advance to the first entry with a key <= target @@ -282,7 +280,7 @@ MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { } } else { std::shared_ptr tmp; - tmp.reset(new Bucket(*bucket_)); // make a copy + tmp.reset(new Bucket(*bucket_)); // make a copy if (arena == nullptr) { return new Iterator(nullptr, tmp, compare_); } else { @@ -290,7 +288,7 @@ MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { } } } -} // anon namespace +} // namespace static std::unordered_map vector_rep_table_info = { {"count", diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 546df894a9a..1cc4c2cc576 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" + #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/histogram.cc b/monitoring/histogram.cc index 4d00219c974..61bc6c14097 100644 --- a/monitoring/histogram.cc +++ b/monitoring/histogram.cc @@ -52,11 +52,10 @@ size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { } namespace { - const HistogramBucketMapper bucketMapper; +const HistogramBucketMapper bucketMapper; } -HistogramStat::HistogramStat() - : num_buckets_(bucketMapper.BucketCount()) { +HistogramStat::HistogramStat() : num_buckets_(bucketMapper.BucketCount()) { assert(num_buckets_ == sizeof(buckets_) / sizeof(*buckets_)); Clear(); } @@ -109,12 +108,14 @@ void HistogramStat::Merge(const HistogramStat& other) { uint64_t old_min = min(); uint64_t other_min = other.min(); while (other_min < old_min && - !min_.compare_exchange_weak(old_min, other_min)) {} + !min_.compare_exchange_weak(old_min, other_min)) { + } uint64_t old_max = max(); uint64_t other_max = other.max(); while (other_max > old_max && - !max_.compare_exchange_weak(old_max, other_max)) {} + !max_.compare_exchange_weak(old_max, other_max)) { + } num_.fetch_add(other.num(), std::memory_order_relaxed); sum_.fetch_add(other.sum(), std::memory_order_relaxed); @@ -124,9 +125,7 @@ void HistogramStat::Merge(const HistogramStat& other) { } } -double HistogramStat::Median() const { - return Percentile(50.0); -} +double HistogramStat::Median() const { return Percentile(50.0); } double HistogramStat::Percentile(double p) const { double threshold = num() * (p / 100.0); @@ -136,14 +135,14 @@ double HistogramStat::Percentile(double p) const { cumulative_sum += bucket_value; if (cumulative_sum >= threshold) { // Scale linearly within this bucket - uint64_t left_point = (b == 0) ? 0 : bucketMapper.BucketLimit(b-1); + uint64_t left_point = (b == 0) ? 
0 : bucketMapper.BucketLimit(b - 1); uint64_t right_point = bucketMapper.BucketLimit(b); uint64_t left_sum = cumulative_sum - bucket_value; uint64_t right_sum = cumulative_sum; double pos = 0; uint64_t right_left_diff = right_sum - left_sum; if (right_left_diff != 0) { - pos = (threshold - left_sum) / right_left_diff; + pos = (threshold - left_sum) / right_left_diff; } double r = left_point + (right_point - left_point) * pos; uint64_t cur_min = min(); @@ -180,8 +179,7 @@ std::string HistogramStat::ToString() const { uint64_t cur_num = num(); std::string r; char buf[1650]; - snprintf(buf, sizeof(buf), - "Count: %" PRIu64 " Average: %.4f StdDev: %.2f\n", + snprintf(buf, sizeof(buf), "Count: %" PRIu64 " Average: %.4f StdDev: %.2f\n", cur_num, Average(), StandardDeviation()); r.append(buf); snprintf(buf, sizeof(buf), @@ -195,7 +193,7 @@ std::string HistogramStat::ToString() const { Percentile(99.99)); r.append(buf); r.append("------------------------------------------------------\n"); - if (cur_num == 0) return r; // all buckets are empty + if (cur_num == 0) return r; // all buckets are empty const double mult = 100.0 / cur_num; uint64_t cumulative_sum = 0; for (unsigned int b = 0; b < num_buckets_; b++) { @@ -205,11 +203,11 @@ std::string HistogramStat::ToString() const { snprintf(buf, sizeof(buf), "%c %7" PRIu64 ", %7" PRIu64 " ] %8" PRIu64 " %7.3f%% %7.3f%% ", (b == 0) ? '[' : '(', - (b == 0) ? 0 : bucketMapper.BucketLimit(b-1), // left - bucketMapper.BucketLimit(b), // right - bucket_value, // count - (mult * bucket_value), // percentage - (mult * cumulative_sum)); // cumulative percentage + (b == 0) ? 0 : bucketMapper.BucketLimit(b - 1), // left + bucketMapper.BucketLimit(b), // right + bucket_value, // count + (mult * bucket_value), // percentage + (mult * cumulative_sum)); // cumulative percentage r.append(buf); // Add hash marks based on percentage; 20 marks for 100%. 
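To make the interpolation in `HistogramStat::Percentile` above concrete, a worked example with made-up numbers:

```cpp
// Suppose num() = 100 and p = 50.0, so threshold = 100 * 0.5 = 50 samples.
// The cumulative count first reaches 50 inside bucket (100, 200]; buckets
// before it held 40 samples and this one holds 20:
//   left_sum  = 40, right_sum = 60, left_point = 100, right_point = 200
//   pos = (50 - 40) / (60 - 40) = 0.5
//   r   = 100 + (200 - 100) * 0.5 = 150
// The reported p50 is 150, linearly interpolated within the bucket (the code
// then clamps r to the observed min/max fetched just after this point).
```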
@@ -220,7 +218,7 @@ std::string HistogramStat::ToString() const { return r; } -void HistogramStat::Data(HistogramData * const data) const { +void HistogramStat::Data(HistogramData* const data) const { assert(data); data->median = Median(); data->percentile95 = Percentile(95); @@ -238,13 +236,9 @@ void HistogramImpl::Clear() { stats_.Clear(); } -bool HistogramImpl::Empty() const { - return stats_.Empty(); -} +bool HistogramImpl::Empty() const { return stats_.Empty(); } -void HistogramImpl::Add(uint64_t value) { - stats_.Add(value); -} +void HistogramImpl::Add(uint64_t value) { stats_.Add(value); } void HistogramImpl::Merge(const Histogram& other) { if (strcmp(Name(), other.Name()) == 0) { @@ -257,28 +251,20 @@ void HistogramImpl::Merge(const HistogramImpl& other) { stats_.Merge(other.stats_); } -double HistogramImpl::Median() const { - return stats_.Median(); -} +double HistogramImpl::Median() const { return stats_.Median(); } double HistogramImpl::Percentile(double p) const { return stats_.Percentile(p); } -double HistogramImpl::Average() const { - return stats_.Average(); -} +double HistogramImpl::Average() const { return stats_.Average(); } double HistogramImpl::StandardDeviation() const { - return stats_.StandardDeviation(); + return stats_.StandardDeviation(); } -std::string HistogramImpl::ToString() const { - return stats_.ToString(); -} +std::string HistogramImpl::ToString() const { return stats_.ToString(); } -void HistogramImpl::Data(HistogramData * const data) const { - stats_.Data(data); -} +void HistogramImpl::Data(HistogramData* const data) const { stats_.Data(data); } } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/histogram.h b/monitoring/histogram.h index 41af0144225..15fee2b4f8d 100644 --- a/monitoring/histogram.h +++ b/monitoring/histogram.h @@ -8,36 +8,29 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "rocksdb/statistics.h" - #include -#include -#include #include #include +#include +#include + +#include "rocksdb/statistics.h" namespace ROCKSDB_NAMESPACE { class HistogramBucketMapper { public: - HistogramBucketMapper(); // converts a value to the bucket index. size_t IndexForValue(uint64_t value) const; // number of buckets required. 
- size_t BucketCount() const { - return bucketValues_.size(); - } + size_t BucketCount() const { return bucketValues_.size(); } - uint64_t LastValue() const { - return maxBucketValue_; - } + uint64_t LastValue() const { return maxBucketValue_; } - uint64_t FirstValue() const { - return minBucketValue_; - } + uint64_t FirstValue() const { return minBucketValue_; } uint64_t BucketLimit(const size_t bucketNumber) const { assert(bucketNumber < BucketCount()); @@ -88,14 +81,14 @@ struct HistogramStat { std::atomic_uint_fast64_t num_; std::atomic_uint_fast64_t sum_; std::atomic_uint_fast64_t sum_squares_; - std::atomic_uint_fast64_t buckets_[109]; // 109==BucketMapper::BucketCount() + std::atomic_uint_fast64_t buckets_[109]; // 109==BucketMapper::BucketCount() const uint64_t num_buckets_; }; class Histogram { -public: + public: Histogram() {} - virtual ~Histogram() {}; + virtual ~Histogram(){}; virtual void Clear() = 0; virtual bool Empty() const = 0; diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index 834884cbd2e..19e9f15d05b 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -18,14 +18,14 @@ namespace ROCKSDB_NAMESPACE { class HistogramTest : public testing::Test {}; namespace { - const double kIota = 0.1; - const HistogramBucketMapper bucketMapper; - std::shared_ptr clock = - std::make_shared(SystemClock::Default()); -} - -void PopulateHistogram(Histogram& histogram, - uint64_t low, uint64_t high, uint64_t loop = 1) { +const double kIota = 0.1; +const HistogramBucketMapper bucketMapper; +std::shared_ptr clock = + std::make_shared(SystemClock::Default()); +} // namespace + +void PopulateHistogram(Histogram& histogram, uint64_t low, uint64_t high, + uint64_t loop = 1) { Random rnd(test::RandomSeed()); for (; loop > 0; loop--) { for (uint64_t i = low; i <= high; i++) { @@ -39,7 +39,7 @@ void PopulateHistogram(Histogram& histogram, } void BasicOperation(Histogram& histogram) { - PopulateHistogram(histogram, 1, 110, 10); // fill up to bucket [70, 110) + PopulateHistogram(histogram, 1, 110, 10); // fill up to bucket [70, 110) HistogramData data; histogram.Data(&data); @@ -47,8 +47,8 @@ void BasicOperation(Histogram& histogram) { ASSERT_LE(fabs(histogram.Percentile(100.0) - 110.0), kIota); ASSERT_LE(fabs(data.percentile99 - 108.9), kIota); // 99 * 110 / 100 ASSERT_LE(fabs(data.percentile95 - 104.5), kIota); // 95 * 110 / 100 - ASSERT_LE(fabs(data.median - 55.0), kIota); // 50 * 110 / 100 - ASSERT_EQ(data.average, 55.5); // (1 + 110) / 2 + ASSERT_LE(fabs(data.median - 55.0), kIota); // 50 * 110 / 100 + ASSERT_EQ(data.average, 55.5); // (1 + 110) / 2 } void MergeHistogram(Histogram& histogram, Histogram& other) { @@ -62,8 +62,8 @@ void MergeHistogram(Histogram& histogram, Histogram& other) { ASSERT_LE(fabs(histogram.Percentile(100.0) - 250.0), kIota); ASSERT_LE(fabs(data.percentile99 - 247.5), kIota); // 99 * 250 / 100 ASSERT_LE(fabs(data.percentile95 - 237.5), kIota); // 95 * 250 / 100 - ASSERT_LE(fabs(data.median - 125.0), kIota); // 50 * 250 / 100 - ASSERT_EQ(data.average, 125.5); // (1 + 250) / 2 + ASSERT_LE(fabs(data.median - 125.0), kIota); // 50 * 250 / 100 + ASSERT_EQ(data.average, 125.5); // (1 + 250) / 2 } void EmptyHistogram(Histogram& histogram) { @@ -139,8 +139,8 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { int micros_per_window = 1000000; uint64_t min_num_per_window = 0; - HistogramWindowingImpl - histogramWindowing(num_windows, micros_per_window, min_num_per_window); + HistogramWindowingImpl histogramWindowing(num_windows, 
micros_per_window, + min_num_per_window); histogramWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); clock->SleepForMicroseconds(micros_per_window); @@ -190,10 +190,10 @@ TEST_F(HistogramTest, HistogramWindowingMerge) { int micros_per_window = 1000000; uint64_t min_num_per_window = 0; - HistogramWindowingImpl - histogramWindowing(num_windows, micros_per_window, min_num_per_window); - HistogramWindowingImpl - otherWindowing(num_windows, micros_per_window, min_num_per_window); + HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window, + min_num_per_window); + HistogramWindowingImpl otherWindowing(num_windows, micros_per_window, + min_num_per_window); histogramWindowing.TEST_UpdateClock(clock); otherWindowing.TEST_UpdateClock(clock); diff --git a/monitoring/histogram_windowing.cc b/monitoring/histogram_windowing.cc index f31bbe06ace..c41ae8a03de 100644 --- a/monitoring/histogram_windowing.cc +++ b/monitoring/histogram_windowing.cc @@ -23,11 +23,10 @@ HistogramWindowingImpl::HistogramWindowingImpl() { Clear(); } -HistogramWindowingImpl::HistogramWindowingImpl( - uint64_t num_windows, - uint64_t micros_per_window, - uint64_t min_num_per_window) : - num_windows_(num_windows), +HistogramWindowingImpl::HistogramWindowingImpl(uint64_t num_windows, + uint64_t micros_per_window, + uint64_t min_num_per_window) + : num_windows_(num_windows), micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { clock_ = SystemClock::Default(); @@ -35,8 +34,7 @@ HistogramWindowingImpl::HistogramWindowingImpl( Clear(); } -HistogramWindowingImpl::~HistogramWindowingImpl() { -} +HistogramWindowingImpl::~HistogramWindowingImpl() {} void HistogramWindowingImpl::Clear() { std::lock_guard lock(mutex_); @@ -55,7 +53,7 @@ bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } // of any operation. // Each individual value is atomic, it is just that some samples can go // in the older bucket which is tolerable. 
-void HistogramWindowingImpl::Add(uint64_t value){ +void HistogramWindowingImpl::Add(uint64_t value) { TimerTick(); // Parent (global) member update @@ -83,17 +81,15 @@ void HistogramWindowingImpl::Merge(const HistogramWindowingImpl& other) { uint64_t cur_window = current_window(); uint64_t other_cur_window = other.current_window(); // going backwards for alignment - for (unsigned int i = 0; - i < std::min(num_windows_, other.num_windows_); i++) { - uint64_t window_index = - (cur_window + num_windows_ - i) % num_windows_; + for (unsigned int i = 0; i < std::min(num_windows_, other.num_windows_); + i++) { + uint64_t window_index = (cur_window + num_windows_ - i) % num_windows_; uint64_t other_window_index = (other_cur_window + other.num_windows_ - i) % other.num_windows_; size_t windex = static_cast(window_index); size_t other_windex = static_cast(other_window_index); - window_stats_[windex].Merge( - other.window_stats_[other_windex]); + window_stats_[windex].Merge(other.window_stats_[other_windex]); } } @@ -101,9 +97,7 @@ std::string HistogramWindowingImpl::ToString() const { return stats_.ToString(); } -double HistogramWindowingImpl::Median() const { - return Percentile(50.0); -} +double HistogramWindowingImpl::Median() const { return Percentile(50.0); } double HistogramWindowingImpl::Percentile(double p) const { // Retry 3 times in total @@ -118,15 +112,13 @@ double HistogramWindowingImpl::Percentile(double p) const { return 0.0; } -double HistogramWindowingImpl::Average() const { - return stats_.Average(); -} +double HistogramWindowingImpl::Average() const { return stats_.Average(); } double HistogramWindowingImpl::StandardDeviation() const { return stats_.StandardDeviation(); } -void HistogramWindowingImpl::Data(HistogramData * const data) const { +void HistogramWindowingImpl::Data(HistogramData* const data) const { stats_.Data(data); } @@ -149,17 +141,17 @@ void HistogramWindowingImpl::SwapHistoryBucket() { last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); uint64_t curr_window = current_window(); - uint64_t next_window = (curr_window == num_windows_ - 1) ? - 0 : curr_window + 1; + uint64_t next_window = + (curr_window == num_windows_ - 1) ? 
0 : curr_window + 1; // subtract next buckets from totals and swap to next buckets - HistogramStat& stats_to_drop = - window_stats_[static_cast(next_window)]; + HistogramStat& stats_to_drop = + window_stats_[static_cast(next_window)]; if (!stats_to_drop.Empty()) { - for (size_t b = 0; b < stats_.num_buckets_; b++){ - stats_.buckets_[b].fetch_sub( - stats_to_drop.bucket_at(b), std::memory_order_relaxed); + for (size_t b = 0; b < stats_.num_buckets_; b++) { + stats_.buckets_[b].fetch_sub(stats_to_drop.bucket_at(b), + std::memory_order_relaxed); } if (stats_.min() == stats_to_drop.min()) { @@ -186,8 +178,8 @@ void HistogramWindowingImpl::SwapHistoryBucket() { stats_.num_.fetch_sub(stats_to_drop.num(), std::memory_order_relaxed); stats_.sum_.fetch_sub(stats_to_drop.sum(), std::memory_order_relaxed); - stats_.sum_squares_.fetch_sub( - stats_to_drop.sum_squares(), std::memory_order_relaxed); + stats_.sum_squares_.fetch_sub(stats_to_drop.sum_squares(), + std::memory_order_relaxed); stats_to_drop.Clear(); } diff --git a/monitoring/histogram_windowing.h b/monitoring/histogram_windowing.h index f8da07b3665..9a862671f4f 100644 --- a/monitoring/histogram_windowing.h +++ b/monitoring/histogram_windowing.h @@ -14,12 +14,10 @@ namespace ROCKSDB_NAMESPACE { class SystemClock; -class HistogramWindowingImpl : public Histogram -{ -public: +class HistogramWindowingImpl : public Histogram { + public: HistogramWindowingImpl(); - HistogramWindowingImpl(uint64_t num_windows, - uint64_t micros_per_window, + HistogramWindowingImpl(uint64_t num_windows, uint64_t micros_per_window, uint64_t min_num_per_window); HistogramWindowingImpl(const HistogramWindowingImpl&) = delete; @@ -56,7 +54,7 @@ class HistogramWindowingImpl : public Histogram inline uint64_t current_window() const { return current_window_.load(std::memory_order_relaxed); } - inline uint64_t last_swap_time() const{ + inline uint64_t last_swap_time() const { return last_swap_time_.load(std::memory_order_relaxed); } diff --git a/monitoring/in_memory_stats_history.cc b/monitoring/in_memory_stats_history.cc index dba791e2b80..568d8ec134f 100644 --- a/monitoring/in_memory_stats_history.cc +++ b/monitoring/in_memory_stats_history.cc @@ -7,6 +7,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "monitoring/in_memory_stats_history.h" + #include "db/db_impl/db_impl.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/instrumented_mutex.h b/monitoring/instrumented_mutex.h index f2b0564bb30..e5aae34dfb8 100644 --- a/monitoring/instrumented_mutex.h +++ b/monitoring/instrumented_mutex.h @@ -46,9 +46,7 @@ class InstrumentedMutex { void Unlock() { mutex_.Unlock(); } - void AssertHeld() { - mutex_.AssertHeld(); - } + void AssertHeld() { mutex_.AssertHeld(); } private: void LockInternal(); @@ -76,9 +74,7 @@ class InstrumentedMutexLock { mutex_->Lock(); } - ~InstrumentedMutexLock() { - mutex_->Unlock(); - } + ~InstrumentedMutexLock() { mutex_->Unlock(); } private: InstrumentedMutex* const mutex_; @@ -114,13 +110,9 @@ class InstrumentedCondVar { bool TimedWait(uint64_t abs_time_us); - void Signal() { - cond_.Signal(); - } + void Signal() { cond_.Signal(); } - void SignalAll() { - cond_.SignalAll(); - } + void SignalAll() { cond_.SignalAll(); } private: void WaitInternal(); diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 2acc555dc75..04e98914da9 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
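The `instrumented_mutex.h` changes above are formatting-only; for orientation, a hypothetical caller showing the RAII pattern those one-liners implement:

```cpp
// The guard locks in its constructor and unlocks in its destructor, so the
// critical section is exactly the guard's scope.
InstrumentedMutex mutex;

void DoWork() {
  InstrumentedMutexLock guard(&mutex);  // mutex.Lock() runs here
  // ... critical section ...
}  // mutex.Unlock() runs in ~InstrumentedMutexLock()
```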
#include + #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" @@ -17,9 +18,7 @@ static IOStatsContext iostats_context; thread_local IOStatsContext iostats_context; #endif -IOStatsContext* get_iostats_context() { - return &iostats_context; -} +IOStatsContext* get_iostats_context() { return &iostats_context; } void IOStatsContext::Reset() { #ifndef NIOSTATS_CONTEXT diff --git a/monitoring/iostats_context_test.cc b/monitoring/iostats_context_test.cc index ea40822ff4c..5fce33406a7 100644 --- a/monitoring/iostats_context_test.cc +++ b/monitoring/iostats_context_test.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "rocksdb/iostats_context.h" + #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 33c71187317..9068ede0111 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -5,6 +5,7 @@ // #include + #include "monitoring/perf_context_imp.h" namespace ROCKSDB_NAMESPACE { @@ -17,9 +18,7 @@ PerfContext perf_context; thread_local PerfContext perf_context; #endif -PerfContext* get_perf_context() { - return &perf_context; -} +PerfContext* get_perf_context() { return &perf_context; } PerfContext::~PerfContext() { #if !defined(NPERF_CONTEXT) && !defined(OS_SOLARIS) @@ -499,15 +498,14 @@ void PerfContext::Reset() { ss << #counter << " = " << counter << ", "; \ } -#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter) \ - if (per_level_perf_context_enabled && \ - level_to_perf_context) { \ - ss << #counter << " = "; \ - for (auto& kv : *level_to_perf_context) { \ - if (!exclude_zero_counters || (kv.second.counter > 0)) { \ - ss << kv.second.counter << "@level" << kv.first << ", "; \ - } \ - } \ +#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter) \ + if (per_level_perf_context_enabled && level_to_perf_context) { \ + ss << #counter << " = "; \ + for (auto& kv : *level_to_perf_context) { \ + if (!exclude_zero_counters || (kv.second.counter > 0)) { \ + ss << kv.second.counter << "@level" << kv.first << ", "; \ + } \ + } \ } void PerfContextByLevel::Reset() { @@ -638,11 +636,11 @@ void PerfContext::EnablePerLevelPerfContext() { per_level_perf_context_enabled = true; } -void PerfContext::DisablePerLevelPerfContext(){ +void PerfContext::DisablePerLevelPerfContext() { per_level_perf_context_enabled = false; } -void PerfContext::ClearPerLevelPerfContext(){ +void PerfContext::ClearPerLevelPerfContext() { if (level_to_perf_context != nullptr) { level_to_perf_context->clear(); delete level_to_perf_context; diff --git a/monitoring/perf_level.cc b/monitoring/perf_level.cc index 9190af3021e..e3507624b63 100644 --- a/monitoring/perf_level.cc +++ b/monitoring/perf_level.cc @@ -5,6 +5,7 @@ // #include + #include "monitoring/perf_level_imp.h" namespace ROCKSDB_NAMESPACE { @@ -17,8 +18,6 @@ void SetPerfLevel(PerfLevel level) { perf_level = level; } -PerfLevel GetPerfLevel() { - return perf_level; -} +PerfLevel GetPerfLevel() { return perf_level; } } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/perf_level_imp.h b/monitoring/perf_level_imp.h index 68540e12570..28bd185cd1e 100644 --- a/monitoring/perf_level_imp.h +++ b/monitoring/perf_level_imp.h @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). 
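The per-level perf-context functions reformatted above belong to the public `PerfContext` API; a sketch of the intended lifecycle, where only the usage pattern (not the names) is made up:

```cpp
PerfContext* ctx = get_perf_context();
ctx->EnablePerLevelPerfContext();
// ... run reads; counters accumulate per LSM level in level_to_perf_context ...
ctx->DisablePerLevelPerfContext();
ctx->ClearPerLevelPerfContext();  // per the diff, this also deletes the map
```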
// #pragma once -#include "rocksdb/perf_level.h" #include "port/port.h" +#include "rocksdb/perf_level.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index fb049f7252b..8deb312527f 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -26,9 +26,7 @@ class PerfStepTimer { metric_(metric), statistics_(statistics) {} - ~PerfStepTimer() { - Stop(); - } + ~PerfStepTimer() { Stop(); } void Start() { if (perf_counter_enabled_ || statistics_ != nullptr) { diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc index 9bde38b3ab3..f4c022148c8 100644 --- a/monitoring/persistent_stats_history.cc +++ b/monitoring/persistent_stats_history.cc @@ -11,6 +11,7 @@ #include #include #include + #include "db/db_impl/db_impl.h" #include "util/string_util.h" diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 794cb2f90a9..e01eed3f381 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -471,7 +471,7 @@ namespace { // a buffer size used for temp string buffers const int kTmpStrBufferSize = 200; -} // namespace +} // namespace std::string StatisticsImpl::ToString() const { MutexLock lock(&aggregate_lock_); diff --git a/monitoring/statistics.h b/monitoring/statistics.h index 20661777fb0..e0dc29d2846 100644 --- a/monitoring/statistics.h +++ b/monitoring/statistics.h @@ -4,8 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // #pragma once -#include "rocksdb/statistics.h" - #include #include #include @@ -14,6 +12,7 @@ #include "monitoring/histogram.h" #include "port/likely.h" #include "port/port.h" +#include "rocksdb/statistics.h" #include "util/core_local.h" #include "util/mutexlock.h" @@ -94,14 +93,15 @@ class StatisticsImpl : public Statistics { INTERNAL_HISTOGRAM_ENUM_MAX * sizeof(HistogramImpl)) % CACHE_LINE_SIZE)] ROCKSDB_FIELD_UNUSED; #endif - void *operator new(size_t s) { return port::cacheline_aligned_alloc(s); } - void *operator new[](size_t s) { return port::cacheline_aligned_alloc(s); } - void operator delete(void *p) { port::cacheline_aligned_free(p); } - void operator delete[](void *p) { port::cacheline_aligned_free(p); } + void* operator new(size_t s) { return port::cacheline_aligned_alloc(s); } + void* operator new[](size_t s) { return port::cacheline_aligned_alloc(s); } + void operator delete(void* p) { port::cacheline_aligned_free(p); } + void operator delete[](void* p) { port::cacheline_aligned_free(p); } }; #ifndef TEST_CACHE_LINE_SIZE - static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); + static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, + "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); #endif CoreLocalArray per_core_stats_; diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index 21ac786b428..fed8535f4fb 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -66,10 +66,10 @@ TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
- dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); @@ -98,17 +98,17 @@ TEST_F(StatsHistoryTest, StatsPersistScheduling) { // Wait for the first stats persist to finish, as the initial delay could be // different. - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); int old_val = counter; - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec * 2); }); ASSERT_EQ(counter, old_val); @@ -130,7 +130,7 @@ TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { {{"stats_persist_period_sec", std::to_string(kPeriodSec)}})); ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); Close(); @@ -149,11 +149,11 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { ReopenWithColumnFamilies({"default", "pikachu"}, options); // make sure the first stats persist to finish - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::unique_ptr stats_iter; @@ -171,7 +171,7 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { ASSERT_GT(stats_count, 0); // Wait a bit and verify no more stats are found for (int i = 0; i < 10; ++i) { - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(1); }); } ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); @@ -226,7 +226,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { const int kIterations = 10; for (int i = 0; i < kIterations; ++i) { - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } @@ -250,7 +250,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { // Wait for stats persist to finish for (int i = 0; i < kIterations; ++i) { - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } @@ -299,11 +299,11 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
- dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = @@ -311,14 +311,14 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { int key_count1 = countkeys(iter); delete iter; - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count2 = countkeys(iter); delete iter; - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); @@ -392,32 +392,32 @@ TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { // Wait for the first stats persist to finish, as the initial delay could be // different. - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::map stats_map_after; @@ -481,10 +481,10 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { ASSERT_EQ(Get(2, "foo"), "bar"); // make sure the first stats persist to finish - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); @@ -581,7 +581,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
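For reference, the knobs these tests toggle at runtime via SetDBOptions can also be configured up front. A sketch, hedged in that the option fields come from rocksdb/options.h rather than this diff:

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/statistics.h"

    ROCKSDB_NAMESPACE::Options MakeStatsPersistOptions() {
      ROCKSDB_NAMESPACE::Options options;
      options.create_if_missing = true;
      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
      // Sample the statistics object every 5 seconds; 0 disables sampling,
      // which is what the cancel-through-SetOptions test above relies on.
      options.stats_persist_period_sec = 5;
      // Persist samples into the hidden stats column family instead of the
      // in-memory buffer capped by stats_history_buffer_size.
      options.persist_stats_to_disk = true;
      return options;
    }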
- dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); ColumnFamilyData* cfd_default = @@ -600,7 +600,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_OK(Put(1, "Eevee", "v0")); ASSERT_EQ("v0", Get(1, "Eevee")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flush default cf // LogNumbers: default: 16, stats: 10, pikachu: 5 @@ -629,7 +629,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v2", Get("bar2")); ASSERT_EQ("v2", Get("foo2")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to default and stats cf, flushing default cf // LogNumbers: default: 19, stats: 19, pikachu: 19 @@ -644,7 +644,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_OK(Put(1, "Jolteon", "v3")); ASSERT_EQ("v3", Get(1, "Jolteon")); - dbfull()->TEST_WaitForPeridicTaskRun( + dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flushing test cf // LogNumbers: default: 19, stats: 19, pikachu: 22 diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h index 792d4208f01..762c73ae2bb 100644 --- a/monitoring/thread_status_updater.h +++ b/monitoring/thread_status_updater.h @@ -36,9 +36,9 @@ #include #include +#include "port/port.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" -#include "port/port.h" #include "util/thread_operation.h" namespace ROCKSDB_NAMESPACE { @@ -49,11 +49,9 @@ class ColumnFamilyHandle; struct ConstantColumnFamilyInfo { #ifdef ROCKSDB_USING_THREAD_STATUS public: - ConstantColumnFamilyInfo( - const void* _db_key, - const std::string& _db_name, - const std::string& _cf_name) : - db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {} + ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name, + const std::string& _cf_name) + : db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {} const void* db_key; const std::string db_name; const std::string cf_name; @@ -142,13 +140,11 @@ class ThreadStatusUpdater { // will be set in std::memory_order_release. This is to ensure // whenever a thread operation is not OP_UNKNOWN, we will always // have a consistent information on its properties. - void SetThreadOperationProperty( - int i, uint64_t value); + void SetThreadOperationProperty(int i, uint64_t value); // Increase the "i"th property of the current operation with // the specified delta. - void IncreaseThreadOperationProperty( - int i, uint64_t delta); + void IncreaseThreadOperationProperty(int i, uint64_t delta); // Update the thread operation stage of the current thread. ThreadStatus::OperationStage SetThreadOperationStage( @@ -167,15 +163,13 @@ class ThreadStatusUpdater { void ClearThreadState(); // Obtain the status of all active registered threads. - Status GetThreadList( - std::vector* thread_list); + Status GetThreadList(std::vector* thread_list); // Create an entry in the global ColumnFamilyInfo table for the // specified column family. This function should be called only // when the current thread does not hold db_mutex. 
- void NewColumnFamilyInfo( - const void* db_key, const std::string& db_name, - const void* cf_key, const std::string& cf_name); + void NewColumnFamilyInfo(const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name); // Erase all ConstantColumnFamilyInfo that is associated with the // specified db instance. This function should be called only when @@ -190,8 +184,7 @@ class ThreadStatusUpdater { // Verifies whether the input ColumnFamilyHandles matches // the information stored in the current cf_info_map. void TEST_VerifyColumnFamilyInfoMap( - const std::vector& handles, - bool check_exist); + const std::vector& handles, bool check_exist); protected: #ifdef ROCKSDB_USING_THREAD_STATUS @@ -204,9 +197,7 @@ class ThreadStatusUpdater { // Directly returns the pointer to thread_status_data_ without // checking whether enabling_tracking is true of not. - ThreadStatusData* Get() { - return thread_status_data_; - } + ThreadStatusData* Get() { return thread_status_data_; } // The mutex that protects cf_info_map and db_key_map. std::mutex thread_list_mutex_; @@ -222,8 +213,7 @@ class ThreadStatusUpdater { // A db_key to cf_key map that allows erasing elements in cf_info_map // associated to the same db_key faster. - std::unordered_map< - const void*, std::unordered_set> db_key_map_; + std::unordered_map> db_key_map_; #else static ThreadStatusData* thread_status_data_; diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h index 70ef4e2ebc1..0137d26823f 100644 --- a/monitoring/thread_status_util.h +++ b/monitoring/thread_status_util.h @@ -30,8 +30,8 @@ class ColumnFamilyData; class ThreadStatusUtil { public: // Register the current thread for tracking. - static void RegisterThread( - const Env* env, ThreadStatus::ThreadType thread_type); + static void RegisterThread(const Env* env, + ThreadStatus::ThreadType thread_type); // Unregister the current thread. static void UnregisterThread(); @@ -62,19 +62,17 @@ class ThreadStatusUtil { static ThreadStatus::OperationStage SetThreadOperationStage( ThreadStatus::OperationStage stage); - static void SetThreadOperationProperty( - int code, uint64_t value); + static void SetThreadOperationProperty(int code, uint64_t value); - static void IncreaseThreadOperationProperty( - int code, uint64_t delta); + static void IncreaseThreadOperationProperty(int code, uint64_t delta); static void SetThreadState(ThreadStatus::StateType type); static void ResetThreadStatus(); #ifndef NDEBUG - static void TEST_SetStateDelay( - const ThreadStatus::StateType state, int micro); + static void TEST_SetStateDelay(const ThreadStatus::StateType state, + int micro); static void TEST_StateDelay(const ThreadStatus::StateType state); #endif @@ -121,8 +119,7 @@ class ThreadStatusUtil { // and set the thread state to the previous state in its destructor. class AutoThreadOperationStageUpdater { public: - explicit AutoThreadOperationStageUpdater( - ThreadStatus::OperationStage stage); + explicit AutoThreadOperationStageUpdater(ThreadStatus::OperationStage stage); ~AutoThreadOperationStageUpdater(); #ifdef ROCKSDB_USING_THREAD_STATUS diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index c493ddca542..f7a94355dbe 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -15,8 +15,8 @@ namespace ROCKSDB_NAMESPACE { // the delay for debugging purpose. 
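ThreadStatusUpdater::GetThreadList in the header above feeds the public Env::GetThreadList. A consumer sketch; note the assumption (not part of this diff) that the DB was opened with DBOptions::enable_thread_tracking, without which the list carries no operation detail:

    #include <vector>

    #include "rocksdb/env.h"
    #include "rocksdb/thread_status.h"

    void ListBackgroundWork() {
      std::vector<ROCKSDB_NAMESPACE::ThreadStatus> threads;
      ROCKSDB_NAMESPACE::Status s =
          ROCKSDB_NAMESPACE::Env::Default()->GetThreadList(&threads);
      if (!s.ok()) {
        return;
      }
      for (const auto& t : threads) {
        // t.db_name / t.cf_name come from the ConstantColumnFamilyInfo
        // entries registered via NewColumnFamilyInfo; t.operation_type
        // says whether the thread is compacting, flushing, etc.
        (void)t;
      }
    }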
static std::atomic states_delay[ThreadStatus::NUM_STATE_TYPES]; -void ThreadStatusUtil::TEST_SetStateDelay( - const ThreadStatus::StateType state, int micro) { +void ThreadStatusUtil::TEST_SetStateDelay(const ThreadStatus::StateType state, + int micro) { states_delay[state].store(micro, std::memory_order_relaxed); } diff --git a/options/customizable_test.cc b/options/customizable_test.cc index 9d3c86c6205..2ed4eeb9e7e 100644 --- a/options/customizable_test.cc +++ b/options/customizable_test.cc @@ -1324,13 +1324,14 @@ class TestSecondaryCache : public SecondaryCache { public: static const char* kClassName() { return "Test"; } const char* Name() const override { return kClassName(); } - Status Insert(const Slice& /*key*/, void* /*value*/, + Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*value*/, const Cache::CacheItemHelper* /*helper*/) override { return Status::NotSupported(); } std::unique_ptr Lookup( - const Slice& /*key*/, const Cache::CreateCallback& /*create_cb*/, - bool /*wait*/, bool /*advise_erase*/, bool& is_in_sec_cache) override { + const Slice& /*key*/, const Cache::CacheItemHelper* /*helper*/, + Cache::CreateContext* /*create_context*/, bool /*wait*/, + bool /*advise_erase*/, bool& is_in_sec_cache) override { is_in_sec_cache = true; return nullptr; } diff --git a/port/lang.h b/port/lang.h index 754f99bf225..52c597acdc0 100644 --- a/port/lang.h +++ b/port/lang.h @@ -11,7 +11,9 @@ #elif defined(__GNUC__) && __GNUC__ >= 7 #define FALLTHROUGH_INTENDED [[gnu::fallthrough]] #else -#define FALLTHROUGH_INTENDED do {} while (0) +#define FALLTHROUGH_INTENDED \ + do { \ + } while (0) #endif #endif diff --git a/port/likely.h b/port/likely.h index 397d757133e..0bd90d70153 100644 --- a/port/likely.h +++ b/port/likely.h @@ -10,9 +10,9 @@ #pragma once #if defined(__GNUC__) && __GNUC__ >= 4 -#define LIKELY(x) (__builtin_expect((x), 1)) +#define LIKELY(x) (__builtin_expect((x), 1)) #define UNLIKELY(x) (__builtin_expect((x), 0)) #else -#define LIKELY(x) (x) +#define LIKELY(x) (x) #define UNLIKELY(x) (x) #endif diff --git a/port/mmap.cc b/port/mmap.cc new file mode 100644 index 00000000000..36e8f32617f --- /dev/null +++ b/port/mmap.cc @@ -0,0 +1,98 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "port/mmap.h" + +#include +#include +#include +#include +#include + +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +MemMapping::~MemMapping() { +#ifdef OS_WIN + if (addr_ != nullptr) { + (void)::UnmapViewOfFile(addr_); + } + if (page_file_handle_ != NULL) { + (void)::CloseHandle(page_file_handle_); + } +#else // OS_WIN -> !OS_WIN + if (addr_ != nullptr) { + auto status = munmap(addr_, length_); + assert(status == 0); + if (status != 0) { + // TODO: handle error? 
+ } + } +#endif // OS_WIN +} + +MemMapping::MemMapping(MemMapping&& other) noexcept { + *this = std::move(other); +} + +MemMapping& MemMapping::operator=(MemMapping&& other) noexcept { + if (&other == this) { + return *this; + } + this->~MemMapping(); + std::memcpy(this, &other, sizeof(*this)); + new (&other) MemMapping(); + return *this; +} + +MemMapping MemMapping::AllocateAnonymous(size_t length, bool huge) { + MemMapping mm; + mm.length_ = length; + assert(mm.addr_ == nullptr); + if (length == 0) { + // OK to leave addr as nullptr + return mm; + } + int huge_flag = 0; +#ifdef OS_WIN + if (huge) { +#ifdef FILE_MAP_LARGE_PAGES + huge_flag = FILE_MAP_LARGE_PAGES; +#endif // FILE_MAP_LARGE_PAGES + } + mm.page_file_handle_ = ::CreateFileMapping( + INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE | SEC_COMMIT, + Upper32of64(length), Lower32of64(length), nullptr); + if (mm.page_file_handle_ == NULL) { + // Failure + return mm; + } + mm.addr_ = ::MapViewOfFile(mm.page_file_handle_, FILE_MAP_WRITE | huge_flag, + 0, 0, length); +#else // OS_WIN -> !OS_WIN + if (huge) { +#ifdef MAP_HUGETLB + huge_flag = MAP_HUGETLB; +#endif // MAP_HUGE_TLB + } + mm.addr_ = mmap(nullptr, length, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | huge_flag, -1, 0); + if (mm.addr_ == MAP_FAILED) { + mm.addr_ = nullptr; + } +#endif // OS_WIN + return mm; +} + +MemMapping MemMapping::AllocateHuge(size_t length) { + return AllocateAnonymous(length, /*huge*/ true); +} + +MemMapping MemMapping::AllocateLazyZeroed(size_t length) { + return AllocateAnonymous(length, /*huge*/ false); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/port/mmap.h b/port/mmap.h new file mode 100644 index 00000000000..7342a13f967 --- /dev/null +++ b/port/mmap.h @@ -0,0 +1,70 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef OS_WIN +#include "port/win/port_win.h" +// ^^^ For proper/safe inclusion of windows.h. Must come first. +#include +#else +#include +#endif // OS_WIN + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// An RAII wrapper for mmaped memory +class MemMapping { + public: + static constexpr bool kHugePageSupported = +#if defined(MAP_HUGETLB) || defined(FILE_MAP_LARGE_PAGES) + true; +#else + false; +#endif + + // Allocate memory requesting to be backed by huge pages + static MemMapping AllocateHuge(size_t length); + + // Allocate memory that is only lazily mapped to resident memory and + // guaranteed to be zero-initialized. Note that some platforms like + // Linux allow memory over-commit, where only the used portion of memory + // matters, while other platforms require enough swap space (page file) to + // back the full mapping. 
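A usage sketch for this class, using only names declared in this header; the huge-page-then-fallback policy is one plausible caller pattern, not something the diff prescribes:

    #include <cstddef>

    #include "port/mmap.h"

    void UseScratch(size_t len) {
      using ROCKSDB_NAMESPACE::MemMapping;
      // Prefer huge pages where the platform supports them; AllocateHuge
      // can still fail at runtime (e.g. no huge pages reserved), so check
      // Get() and fall back to an ordinary lazily-zeroed mapping.
      MemMapping mm = MemMapping::kHugePageSupported
                          ? MemMapping::AllocateHuge(len)
                          : MemMapping::AllocateLazyZeroed(len);
      if (mm.Get() == nullptr) {
        mm = MemMapping::AllocateLazyZeroed(len);
      }
      if (mm.Get() != nullptr) {
        char* p = static_cast<char*>(mm.Get());
        p[0] = 1;  // touch; pages materialize lazily, zero-filled
      }
      // ~MemMapping releases the mapping automatically.
    }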
+ static MemMapping AllocateLazyZeroed(size_t length); + + // No copies + MemMapping(const MemMapping&) = delete; + MemMapping& operator=(const MemMapping&) = delete; + // Move + MemMapping(MemMapping&&) noexcept; + MemMapping& operator=(MemMapping&&) noexcept; + + // Releases the mapping + ~MemMapping(); + + inline void* Get() const { return addr_; } + inline size_t Length() const { return length_; } + + private: + MemMapping() {} + + // The mapped memory, or nullptr on failure / not supported + void* addr_ = nullptr; + // The known usable number of bytes starting at that address + size_t length_ = 0; + +#ifdef OS_WIN + HANDLE page_file_handle_ = NULL; +#endif // OS_WIN + + static MemMapping AllocateAnonymous(size_t length, bool huge); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/port/port_dirent.h b/port/port_dirent.h index 192abec4d6d..2b23e2f0760 100644 --- a/port/port_dirent.h +++ b/port/port_dirent.h @@ -33,11 +33,11 @@ int closedir(DIR* dirp); } // namespace port -using port::dirent; +using port::closedir; using port::DIR; +using port::dirent; using port::opendir; using port::readdir; -using port::closedir; } // namespace ROCKSDB_NAMESPACE diff --git a/port/port_posix.cc b/port/port_posix.cc index 935c8a97837..3872293b817 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -57,23 +57,21 @@ static int PthreadCall(const char* label, int result) { } Mutex::Mutex(bool adaptive) { - (void) adaptive; + (void)adaptive; #ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX if (!adaptive) { PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); } else { pthread_mutexattr_t mutex_attr; PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); - PthreadCall("set mutex attr", - pthread_mutexattr_settype(&mutex_attr, - PTHREAD_MUTEX_ADAPTIVE_NP)); + PthreadCall("set mutex attr", pthread_mutexattr_settype( + &mutex_attr, PTHREAD_MUTEX_ADAPTIVE_NP)); PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); - PthreadCall("destroy mutex attr", - pthread_mutexattr_destroy(&mutex_attr)); + PthreadCall("destroy mutex attr", pthread_mutexattr_destroy(&mutex_attr)); } #else PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); -#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX +#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX } Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } @@ -108,9 +106,8 @@ void Mutex::AssertHeld() { #endif } -CondVar::CondVar(Mutex* mu) - : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, nullptr)); +CondVar::CondVar(Mutex* mu) : mu_(mu) { + PthreadCall("init cv", pthread_cond_init(&cv_, nullptr)); } CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } @@ -146,9 +143,7 @@ bool CondVar::TimedWait(uint64_t abs_time_us) { return false; } -void CondVar::Signal() { - PthreadCall("signal", pthread_cond_signal(&cv_)); -} +void CondVar::Signal() { PthreadCall("signal", pthread_cond_signal(&cv_)); } void CondVar::SignalAll() { PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); @@ -158,28 +153,40 @@ RWMutex::RWMutex() { PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr)); } -RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } +RWMutex::~RWMutex() { + PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); +} -void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } +void RWMutex::ReadLock() { + PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); +} -void RWMutex::WriteLock() { PthreadCall("write lock", 
pthread_rwlock_wrlock(&mu_)); } +void RWMutex::WriteLock() { + PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); +} -void RWMutex::ReadUnlock() { PthreadCall("read unlock", pthread_rwlock_unlock(&mu_)); } +void RWMutex::ReadUnlock() { + PthreadCall("read unlock", pthread_rwlock_unlock(&mu_)); +} -void RWMutex::WriteUnlock() { PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); } +void RWMutex::WriteUnlock() { + PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); +} int PhysicalCoreID() { #if defined(ROCKSDB_SCHED_GETCPU_PRESENT) && defined(__x86_64__) && \ (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 22)) - // sched_getcpu uses VDSO getcpu() syscall since 2.22. I believe Linux offers VDSO - // support only on x86_64. This is the fastest/preferred method if available. + // sched_getcpu uses VDSO getcpu() syscall since 2.22. I believe Linux offers + // VDSO support only on x86_64. This is the fastest/preferred method if + // available. int cpuno = sched_getcpu(); if (cpuno < 0) { return -1; } return cpuno; #elif defined(__x86_64__) || defined(__i386__) - // clang/gcc both provide cpuid.h, which defines __get_cpuid(), for x86_64 and i386. + // clang/gcc both provide cpuid.h, which defines __get_cpuid(), for x86_64 and + // i386. unsigned eax, ebx = 0, ecx, edx; if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { return -1; @@ -217,11 +224,11 @@ int GetMaxOpenFiles() { return -1; } -void *cacheline_aligned_alloc(size_t size) { +void* cacheline_aligned_alloc(size_t size) { #if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__) return malloc(size); -#elif ( _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) - void *m; +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) + void* m; errno = posix_memalign(&m, CACHE_LINE_SIZE, size); return errno ? 
nullptr : m; #else @@ -229,9 +236,7 @@ void *cacheline_aligned_alloc(size_t size) { #endif } -void cacheline_aligned_free(void *memblock) { - free(memblock); -} +void cacheline_aligned_free(void* memblock) { free(memblock); } static size_t GetPageSize() { #if defined(OS_LINUX) || defined(_SC_PAGESIZE) diff --git a/port/port_posix.h b/port/port_posix.h index cd7bc1a6bff..417fbf4f611 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -25,36 +25,36 @@ #undef PLATFORM_IS_LITTLE_ENDIAN #if defined(OS_MACOSX) - #include - #if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) - #define PLATFORM_IS_LITTLE_ENDIAN \ - (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) - #endif +#include +#if defined(__DARWIN_LITTLE_ENDIAN) && defined(__DARWIN_BYTE_ORDER) +#define PLATFORM_IS_LITTLE_ENDIAN \ + (__DARWIN_BYTE_ORDER == __DARWIN_LITTLE_ENDIAN) +#endif #elif defined(OS_SOLARIS) - #include - #ifdef _LITTLE_ENDIAN - #define PLATFORM_IS_LITTLE_ENDIAN true - #else - #define PLATFORM_IS_LITTLE_ENDIAN false - #endif - #include +#include +#ifdef _LITTLE_ENDIAN +#define PLATFORM_IS_LITTLE_ENDIAN true +#else +#define PLATFORM_IS_LITTLE_ENDIAN false +#endif +#include #elif defined(OS_AIX) - #include - #include - #define PLATFORM_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN) - #include +#include +#include +#define PLATFORM_IS_LITTLE_ENDIAN (BYTE_ORDER == LITTLE_ENDIAN) +#include #elif defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || \ defined(OS_DRAGONFLYBSD) || defined(OS_ANDROID) - #include - #include - #define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) +#include +#include +#define PLATFORM_IS_LITTLE_ENDIAN (_BYTE_ORDER == _LITTLE_ENDIAN) #else - #include +#include #endif #include - #include #include + #include #include @@ -62,8 +62,8 @@ #define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) #endif -#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ - defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ +#if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) || \ + defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) || \ defined(OS_ANDROID) || defined(CYGWIN) || defined(OS_AIX) // Use fread/fwrite/fflush on platforms without _unlocked variants #define fread_unlocked fread @@ -71,8 +71,8 @@ #define fflush_unlocked fflush #endif -#if defined(OS_MACOSX) || defined(OS_FREEBSD) ||\ - defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) +#if defined(OS_MACOSX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || \ + defined(OS_DRAGONFLYBSD) // Use fsync() on platforms without fdatasync() #define fdatasync fsync #endif @@ -139,10 +139,10 @@ class RWMutex { void WriteLock(); void ReadUnlock(); void WriteUnlock(); - void AssertHeld() { } + void AssertHeld() {} private: - pthread_rwlock_t mu_; // the underlying platform mutex + pthread_rwlock_t mu_; // the underlying platform mutex }; class CondVar { @@ -154,6 +154,7 @@ class CondVar { bool TimedWait(uint64_t abs_time_us); void Signal(); void SignalAll(); + private: pthread_cond_t cv_; Mutex* mu_; @@ -168,6 +169,8 @@ static inline void AsmVolatilePause() { asm volatile("isb"); #elif defined(__powerpc64__) asm volatile("or 27,27,27"); +#elif defined(__loongarch64) + asm volatile("dbar 0"); #endif // it's okay for other platforms to be no-ops } @@ -205,9 +208,9 @@ extern void InitOnce(OnceType* once, void (*initializer)()); static_assert((CACHE_LINE_SIZE & (CACHE_LINE_SIZE - 1)) == 0, "Cache line size must be a power of 2 number of bytes"); 
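The static_assert above uses the standard bit trick: for nonzero x, x is a power of two exactly when x & (x - 1) is zero, because subtracting one flips the lowest set bit and everything below it. Worked through:

    // CACHE_LINE_SIZE = 64:  0b0100'0000
    // 64 - 1          = 63:  0b0011'1111
    // 64 & 63         =  0                -> assert passes
    //
    // A bad value, 96:       0b0110'0000
    // 96 - 1          = 95:  0b0101'1111
    // 96 & 95         = 64  (0b0100'0000) -> assert fires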
-extern void *cacheline_aligned_alloc(size_t size); +extern void* cacheline_aligned_alloc(size_t size); -extern void cacheline_aligned_free(void *memblock); +extern void cacheline_aligned_free(void* memblock); #if defined(__aarch64__) // __builtin_prefetch(..., 1) turns into a prefetch into prfm pldl3keep. On @@ -236,5 +239,5 @@ int64_t GetProcessID(); // true on success or false on failure. bool GenerateRfcUuid(std::string* output); -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/stack_trace.cc b/port/stack_trace.cc index afb8baf3b49..ef7144947fb 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -24,13 +24,13 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #else +#include #include #include #include #include #include #include -#include #if defined(OS_FREEBSD) #include diff --git a/port/sys_time.h b/port/sys_time.h index d4dd2e07f33..f2137526b13 100644 --- a/port/sys_time.h +++ b/port/sys_time.h @@ -39,8 +39,8 @@ inline struct tm* LocalTimeR(const time_t* timep, struct tm* result) { } // namespace ROCKSDB_NAMESPACE #else -#include #include +#include namespace ROCKSDB_NAMESPACE { diff --git a/port/win/io_win.h b/port/win/io_win.h index d5a0790522f..a4fee8346c4 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -27,9 +27,9 @@ std::string GetWindowsErrSz(DWORD err); inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) ? IOStatus::NoSpace(context, GetWindowsErrSz(err)) - : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) - ? IOStatus::PathNotFound(context, GetWindowsErrSz(err)) - : IOStatus::IOError(context, GetWindowsErrSz(err)); + : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) + ? IOStatus::PathNotFound(context, GetWindowsErrSz(err)) + : IOStatus::IOError(context, GetWindowsErrSz(err)); } inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { @@ -39,10 +39,9 @@ inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { inline IOStatus IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) ? IOStatus::NoSpace(context, errnoStr(err_number).c_str()) - : (err_number == ENOENT) - ? IOStatus::PathNotFound(context, - errnoStr(err_number).c_str()) - : IOStatus::IOError(context, errnoStr(err_number).c_str()); + : (err_number == ENOENT) + ? 
IOStatus::PathNotFound(context, errnoStr(err_number).c_str()) + : IOStatus::IOError(context, errnoStr(err_number).c_str()); } class WinFileData; diff --git a/port/win/port_win.h b/port/win/port_win.h index 9ac8d045de4..989b5620b9b 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -38,7 +38,6 @@ #undef DeleteFile #undef GetCurrentTime - #ifndef strcasecmp #define strcasecmp _stricmp #endif @@ -133,12 +132,9 @@ class Mutex { void operator=(const Mutex&) = delete; private: - friend class CondVar; - std::mutex& getLock() { - return mutex_; - } + std::mutex& getLock() { return mutex_; } std::mutex mutex_; #ifndef NDEBUG @@ -170,8 +166,7 @@ class RWMutex { class CondVar { public: - explicit CondVar(Mutex* mu) : mu_(mu) { - } + explicit CondVar(Mutex* mu) : mu_(mu) {} ~CondVar(); void Wait(); @@ -191,7 +186,6 @@ class CondVar { Mutex* mu_; }; - #ifdef _POSIX_THREADS using Thread = std::thread; #else @@ -204,15 +198,14 @@ using Thread = WindowsThread; // Posix semantics with initialization // adopted in the project struct OnceType { + struct Init {}; - struct Init {}; - - OnceType() {} - OnceType(const Init&) {} - OnceType(const OnceType&) = delete; - OnceType& operator=(const OnceType&) = delete; + OnceType() {} + OnceType(const Init&) {} + OnceType(const OnceType&) = delete; + OnceType& operator=(const OnceType&) = delete; - std::once_flag flag_; + std::once_flag flag_; }; #define LEVELDB_ONCE_INIT port::OnceType::Init() @@ -228,7 +221,7 @@ void* jemalloc_aligned_alloc(size_t size, size_t alignment) noexcept; void jemalloc_aligned_free(void* p) noexcept; #endif -inline void *cacheline_aligned_alloc(size_t size) { +inline void* cacheline_aligned_alloc(size_t size) { #ifdef ROCKSDB_JEMALLOC return jemalloc_aligned_alloc(size, CACHE_LINE_SIZE); #else @@ -236,7 +229,7 @@ inline void *cacheline_aligned_alloc(size_t size) { #endif } -inline void cacheline_aligned_free(void *memblock) { +inline void cacheline_aligned_free(void* memblock) { #ifdef ROCKSDB_JEMALLOC jemalloc_aligned_free(memblock); #else @@ -322,7 +315,6 @@ bool GenerateRfcUuid(std::string* output); } // namespace port - #ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES #define RX_FILESTRING std::wstring @@ -376,11 +368,11 @@ bool GenerateRfcUuid(std::string* output); #endif -using port::pthread_key_t; +using port::pthread_getspecific; using port::pthread_key_create; using port::pthread_key_delete; +using port::pthread_key_t; using port::pthread_setspecific; -using port::pthread_getspecific; using port::truncate; } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/win_jemalloc.cc b/port/win/win_jemalloc.cc index 691ebc27e4b..cf38f55b756 100644 --- a/port/win/win_jemalloc.cc +++ b/port/win/win_jemalloc.cc @@ -10,10 +10,11 @@ #if defined(OS_WIN) #ifndef ROCKSDB_JEMALLOC -# error This file can only be part of jemalloc aware build +#error This file can only be part of jemalloc aware build #endif #include + #include "jemalloc/jemalloc.h" #include "port/win/port_win.h" @@ -31,10 +32,10 @@ void JemallocDeallocateForZSTD(void* /* opaque */, void* address) { ZSTD_customMem GetJeZstdAllocationOverrides() { return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr}; } -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE -#endif // (ZSTD_VERSION_NUMBER >= 500) -#endif // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY) +#endif // (ZSTD_VERSION_NUMBER >= 500) +#endif // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY) // Global operators to be replaced by a linker when this file is // a part of the build diff --git 
a/port/win/win_logger.cc b/port/win/win_logger.cc index 6773699d1d8..072ea419a1d 100644 --- a/port/win/win_logger.cc +++ b/port/win/win_logger.cc @@ -57,9 +57,7 @@ void WinLogger::DebugWriter(const char* str, int len) { WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } -Status WinLogger::CloseImpl() { - return CloseInternal(); -} +Status WinLogger::CloseImpl() { return CloseInternal(); } Status WinLogger::CloseInternal() { Status s; @@ -160,7 +158,7 @@ void WinLogger::Logv(const char* format, va_list ap) { DWORD bytesWritten = 0; BOOL ret = WriteFile(file_, base, static_cast(write_size), - &bytesWritten, NULL); + &bytesWritten, NULL); if (ret == FALSE) { std::string errSz = GetWindowsErrSz(GetLastError()); fprintf(stderr, "%s", errSz.c_str()); @@ -187,7 +185,7 @@ void WinLogger::Logv(const char* format, va_list ap) { size_t WinLogger::GetLogFileSize() const { return log_size_; } -} +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/win_logger.h b/port/win/win_logger.h index 809c7d5a2bb..1ca4610e9ee 100644 --- a/port/win/win_logger.h +++ b/port/win/win_logger.h @@ -44,9 +44,8 @@ class WinLogger : public ROCKSDB_NAMESPACE::Logger { void DebugWriter(const char* str, int len); -protected: - - Status CloseImpl() override; + protected: + Status CloseImpl() override; private: HANDLE file_; @@ -60,6 +59,6 @@ class WinLogger : public ROCKSDB_NAMESPACE::Logger { const static uint64_t flush_every_seconds_ = 5; }; -} +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/win_thread.cc b/port/win/win_thread.cc index 86dffea35cc..3c82e736ec9 100644 --- a/port/win/win_thread.cc +++ b/port/win/win_thread.cc @@ -17,7 +17,7 @@ #include "port/win/win_thread.h" #include -#include // __beginthreadex +#include // __beginthreadex #include #include @@ -28,14 +28,10 @@ namespace ROCKSDB_NAMESPACE { namespace port { struct WindowsThread::Data { - std::function func_; - uintptr_t handle_; + uintptr_t handle_; - Data(std::function&& func) : - func_(std::move(func)), - handle_(0) { - } + Data(std::function&& func) : func_(std::move(func)), handle_(0) {} Data(const Data&) = delete; Data& operator=(const Data&) = delete; @@ -43,36 +39,30 @@ struct WindowsThread::Data { static unsigned int __stdcall ThreadProc(void* arg); }; - void WindowsThread::Init(std::function&& func) { - data_ = std::make_shared(std::move(func)); // We create another instance of std::shared_ptr to get an additional ref // since we may detach and destroy this instance before the threadproc // may start to run. 
We choose to allocate this additional ref on the heap // so we do not need to synchronize and allow this thread to proceed - std::unique_ptr> th_data(new std::shared_ptr(data_)); + std::unique_ptr> th_data( + new std::shared_ptr(data_)); data_->handle_ = _beginthreadex(NULL, - 0, // stack size - &Data::ThreadProc, - th_data.get(), - 0, // init flag - &th_id_); + 0, // stack size + &Data::ThreadProc, th_data.get(), + 0, // init flag + &th_id_); if (data_->handle_ == 0) { - throw std::system_error(std::make_error_code( - std::errc::resource_unavailable_try_again), - "Unable to create a thread"); + throw std::system_error( + std::make_error_code(std::errc::resource_unavailable_try_again), + "Unable to create a thread"); } th_data.release(); } -WindowsThread::WindowsThread() : - data_(nullptr), - th_id_(0) -{} - +WindowsThread::WindowsThread() : data_(nullptr), th_id_(0) {} WindowsThread::~WindowsThread() { // Must be joined or detached @@ -87,13 +77,11 @@ WindowsThread::~WindowsThread() { } } -WindowsThread::WindowsThread(WindowsThread&& o) noexcept : - WindowsThread() { +WindowsThread::WindowsThread(WindowsThread&& o) noexcept : WindowsThread() { *this = std::move(o); } WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept { - if (joinable()) { assert(false); std::terminate(); @@ -107,9 +95,7 @@ WindowsThread& WindowsThread::operator=(WindowsThread&& o) noexcept { return *this; } -bool WindowsThread::joinable() const { - return (data_ && data_->handle_ != 0); -} +bool WindowsThread::joinable() const { return (data_ && data_->handle_ != 0); } WindowsThread::native_handle_type WindowsThread::native_handle() const { return reinterpret_cast(data_->handle_); @@ -120,36 +106,33 @@ unsigned WindowsThread::hardware_concurrency() { } void WindowsThread::join() { - if (!joinable()) { assert(false); - throw std::system_error( - std::make_error_code(std::errc::invalid_argument), - "Thread is no longer joinable"); + throw std::system_error(std::make_error_code(std::errc::invalid_argument), + "Thread is no longer joinable"); } if (GetThreadId(GetCurrentThread()) == th_id_) { assert(false); throw std::system_error( - std::make_error_code(std::errc::resource_deadlock_would_occur), - "Can not join itself"); + std::make_error_code(std::errc::resource_deadlock_would_occur), + "Can not join itself"); } - auto ret = WaitForSingleObject(reinterpret_cast(data_->handle_), - INFINITE); + auto ret = + WaitForSingleObject(reinterpret_cast(data_->handle_), INFINITE); if (ret != WAIT_OBJECT_0) { auto lastError = GetLastError(); assert(false); - throw std::system_error(static_cast(lastError), - std::system_category(), - "WaitForSingleObjectFailed: thread join"); + throw std::system_error(static_cast(lastError), std::system_category(), + "WaitForSingleObjectFailed: thread join"); } BOOL rc #if defined(_MSC_VER) - = FALSE; + = FALSE; #else - __attribute__((__unused__)); + __attribute__((__unused__)); #endif rc = CloseHandle(reinterpret_cast(data_->handle_)); assert(rc != 0); @@ -157,12 +140,10 @@ void WindowsThread::join() { } bool WindowsThread::detach() { - if (!joinable()) { assert(false); - throw std::system_error( - std::make_error_code(std::errc::invalid_argument), - "Thread is no longer available"); + throw std::system_error(std::make_error_code(std::errc::invalid_argument), + "Thread is no longer available"); } BOOL ret = CloseHandle(reinterpret_cast(data_->handle_)); @@ -171,18 +152,18 @@ bool WindowsThread::detach() { return (ret != 0); } -void WindowsThread::swap(WindowsThread& o) { +void 
WindowsThread::swap(WindowsThread& o) { data_.swap(o.data_); std::swap(th_id_, o.th_id_); } -unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) { +unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) { auto ptr = reinterpret_cast*>(arg); std::unique_ptr> data(ptr); (*data)->func_(); return 0; } -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE #endif // !_POSIX_THREADS diff --git a/port/win/win_thread.h b/port/win/win_thread.h index 89cfd02217f..916033b77c8 100644 --- a/port/win/win_thread.h +++ b/port/win/win_thread.h @@ -11,8 +11,8 @@ #ifndef _POSIX_THREADS -#include #include +#include #include #include "rocksdb/rocksdb_namespace.h" @@ -31,8 +31,8 @@ namespace port { class WindowsThread { struct Data; - std::shared_ptr data_; - unsigned int th_id_; + std::shared_ptr data_; + unsigned int th_id_; void Init(std::function&&); @@ -104,7 +104,7 @@ class WindowsThread { void swap(WindowsThread&); }; -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE namespace std { @@ -112,6 +112,6 @@ inline void swap(ROCKSDB_NAMESPACE::port::WindowsThread& th1, ROCKSDB_NAMESPACE::port::WindowsThread& th2) { th1.swap(th2); } -} // namespace std +} // namespace std #endif // !_POSIX_THREADS diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc index 9039ec89b5f..21904d50267 100644 --- a/port/win/xpress_win.cc +++ b/port/win/xpress_win.cc @@ -10,12 +10,13 @@ #if defined(OS_WIN) #include "port/win/xpress_win.h" + #include #include -#include -#include #include +#include +#include #ifdef XPRESS @@ -41,10 +42,9 @@ auto CloseDecompressorFun = [](void* h) { ::CloseDecompressor(reinterpret_cast(h)); } }; -} +} // namespace bool Compress(const char* input, size_t length, std::string* output) { - assert(input != nullptr); assert(output != nullptr); @@ -57,42 +57,40 @@ bool Compress(const char* input, size_t length, std::string* output) { COMPRESSOR_HANDLE compressor = NULL; - BOOL success = CreateCompressor( - COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm - allocRoutinesPtr, // Optional allocation routine - &compressor); // Handle + BOOL success = + CreateCompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &compressor); // Handle if (!success) { #ifdef _DEBUG - std::cerr << "XPRESS: Failed to create Compressor LastError: " << - GetLastError() << std::endl; + std::cerr << "XPRESS: Failed to create Compressor LastError: " + << GetLastError() << std::endl; #endif return false; } - std::unique_ptr - compressorGuard(compressor, CloseCompressorFun); + std::unique_ptr compressorGuard( + compressor, CloseCompressorFun); SIZE_T compressedBufferSize = 0; // Query compressed buffer size. 
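The call below is the first half of the usual Win32 sizing handshake: pass a NULL output buffer, expect ERROR_INSUFFICIENT_BUFFER, and read the required size back before compressing for real. The round trip, condensed into a sketch that mirrors the calls in this file (function and buffer names are illustrative):

    #include <string>

    bool CompressToString(COMPRESSOR_HANDLE compressor, const char* input,
                          size_t length, std::string* output) {
      SIZE_T needed = 0;
      // First call: NULL/0 buffer. Expected to fail with
      // ERROR_INSUFFICIENT_BUFFER while reporting the required size.
      if (!::Compress(compressor, const_cast<char*>(input), length, NULL, 0,
                      &needed) &&
          ::GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        return false;  // a real failure, not the expected size probe
      }
      output->resize(needed);
      SIZE_T written = 0;
      // Second call: a buffer of the reported size does the actual work.
      if (!::Compress(compressor, const_cast<char*>(input), length,
                      &(*output)[0], needed, &written)) {
        return false;
      }
      output->resize(written);
      return true;
    }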
- success = ::Compress( - compressor, // Compressor Handle - const_cast(input), // Input buffer - length, // Uncompressed data size - NULL, // Compressed Buffer - 0, // Compressed Buffer size - &compressedBufferSize); // Compressed Data size + success = ::Compress(compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + NULL, // Compressed Buffer + 0, // Compressed Buffer size + &compressedBufferSize); // Compressed Data size if (!success) { - auto lastError = GetLastError(); if (lastError != ERROR_INSUFFICIENT_BUFFER) { #ifdef _DEBUG - std::cerr << - "XPRESS: Failed to estimate compressed buffer size LastError " << - lastError << std::endl; + std::cerr + << "XPRESS: Failed to estimate compressed buffer size LastError " + << lastError << std::endl; #endif return false; } @@ -106,18 +104,17 @@ bool Compress(const char* input, size_t length, std::string* output) { SIZE_T compressedDataSize = 0; // Compress - success = ::Compress( - compressor, // Compressor Handle - const_cast(input), // Input buffer - length, // Uncompressed data size - &result[0], // Compressed Buffer - compressedBufferSize, // Compressed Buffer size - &compressedDataSize); // Compressed Data size + success = ::Compress(compressor, // Compressor Handle + const_cast(input), // Input buffer + length, // Uncompressed data size + &result[0], // Compressed Buffer + compressedBufferSize, // Compressed Buffer size + &compressedDataSize); // Compressed Data size if (!success) { #ifdef _DEBUG - std::cerr << "XPRESS: Failed to compress LastError " << - GetLastError() << std::endl; + std::cerr << "XPRESS: Failed to compress LastError " << GetLastError() + << std::endl; #endif return false; } @@ -141,42 +138,39 @@ char* Decompress(const char* input_data, size_t input_length, DECOMPRESSOR_HANDLE decompressor = NULL; - BOOL success = CreateDecompressor( - COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm - allocRoutinesPtr, // Optional allocation routine - &decompressor); // Handle - + BOOL success = + CreateDecompressor(COMPRESS_ALGORITHM_XPRESS, // Compression Algorithm + allocRoutinesPtr, // Optional allocation routine + &decompressor); // Handle if (!success) { #ifdef _DEBUG std::cerr << "XPRESS: Failed to create Decompressor LastError " - << GetLastError() << std::endl; + << GetLastError() << std::endl; #endif return nullptr; } - std::unique_ptr - compressorGuard(decompressor, CloseDecompressorFun); + std::unique_ptr compressorGuard( + decompressor, CloseDecompressorFun); SIZE_T decompressedBufferSize = 0; - success = ::Decompress( - decompressor, // Compressor Handle - const_cast(input_data), // Compressed data - input_length, // Compressed data size - NULL, // Buffer set to NULL - 0, // Buffer size set to 0 - &decompressedBufferSize); // Decompressed Data size + success = ::Decompress(decompressor, // Compressor Handle + const_cast(input_data), // Compressed data + input_length, // Compressed data size + NULL, // Buffer set to NULL + 0, // Buffer size set to 0 + &decompressedBufferSize); // Decompressed Data size if (!success) { - auto lastError = GetLastError(); if (lastError != ERROR_INSUFFICIENT_BUFFER) { #ifdef _DEBUG std::cerr - << "XPRESS: Failed to estimate decompressed buffer size LastError " - << lastError << std::endl; + << "XPRESS: Failed to estimate decompressed buffer size LastError " + << lastError << std::endl; #endif return nullptr; } @@ -190,19 +184,14 @@ char* Decompress(const char* input_data, size_t input_length, SIZE_T decompressedDataSize = 0; - success = 
::Decompress( - decompressor, - const_cast(input_data), - input_length, - outputBuffer.get(), - decompressedBufferSize, - &decompressedDataSize); + success = ::Decompress(decompressor, const_cast(input_data), + input_length, outputBuffer.get(), + decompressedBufferSize, &decompressedDataSize); if (!success) { #ifdef _DEBUG - std::cerr << - "XPRESS: Failed to decompress LastError " << - GetLastError() << std::endl; + std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError() + << std::endl; #endif return nullptr; } @@ -212,8 +201,8 @@ char* Decompress(const char* input_data, size_t input_length, // Return the raw buffer to the caller supporting the tradition return outputBuffer.release(); } -} -} +} // namespace xpress +} // namespace port } // namespace ROCKSDB_NAMESPACE #endif diff --git a/port/win/xpress_win.h b/port/win/xpress_win.h index d491f963da9..187adffa658 100644 --- a/port/win/xpress_win.h +++ b/port/win/xpress_win.h @@ -21,6 +21,6 @@ bool Compress(const char* input, size_t length, std::string* output); char* Decompress(const char* input_data, size_t input_length, size_t* uncompressed_size); -} -} +} // namespace xpress +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/src.mk b/src.mk index 859d8777681..26f2bdde318 100644 --- a/src.mk +++ b/src.mk @@ -3,12 +3,13 @@ LIB_SOURCES = \ cache/cache.cc \ cache/cache_entry_roles.cc \ cache/cache_key.cc \ + cache/cache_helpers.cc \ cache/cache_reservation_manager.cc \ cache/charged_cache.cc \ cache/clock_cache.cc \ - cache/fast_lru_cache.cc \ cache/lru_cache.cc \ cache/compressed_secondary_cache.cc \ + cache/secondary_cache.cc \ cache/sharded_cache.cc \ cloud/aws/aws_file_system.cc \ cloud/aws/aws_kafka.cc \ @@ -24,7 +25,6 @@ LIB_SOURCES = \ cloud/cloud_manifest.cc \ cloud/cloud_scheduler.cc \ cloud/cloud_storage_provider.cc \ - cloud/cloud_file_cache.cc \ cloud/cloud_file_deletion_scheduler.cc \ db/arena_wrapped_db_iter.cc \ db/blob/blob_contents.cc \ @@ -172,6 +172,7 @@ LIB_SOURCES = \ options/options.cc \ options/options_helper.cc \ options/options_parser.cc \ + port/mmap.cc \ port/port_posix.cc \ port/win/env_default.cc \ port/win/env_win.cc \ @@ -188,6 +189,7 @@ LIB_SOURCES = \ table/block_based/block_based_table_iterator.cc \ table/block_based/block_based_table_reader.cc \ table/block_based/block_builder.cc \ + table/block_based/block_cache.cc \ table/block_based/block_prefetcher.cc \ table/block_based/block_prefix_index.cc \ table/block_based/data_block_hash_index.cc \ diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc index 8a65d64f068..bbea91b5426 100644 --- a/table/adaptive/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -6,9 +6,9 @@ #ifndef ROCKSDB_LITE #include "table/adaptive/adaptive_table_factory.h" -#include "table/table_builder.h" -#include "table/format.h" #include "port/port.h" +#include "table/format.h" +#include "table/table_builder.h" namespace ROCKSDB_NAMESPACE { @@ -48,8 +48,9 @@ Status AdaptiveTableFactory::NewTableReader( bool prefetch_index_and_filter_in_cache) const { Footer footer; IOOptions opts; - auto s = ReadFooterFromFile(opts, file.get(), nullptr /* prefetch_buffer */, - file_size, &footer); + auto s = + ReadFooterFromFile(opts, file.get(), *table_reader_options.ioptions.fs, + nullptr /* prefetch_buffer */, file_size, &footer); if (!s.ok()) { return s; } @@ -118,7 +119,8 @@ extern TableFactory* NewAdaptiveTableFactory( std::shared_ptr plain_table_factory, std::shared_ptr cuckoo_table_factory) { 
return new AdaptiveTableFactory(table_factory_to_write, - block_based_table_factory, plain_table_factory, cuckoo_table_factory); + block_based_table_factory, + plain_table_factory, cuckoo_table_factory); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/adaptive/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h index 65f816fad83..3b631942d23 100644 --- a/table/adaptive/adaptive_table_factory.h +++ b/table/adaptive/adaptive_table_factory.h @@ -8,6 +8,7 @@ #ifndef ROCKSDB_LITE #include + #include "rocksdb/options.h" #include "rocksdb/table.h" diff --git a/table/block_based/block.h b/table/block_based/block.h index 5d73f72f6aa..90f9aa397bf 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -236,6 +236,9 @@ class Block { // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; + // For TypedCacheInterface + const Slice& ContentSlice() const { return contents_.data; } + private: BlockContents contents_; const char* data_; // contents_.data.data() diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 011a71ccced..81113c9c7a8 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -21,7 +21,9 @@ #include #include +#include "block_cache.h" #include "cache/cache_entry_roles.h" +#include "cache/cache_helpers.h" #include "cache/cache_key.h" #include "cache/cache_reservation_manager.h" #include "db/dbformat.h" @@ -40,7 +42,6 @@ #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#include "table/block_based/block_like_traits.h" #include "table/block_based/filter_block.h" #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" @@ -59,7 +60,6 @@ namespace ROCKSDB_NAMESPACE { extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; - // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { @@ -335,6 +335,7 @@ struct BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; std::unique_ptr pc_rep; + BlockCreateContext create_context; uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } @@ -443,6 +444,9 @@ struct BlockBasedTableBuilder::Rep { flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), + create_context(&table_options, ioptions.stats, + compression_type == kZSTD || + compression_type == kZSTDNotFinalCompression), status_ok(true), io_status_ok(true) { if (tbo.target_file_size == 0) { @@ -1011,7 +1015,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->props.num_entries++; r->props.raw_key_size += key.size(); r->props.raw_value_size += value.size(); - if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) { + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion || + value_type == kTypeDeletionWithTimestamp) { r->props.num_deletions++; } else if (value_type == kTypeRangeDeletion) { r->props.num_deletions++; @@ -1239,6 +1244,10 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( handle->set_size(block_contents.size()); assert(status().ok()); assert(io_status().ok()); + if (uncompressed_block_data == nullptr) { + 
uncompressed_block_data = &block_contents; + assert(type == kNoCompression); + } { IOStatus io_s = r->file->Append(block_contents); @@ -1290,12 +1299,8 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( warm_cache = false; } if (warm_cache) { - if (type == kNoCompression) { - s = InsertBlockInCacheHelper(block_contents, handle, block_type); - } else if (uncompressed_block_data != nullptr) { - s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, - block_type); - } + s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, + block_type); if (!s.ok()) { r->SetStatus(s); return; @@ -1417,15 +1422,6 @@ IOStatus BlockBasedTableBuilder::io_status() const { return rep_->GetIOStatus(); } -namespace { -// Delete the entry resided in the cache. -template -void DeleteEntryCached(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} -} // namespace - // // Make a copy of the block contents and insert into compressed block cache // @@ -1433,13 +1429,14 @@ Status BlockBasedTableBuilder::InsertBlockInCompressedCache( const Slice& block_contents, const CompressionType type, const BlockHandle* handle) { Rep* r = rep_; - Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); + CompressedBlockCacheInterface block_cache_compressed{ + r->table_options.block_cache_compressed.get()}; Status s; - if (type != kNoCompression && block_cache_compressed != nullptr) { + if (type != kNoCompression && block_cache_compressed) { size_t size = block_contents.size(); - auto ubuf = - AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); + auto ubuf = AllocateBlock(size + 1, + block_cache_compressed.get()->memory_allocator()); memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; @@ -1451,10 +1448,9 @@ Status BlockBasedTableBuilder::InsertBlockInCompressedCache( CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); - s = block_cache_compressed->Insert( + s = block_cache_compressed.Insert( key.AsSlice(), block_contents_to_cache, - block_contents_to_cache->ApproximateMemoryUsage(), - &DeleteEntryCached); + block_contents_to_cache->ApproximateMemoryUsage()); if (s.ok()) { RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); } else { @@ -1470,65 +1466,19 @@ Status BlockBasedTableBuilder::InsertBlockInCompressedCache( Status BlockBasedTableBuilder::InsertBlockInCacheHelper( const Slice& block_contents, const BlockHandle* handle, BlockType block_type) { - Status s; - switch (block_type) { - case BlockType::kData: - case BlockType::kIndex: - case BlockType::kFilterPartitionIndex: - s = InsertBlockInCache(block_contents, handle, block_type); - break; - case BlockType::kFilter: - s = InsertBlockInCache(block_contents, handle, - block_type); - break; - case BlockType::kCompressionDictionary: - s = InsertBlockInCache(block_contents, handle, - block_type); - break; - default: - // no-op / not cached - break; - } - return s; -} -template -Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, - const BlockHandle* handle, - BlockType block_type) { - // Uncompressed regular block cache Cache* block_cache = rep_->table_options.block_cache.get(); Status s; - if (block_cache != nullptr) { - size_t size = block_contents.size(); - auto buf = AllocateBlock(size, block_cache->memory_allocator()); - memcpy(buf.get(), block_contents.data(), size); - BlockContents results(std::move(buf), size); - + auto helper = + GetCacheItemHelper(block_type, 
rep_->ioptions.lowest_used_cache_tier); + if (block_cache && helper && helper->create_cb) { CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); - - const size_t read_amp_bytes_per_bit = - rep_->table_options.read_amp_bytes_per_bit; - - // TODO akanksha:: Dedup below code by calling - // BlockBasedTable::PutDataBlockToCache. - std::unique_ptr block_holder( - BlocklikeTraits::Create( - std::move(results), read_amp_bytes_per_bit, - rep_->ioptions.statistics.get(), - false /*rep_->blocks_definitely_zstd_compressed*/, - rep_->table_options.filter_policy.get())); - - assert(block_holder->own_bytes()); - size_t charge = block_holder->ApproximateMemoryUsage(); - s = block_cache->Insert( - key.AsSlice(), block_holder.get(), - BlocklikeTraits::GetCacheItemHelper(block_type), charge, - nullptr, Cache::Priority::LOW); + size_t charge; + s = WarmInCache(block_cache, key.AsSlice(), block_contents, + &rep_->create_context, helper, Cache::Priority::LOW, + &charge); if (s.ok()) { - // Release ownership of block_holder. - block_holder.release(); BlockBasedTable::UpdateCacheInsertionMetrics( block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(), rep_->ioptions.stats); diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index ecc13d0f7d0..7cf33953a12 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -122,9 +122,9 @@ class BlockBasedTableBuilder : public TableBuilder { void WriteBlock(const Slice& block_contents, BlockHandle* handle, BlockType block_type); // Directly write data to the file. - void WriteMaybeCompressedBlock(const Slice& data, CompressionType, - BlockHandle* handle, BlockType block_type, - const Slice* raw_data = nullptr); + void WriteMaybeCompressedBlock( + const Slice& block_contents, CompressionType, BlockHandle* handle, + BlockType block_type, const Slice* uncompressed_block_data = nullptr); void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 53b59e3d893..b3f76a731a9 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -474,7 +474,8 @@ void BlockBasedTableFactory::InitializeOptions() { } if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && table_options_.index_block_restart_interval != 1) { - // Currently kHashSearch is incompatible with index_block_restart_interval > 1 + // Currently kHashSearch is incompatible with + // index_block_restart_interval > 1 table_options_.index_block_restart_interval = 1; } if (table_options_.partition_filters && @@ -524,20 +525,24 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { // More complex test of shared key space, in case the instances are wrappers // for some shared underlying cache. 
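The idea implemented below, in miniature: plant a marker under a process-unique key in one cache, then probe the other; if the probe returns the same marker object, the two "caches" are views of one underlying key space and the configuration is rejected. A sketch reusing the calls from this hunk, with cache_a/cache_b standing in for the bbto fields and SentinelValue for the local struct defined below:

    bool ShareKeySpace(Cache* cache_a, Cache* cache_b) {
      static Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc};
      static SentinelValue kMarkerA{'a'};
      CacheKey sentinel = CacheKey::CreateUniqueForProcessLifetime();
      cache_a->Insert(sentinel.AsSlice(), &kMarkerA, &kHelper, 1)
          .PermitUncheckedError();
      bool shared = false;
      if (Cache::Handle* h = cache_b->Lookup(sentinel.AsSlice())) {
        // Observing the same marker object through the other handle means
        // both instances front one underlying cache.
        shared =
            (static_cast<SentinelValue*>(cache_b->Value(h)) == &kMarkerA);
        cache_b->Release(h);
      }
      return shared;
    }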
+ static Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc}; CacheKey sentinel_key = CacheKey::CreateUniqueForProcessLifetime(); - static char kRegularBlockCacheMarker = 'b'; - static char kCompressedBlockCacheMarker = 'c'; - static char kPersistentCacheMarker = 'p'; + struct SentinelValue { + explicit SentinelValue(char _c) : c(_c) {} + char c; + }; + static SentinelValue kRegularBlockCacheMarker{'b'}; + static SentinelValue kCompressedBlockCacheMarker{'c'}; + static char kPersistentCacheMarker{'p'}; if (bbto.block_cache) { bbto.block_cache - ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, 1, - GetNoopDeleterForRole()) + ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, &kHelper, 1) .PermitUncheckedError(); } if (bbto.block_cache_compressed) { bbto.block_cache_compressed - ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, 1, - GetNoopDeleterForRole()) + ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, &kHelper, + 1) .PermitUncheckedError(); } if (bbto.persistent_cache) { @@ -551,8 +556,8 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { if (bbto.block_cache) { auto handle = bbto.block_cache->Lookup(sentinel_key.AsSlice()); if (handle) { - auto v = static_cast(bbto.block_cache->Value(handle)); - char c = *v; + auto v = static_cast(bbto.block_cache->Value(handle)); + char c = v->c; bbto.block_cache->Release(handle); if (v == &kCompressedBlockCacheMarker) { return Status::InvalidArgument( @@ -570,8 +575,9 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { if (bbto.block_cache_compressed) { auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice()); if (handle) { - auto v = static_cast(bbto.block_cache_compressed->Value(handle)); - char c = *v; + auto v = static_cast( + bbto.block_cache_compressed->Value(handle)); + char c = v->c; bbto.block_cache_compressed->Release(handle); if (v == &kRegularBlockCacheMarker) { return Status::InvalidArgument( @@ -594,11 +600,11 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { bbto.persistent_cache->Lookup(sentinel_key.AsSlice(), &data, &size) .PermitUncheckedError(); if (data && size > 0) { - if (data[0] == kRegularBlockCacheMarker) { + if (data[0] == kRegularBlockCacheMarker.c) { return Status::InvalidArgument( "persistent_cache and block_cache share the same key space, " "which is not supported"); - } else if (data[0] == kCompressedBlockCacheMarker) { + } else if (data[0] == kCompressedBlockCacheMarker.c) { return Status::InvalidArgument( "persistent_cache and block_cache_compressed share the same key " "space, " diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index 7937104ba6f..a2918b24866 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once #include "table/block_based/block_based_table_reader.h" - #include "table/block_based/block_based_table_reader_impl.h" #include "table/block_based/block_prefetcher.h" #include "table/block_based/reader_common.h" diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index b331cb4e522..e9005cbac07 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -19,6 +19,7 @@ #include #include +#include "block_cache.h" #include "cache/cache_entry_roles.h" #include "cache/cache_key.h" #include "db/compaction/compaction_picker.h" @@ -29,6 +30,7 @@ #include "file/random_access_file_reader.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" +#include "parsed_full_filter_block.h" #include "port/lang.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" @@ -48,7 +50,6 @@ #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_iterator.h" -#include "table/block_based/block_like_traits.h" #include "table/block_based/block_prefix_index.h" #include "table/block_based/block_type.h" #include "table/block_based/filter_block.h" @@ -83,6 +84,26 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { return heap_buf; } } // namespace + +// Explicitly instantiate templates for each "blocklike" type we use (and +// before implicit specialization). +// This makes it possible to keep the template definitions in the .cc file. +#define INSTANTIATE_RETRIEVE_BLOCK(T) \ + template Status BlockBasedTable::RetrieveBlock( \ + FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ + const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ + CachableEntry* out_parsed_block, GetContext* get_context, \ + BlockCacheLookupContext* lookup_context, bool for_compaction, \ + bool use_cache, bool wait_for_cache, bool async_read) const; + +INSTANTIATE_RETRIEVE_BLOCK(ParsedFullFilterBlock); +INSTANTIATE_RETRIEVE_BLOCK(UncompressionDict); +INSTANTIATE_RETRIEVE_BLOCK(Block_kData); +INSTANTIATE_RETRIEVE_BLOCK(Block_kIndex); +INSTANTIATE_RETRIEVE_BLOCK(Block_kFilterPartitionIndex); +INSTANTIATE_RETRIEVE_BLOCK(Block_kRangeDeletion); +INSTANTIATE_RETRIEVE_BLOCK(Block_kMetaIndex); + } // namespace ROCKSDB_NAMESPACE // Generate the regular and coroutine versions of some methods by @@ -104,9 +125,7 @@ extern const uint64_t kBlockBasedTableMagicNumber; extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -BlockBasedTable::~BlockBasedTable() { - delete rep_; -} +BlockBasedTable::~BlockBasedTable() { delete rep_; } namespace { // Read the block identified by "handle" from "file". @@ -116,22 +135,22 @@ namespace { // @param uncompression_dict Data for presetting the compression library's // dictionary. 
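// Aside — the INSTANTIATE_RETRIEVE_BLOCK macro above relies on explicit
// template instantiation; here is that pattern as a minimal standalone sketch
// (hypothetical names, not RocksDB code). The template body stays in one .cc
// file, which explicitly instantiates every supported type, so other
// translation units compile against the declaration alone.

// measure.h — declaration only; the template body is not exposed.
template <typename T>
int Measure(const T& t);

// measure.cc — definition plus one explicit instantiation per supported type.
template <typename T>
int Measure(const T& t) {
  return static_cast<int>(sizeof(t));  // stand-in for real work
}
template int Measure<int>(const int&);        // emitted into measure.o
template int Measure<double>(const double&);  // likewise

// Callers that include only measure.h link against these instantiations; any
// other T fails at link time, which is why the real macro enumerates every
// "blocklike" type RetrieveBlock must support.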
template -Status ReadBlockFromFile( +Status ReadAndParseBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, const ImmutableOptions& ioptions, - bool do_uncompress, bool maybe_compressed, BlockType block_type, + BlockCreateContext& create_context, bool maybe_compressed, const UncompressionDict& uncompression_dict, - const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit, - MemoryAllocator* memory_allocator, bool for_compaction, bool using_zstd, - const FilterPolicy* filter_policy, bool async_read) { + const PersistentCacheOptions& cache_options, + MemoryAllocator* memory_allocator, bool for_compaction, bool async_read) { assert(result); BlockContents contents; BlockFetcher block_fetcher( file, prefetch_buffer, footer, options, handle, &contents, ioptions, - do_uncompress, maybe_compressed, block_type, uncompression_dict, - cache_options, memory_allocator, nullptr, for_compaction); + /*do_uncompress*/ maybe_compressed, maybe_compressed, + TBlocklike::kBlockType, uncompression_dict, cache_options, + memory_allocator, nullptr, for_compaction); Status s; // If prefetch_buffer is not allocated, it will fallback to synchronous // reading of block contents. @@ -144,11 +163,8 @@ Status ReadBlockFromFile( s = block_fetcher.ReadBlockContents(); } if (s.ok()) { - result->reset(BlocklikeTraits::Create( - std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd, - filter_policy)); + create_context.Create(result, std::move(contents)); } - return s; } @@ -173,6 +189,16 @@ inline bool PrefixExtractorChangedHelper( } } +template +uint32_t GetBlockNumRestarts(const TBlocklike& block) { + if constexpr (std::is_convertible_v) { + const Block& b = block; + return b.NumRestarts(); + } else { + return 0; + } +} + } // namespace void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, @@ -379,56 +405,6 @@ void BlockBasedTable::UpdateCacheInsertionMetrics( } } -Cache::Handle* BlockBasedTable::GetEntryFromCache( - const CacheTier& cache_tier, Cache* block_cache, const Slice& key, - BlockType block_type, const bool wait, GetContext* get_context, - const Cache::CacheItemHelper* cache_helper, - const Cache::CreateCallback& create_cb, Cache::Priority priority) const { - Cache::Handle* cache_handle = nullptr; - if (cache_tier == CacheTier::kNonVolatileBlockTier) { - cache_handle = block_cache->Lookup(key, cache_helper, create_cb, priority, - wait, rep_->ioptions.statistics.get()); - } else { - cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics.get()); - } - - // Avoid updating metrics here if the handle is not complete yet. This - // happens with MultiGet and secondary cache. 
So update the metrics only - // if its a miss, or a hit and value is ready - if (!cache_handle || block_cache->Value(cache_handle)) { - if (cache_handle != nullptr) { - UpdateCacheHitMetrics(block_type, get_context, - block_cache->GetUsage(cache_handle)); - } else { - UpdateCacheMissMetrics(block_type, get_context); - } - } - - return cache_handle; -} - -template -Status BlockBasedTable::InsertEntryToCache( - const CacheTier& cache_tier, Cache* block_cache, const Slice& key, - const Cache::CacheItemHelper* cache_helper, - std::unique_ptr&& block_holder, size_t charge, - Cache::Handle** cache_handle, Cache::Priority priority) const { - Status s = Status::OK(); - if (cache_tier == CacheTier::kNonVolatileBlockTier) { - s = block_cache->Insert(key, block_holder.get(), cache_helper, charge, - cache_handle, priority); - } else { - s = block_cache->Insert(key, block_holder.get(), charge, - cache_helper->del_cb, cache_handle, priority); - } - if (s.ok()) { - // Cache took ownership - block_holder.release(); - } - s.MustCheck(); - return s; -} - namespace { // Return True if table_properties has `user_prop_name` has a `true` value // or it doesn't contain this property (for backward compatible). @@ -640,8 +616,9 @@ Status BlockBasedTable::Open( IOOptions opts; s = file->PrepareIOOptions(ro, opts); if (s.ok()) { - s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size, - &footer, kBlockBasedTableMagicNumber); + s = ReadFooterFromFile(opts, file.get(), *ioptions.fs, + prefetch_buffer.get(), file_size, &footer, + kBlockBasedTableMagicNumber); } if (!s.ok()) { return s; @@ -688,6 +665,17 @@ Status BlockBasedTable::Open( return s; } + // Populate BlockCreateContext + bool blocks_definitely_zstd_compressed = + rep->table_properties && + (rep->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); + rep->create_context = + BlockCreateContext(&rep->table_options, rep->ioptions.stats, + blocks_definitely_zstd_compressed); + // Check expected unique id if provided if (expected_unique_id != kNullUniqueId64x2) { auto props = rep->table_properties; @@ -904,11 +892,6 @@ Status BlockBasedTable::ReadPropertiesBlock( rep_->blocks_maybe_compressed = rep_->table_properties->compression_name != CompressionTypeToString(kNoCompression); - rep_->blocks_definitely_zstd_compressed = - (rep_->table_properties->compression_name == - CompressionTypeToString(kZSTD) || - rep_->table_properties->compression_name == - CompressionTypeToString(kZSTDNotFinalCompression)); } } else { ROCKS_LOG_ERROR(rep_->ioptions.logger, @@ -1248,15 +1231,14 @@ Status BlockBasedTable::ReadMetaIndexBlock( std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. 
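// Aside — the "definitely ZSTD" probe that Open() now uses to populate
// BlockCreateContext, as a standalone sketch (toy constants standing in for
// the names CompressionTypeToString() produces; treat them as illustrative):
#include <string>

static const std::string kZstdName = "ZSTD";                  // toy stand-in
static const std::string kZstdNotFinalName = "ZSTDNotFinal";  // toy stand-in

// True only when the stored compression name pins the file to ZSTD; a false
// negative merely skips the digested-dictionary fast path.
static bool DefinitelyZstd(const std::string& compression_name) {
  return compression_name == kZstdName ||
         compression_name == kZstdNotFinalName;
}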
- std::unique_ptr metaindex; - Status s = ReadBlockFromFile( + std::unique_ptr metaindex; + Status s = ReadAndParseBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions, - true /* decompress */, true /*maybe_compressed*/, BlockType::kMetaIndex, + rep_->create_context, true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options, - 0 /* read_amp_bytes_per_bit */, GetMemoryAllocator(rep_->table_options), - false /* for_compaction */, rep_->blocks_definitely_zstd_compressed, - nullptr /* filter_policy */, false /* async_read */); + GetMemoryAllocator(rep_->table_options), false /* for_compaction */, + false /* async_read */); if (!s.ok()) { ROCKS_LOG_ERROR(rep_->ioptions.logger, @@ -1273,16 +1255,13 @@ Status BlockBasedTable::ReadMetaIndexBlock( } template -Status BlockBasedTable::GetDataBlockFromCache( - const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, +WithBlocklikeCheck BlockBasedTable::GetDataBlockFromCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CompressedBlockCacheInterface block_cache_compressed, const ReadOptions& read_options, CachableEntry* out_parsed_block, - const UncompressionDict& uncompression_dict, BlockType block_type, - const bool wait, GetContext* get_context) const { - const size_t read_amp_bytes_per_bit = - block_type == BlockType::kData - ? rep_->table_options.read_amp_bytes_per_bit - : 0; + const UncompressionDict& uncompression_dict, const bool wait, + GetContext* get_context) const { assert(out_parsed_block); assert(out_parsed_block->IsEmpty()); // Here we treat the legacy name "...index_and_filter_blocks..." to mean all @@ -1293,33 +1272,33 @@ Status BlockBasedTable::GetDataBlockFromCache( // high-priority treatment if it should go into BlockCache. const Cache::Priority priority = rep_->table_options.cache_index_and_filter_blocks_with_high_priority && - block_type != BlockType::kData && - block_type != BlockType::kProperties + TBlocklike::kBlockType != BlockType::kData && + TBlocklike::kBlockType != BlockType::kProperties ? Cache::Priority::HIGH : Cache::Priority::LOW; Status s; - BlockContents* compressed_block = nullptr; - Cache::Handle* block_cache_compressed_handle = nullptr; Statistics* statistics = rep_->ioptions.statistics.get(); - bool using_zstd = rep_->blocks_definitely_zstd_compressed; - const FilterPolicy* filter_policy = rep_->filter_policy; - Cache::CreateCallback create_cb = GetCreateCallback( - read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); // Lookup uncompressed cache first - if (block_cache != nullptr) { + if (block_cache) { assert(!cache_key.empty()); - Cache::Handle* cache_handle = nullptr; - cache_handle = GetEntryFromCache( - rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, - block_type, wait, get_context, - BlocklikeTraits::GetCacheItemHelper(block_type), create_cb, - priority); - if (cache_handle != nullptr) { - out_parsed_block->SetCachedValue( - reinterpret_cast(block_cache->Value(cache_handle)), - block_cache, cache_handle); + auto cache_handle = block_cache.LookupFull( + cache_key, &rep_->create_context, priority, wait, statistics, + rep_->ioptions.lowest_used_cache_tier); + + // Avoid updating metrics here if the handle is not complete yet. This + // happens with MultiGet and secondary cache. 
So update the metrics only
+    // if it's a miss, or a hit and value is ready
+    if (!cache_handle) {
+      UpdateCacheMissMetrics(TBlocklike::kBlockType, get_context);
+    } else {
+      TBlocklike* value = block_cache.Value(cache_handle);
+      if (value) {
+        UpdateCacheHitMetrics(TBlocklike::kBlockType, get_context,
+                              block_cache.get()->GetUsage(cache_handle));
+      }
+      out_parsed_block->SetCachedValue(value, block_cache.get(), cache_handle);
       return s;
     }
   }
@@ -1327,24 +1306,14 @@ Status BlockBasedTable::GetDataBlockFromCache(
   // If not found, search from the compressed block cache.
   assert(out_parsed_block->IsEmpty());

-  if (block_cache_compressed == nullptr) {
+  if (!block_cache_compressed) {
     return s;
   }

   assert(!cache_key.empty());
   BlockContents contents;
-  if (rep_->ioptions.lowest_used_cache_tier ==
-      CacheTier::kNonVolatileBlockTier) {
-    Cache::CreateCallback create_cb_special = GetCreateCallback<BlockContents>(
-        read_amp_bytes_per_bit, statistics, using_zstd, filter_policy);
-    block_cache_compressed_handle = block_cache_compressed->Lookup(
-        cache_key,
-        BlocklikeTraits<BlockContents>::GetCacheItemHelper(block_type),
-        create_cb_special, priority, true);
-  } else {
-    block_cache_compressed_handle =
-        block_cache_compressed->Lookup(cache_key, statistics);
-  }
+  auto block_cache_compressed_handle =
+      block_cache_compressed.Lookup(cache_key, statistics);

   // if we found in the compressed cache, then uncompress and insert into
   // uncompressed cache
   if (block_cache_compressed_handle == nullptr) {
@@ -1355,8 +1324,8 @@ Status BlockBasedTable::GetDataBlockFromCache(

   // found compressed block
   RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
-  compressed_block = reinterpret_cast<BlockContents*>(
-      block_cache_compressed->Value(block_cache_compressed_handle));
+  BlockContents* compressed_block =
+      block_cache_compressed.Value(block_cache_compressed_handle);
   CompressionType compression_type =
       GetBlockCompressionType(*compressed_block);
   assert(compression_type != kNoCompression);
@@ -1371,27 +1340,21 @@ Status BlockBasedTable::GetDataBlockFromCache(

   // Insert parsed block into block cache, the priority is based on the
   // data block type.
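// Aside — the lookup-then-promote flow above, reduced to a standalone sketch
// (toy maps and a fake decompressor, not the RocksDB API): check the
// uncompressed tier first, fall back to the compressed tier, and on a
// compressed hit decompress and promote the block for the next reader.
#include <optional>
#include <string>
#include <unordered_map>

using ToyMap = std::unordered_map<std::string, std::string>;

static std::string ToyUncompress(const std::string& c) { return c + "+u"; }

static std::optional<std::string> ToyGet(ToyMap& uncompressed,
                                         ToyMap& compressed,
                                         const std::string& key) {
  if (auto it = uncompressed.find(key); it != uncompressed.end()) {
    return it->second;  // hit in the primary (uncompressed) tier
  }
  if (auto it = compressed.find(key); it != compressed.end()) {
    std::string value = ToyUncompress(it->second);
    uncompressed.emplace(key, value);  // promote for subsequent lookups
    return value;
  }
  return std::nullopt;  // miss in both tiers
}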
if (s.ok()) { - std::unique_ptr block_holder( - BlocklikeTraits::Create( - std::move(contents), read_amp_bytes_per_bit, statistics, - rep_->blocks_definitely_zstd_compressed, - rep_->table_options.filter_policy.get())); - - if (block_cache != nullptr && block_holder->own_bytes() && - read_options.fill_cache) { + std::unique_ptr block_holder; + rep_->create_context.Create(&block_holder, std::move(contents)); + + if (block_cache && block_holder->own_bytes() && read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); - Cache::Handle* cache_handle = nullptr; - auto block_holder_raw_ptr = block_holder.get(); - s = InsertEntryToCache( - rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, - BlocklikeTraits::GetCacheItemHelper(block_type), - std::move(block_holder), charge, &cache_handle, priority); + BlockCacheTypedHandle* cache_handle = nullptr; + s = block_cache.InsertFull(cache_key, block_holder.get(), charge, + &cache_handle, priority, + rep_->ioptions.lowest_used_cache_tier); if (s.ok()) { assert(cache_handle != nullptr); - out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache, - cache_handle); + out_parsed_block->SetCachedValue(block_holder.release(), + block_cache.get(), cache_handle); - UpdateCacheInsertionMetrics(block_type, get_context, charge, + UpdateCacheInsertionMetrics(TBlocklike::kBlockType, get_context, charge, s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); @@ -1402,27 +1365,23 @@ Status BlockBasedTable::GetDataBlockFromCache( } // Release hold on compressed cache entry - block_cache_compressed->Release(block_cache_compressed_handle); + block_cache_compressed.Release(block_cache_compressed_handle); return s; } template -Status BlockBasedTable::PutDataBlockToCache( - const Slice& cache_key, Cache* block_cache, Cache* block_cache_compressed, +WithBlocklikeCheck BlockBasedTable::PutDataBlockToCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CompressedBlockCacheInterface block_cache_compressed, CachableEntry* out_parsed_block, BlockContents&& block_contents, CompressionType block_comp_type, const UncompressionDict& uncompression_dict, - MemoryAllocator* memory_allocator, BlockType block_type, - GetContext* get_context) const { + MemoryAllocator* memory_allocator, GetContext* get_context) const { const ImmutableOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; - const size_t read_amp_bytes_per_bit = - block_type == BlockType::kData - ? rep_->table_options.read_amp_bytes_per_bit - : 0; const Cache::Priority priority = rep_->table_options.cache_index_and_filter_blocks_with_high_priority && - block_type != BlockType::kData + TBlocklike::kBlockType != BlockType::kData ? 
Cache::Priority::HIGH : Cache::Priority::LOW; assert(out_parsed_block); @@ -1444,21 +1403,15 @@ Status BlockBasedTable::PutDataBlockToCache( if (!s.ok()) { return s; } - - block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block_contents), read_amp_bytes_per_bit, - statistics, rep_->blocks_definitely_zstd_compressed, - rep_->table_options.filter_policy.get())); + rep_->create_context.Create(&block_holder, + std::move(uncompressed_block_contents)); } else { - block_holder.reset(BlocklikeTraits::Create( - std::move(block_contents), read_amp_bytes_per_bit, statistics, - rep_->blocks_definitely_zstd_compressed, - rep_->table_options.filter_policy.get())); + rep_->create_context.Create(&block_holder, std::move(block_contents)); } // Insert compressed block into compressed block cache. // Release the hold on the compressed cache entry immediately. - if (block_cache_compressed != nullptr && block_comp_type != kNoCompression && + if (block_cache_compressed && block_comp_type != kNoCompression && block_contents.own_bytes()) { assert(block_contents.has_trailer); assert(!cache_key.empty()); @@ -1468,15 +1421,14 @@ Status BlockBasedTable::PutDataBlockToCache( auto block_cont_for_comp_cache = std::make_unique(std::move(block_contents)); size_t charge = block_cont_for_comp_cache->ApproximateMemoryUsage(); - s = InsertEntryToCache( - rep_->ioptions.lowest_used_cache_tier, block_cache_compressed, - cache_key, - BlocklikeTraits::GetCacheItemHelper(block_type), - std::move(block_cont_for_comp_cache), charge, nullptr, - Cache::Priority::LOW); + + s = block_cache_compressed.Insert(cache_key, + block_cont_for_comp_cache.get(), charge, + nullptr /*handle*/, Cache::Priority::LOW); if (s.ok()) { - // Avoid the following code to delete this cached block. + // Cache took ownership + block_cont_for_comp_cache.release(); RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); @@ -1484,20 +1436,19 @@ Status BlockBasedTable::PutDataBlockToCache( } // insert into uncompressed block cache - if (block_cache != nullptr && block_holder->own_bytes()) { + if (block_cache && block_holder->own_bytes()) { size_t charge = block_holder->ApproximateMemoryUsage(); - auto block_holder_raw_ptr = block_holder.get(); - Cache::Handle* cache_handle = nullptr; - s = InsertEntryToCache( - rep_->ioptions.lowest_used_cache_tier, block_cache, cache_key, - BlocklikeTraits::GetCacheItemHelper(block_type), - std::move(block_holder), charge, &cache_handle, priority); + BlockCacheTypedHandle* cache_handle = nullptr; + s = block_cache.InsertFull(cache_key, block_holder.get(), charge, + &cache_handle, priority, + rep_->ioptions.lowest_used_cache_tier); + if (s.ok()) { assert(cache_handle != nullptr); - out_parsed_block->SetCachedValue(block_holder_raw_ptr, block_cache, - cache_handle); + out_parsed_block->SetCachedValue(block_holder.release(), + block_cache.get(), cache_handle); - UpdateCacheInsertionMetrics(block_type, get_context, charge, + UpdateCacheInsertionMetrics(TBlocklike::kBlockType, get_context, charge, s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); @@ -1553,6 +1504,7 @@ InternalIteratorBase* BlockBasedTable::NewIndexIterator( lookup_context); } +// TODO? template <> DataBlockIter* BlockBasedTable::InitBlockIterator( const Rep* rep, Block* block, BlockType block_type, @@ -1562,6 +1514,7 @@ DataBlockIter* BlockBasedTable::InitBlockIterator( rep->ioptions.stats, block_contents_pinned); } +// TODO? 
template <> IndexBlockIter* BlockBasedTable::InitBlockIterator( const Rep* rep, Block* block, BlockType block_type, @@ -1580,18 +1533,20 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( // the caller has already read it. In both cases, if ro.fill_cache is true, // it inserts the block into the block cache. template -Status BlockBasedTable::MaybeReadBlockAndLoadToCache( +WithBlocklikeCheck +BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, const bool wait, const bool for_compaction, - CachableEntry* out_parsed_block, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - BlockContents* contents, bool async_read) const { + CachableEntry* out_parsed_block, GetContext* get_context, + BlockCacheLookupContext* lookup_context, BlockContents* contents, + bool async_read) const { assert(out_parsed_block != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); - Cache* block_cache = rep_->table_options.block_cache.get(); - Cache* block_cache_compressed = - rep_->table_options.block_cache_compressed.get(); + BlockCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; + CompressedBlockCacheInterface block_cache_compressed{ + rep_->table_options.block_cache_compressed.get()}; // First, try to get the block from the cache // @@ -1600,15 +1555,15 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( CacheKey key_data; Slice key; bool is_cache_hit = false; - if (block_cache != nullptr || block_cache_compressed != nullptr) { + if (block_cache || block_cache_compressed) { // create key for block cache key_data = GetCacheKey(rep_->base_cache_key, handle); key = key_data.AsSlice(); if (!contents) { s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro, - out_parsed_block, uncompression_dict, - block_type, wait, get_context); + out_parsed_block, uncompression_dict, wait, + get_context); // Value could still be null at this point, so check the cache handle // and update the read pattern for prefetching if (out_parsed_block->GetValue() || out_parsed_block->GetCacheHandle()) { @@ -1633,8 +1588,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( ro.fill_cache) { Statistics* statistics = rep_->ioptions.stats; const bool maybe_compressed = - block_type != BlockType::kFilter && - block_type != BlockType::kCompressionDictionary && + TBlocklike::kBlockType != BlockType::kFilter && + TBlocklike::kBlockType != BlockType::kCompressionDictionary && rep_->blocks_maybe_compressed; const bool do_uncompress = maybe_compressed && !block_cache_compressed; CompressionType contents_comp_type; @@ -1647,7 +1602,8 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed, - block_type, uncompression_dict, rep_->persistent_cache_options, + TBlocklike::kBlockType, uncompression_dict, + rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), GetMemoryAllocatorForCompressedBlock(rep_->table_options)); @@ -1665,7 +1621,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( contents_comp_type = block_fetcher.get_compression_type(); contents = &tmp_contents; if (get_context) { - switch (block_type) { + switch (TBlocklike::kBlockType) { case BlockType::kIndex: ++get_context->get_context_stats_.num_index_read; break; @@ -1687,7 +1643,7 @@ Status 
BlockBasedTable::MaybeReadBlockAndLoadToCache( s = PutDataBlockToCache( key, block_cache, block_cache_compressed, out_parsed_block, std::move(*contents), contents_comp_type, uncompression_dict, - GetMemoryAllocator(rep_->table_options), block_type, get_context); + GetMemoryAllocator(rep_->table_options), get_context); } } } @@ -1699,13 +1655,13 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( uint64_t nkeys = 0; if (out_parsed_block->GetValue()) { // Approximate the number of keys in the block using restarts. + // FIXME: Should this only apply to data blocks? nkeys = rep_->table_options.block_restart_interval * - BlocklikeTraits::GetNumRestarts( - *out_parsed_block->GetValue()); + GetBlockNumRestarts(*out_parsed_block->GetValue()); usage = out_parsed_block->GetValue()->ApproximateMemoryUsage(); } TraceType trace_block_type = TraceType::kTraceMax; - switch (block_type) { + switch (TBlocklike::kBlockType) { case BlockType::kData: trace_block_type = TraceType::kBlockTraceDataBlock; break; @@ -1761,24 +1717,22 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( return s; } -template -Status BlockBasedTable::RetrieveBlock( +template +WithBlocklikeCheck BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* out_parsed_block, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache, bool wait_for_cache, - bool async_read) const { + CachableEntry* out_parsed_block, GetContext* get_context, + BlockCacheLookupContext* lookup_context, bool for_compaction, + bool use_cache, bool wait_for_cache, bool async_read) const { assert(out_parsed_block); assert(out_parsed_block->IsEmpty()); Status s; if (use_cache) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, wait_for_cache, - for_compaction, out_parsed_block, - block_type, get_context, lookup_context, - /*contents=*/nullptr, async_read); + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, + for_compaction, out_parsed_block, get_context, lookup_context, + /*contents=*/nullptr, async_read); if (!s.ok()) { return s; @@ -1799,29 +1753,23 @@ Status BlockBasedTable::RetrieveBlock( } const bool maybe_compressed = - block_type != BlockType::kFilter && - block_type != BlockType::kCompressionDictionary && + TBlocklike::kBlockType != BlockType::kFilter && + TBlocklike::kBlockType != BlockType::kCompressionDictionary && rep_->blocks_maybe_compressed; - const bool do_uncompress = maybe_compressed; std::unique_ptr block; { Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); - s = ReadBlockFromFile( + s = ReadAndParseBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, - rep_->ioptions, do_uncompress, maybe_compressed, block_type, + rep_->ioptions, rep_->create_context, maybe_compressed, uncompression_dict, rep_->persistent_cache_options, - block_type == BlockType::kData - ? 
rep_->table_options.read_amp_bytes_per_bit - : 0, - GetMemoryAllocator(rep_->table_options), for_compaction, - rep_->blocks_definitely_zstd_compressed, - rep_->table_options.filter_policy.get(), async_read); + GetMemoryAllocator(rep_->table_options), for_compaction, async_read); if (get_context) { - switch (block_type) { + switch (TBlocklike::kBlockType) { case BlockType::kIndex: ++(get_context->get_context_stats_.num_index_read); break; @@ -1845,40 +1793,6 @@ Status BlockBasedTable::RetrieveBlock( return s; } -// Explicitly instantiate templates for both "blocklike" types we use. -// This makes it possible to keep the template definitions in the .cc file. -template Status BlockBasedTable::RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* out_parsed_block, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache, bool wait_for_cache, - bool async_read) const; - -template Status BlockBasedTable::RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* out_parsed_block, - BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context, bool for_compaction, - bool use_cache, bool wait_for_cache, bool async_read) const; - -template Status BlockBasedTable::RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* out_parsed_block, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache, bool wait_for_cache, - bool async_read) const; - -template Status BlockBasedTable::RetrieveBlock( - FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, - const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* out_parsed_block, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache, bool wait_for_cache, - bool async_read) const; - BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, UnorderedMap>* block_map) @@ -2983,7 +2897,8 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { out_stream << " HEX " << user_key.ToString(true) << ": " << blockhandles_iter->value().ToString(true, rep_->index_has_first_key) - << "\n"; + << " offset " << blockhandles_iter->value().handle.offset() + << " size " << blockhandles_iter->value().handle.size() << "\n"; std::string str_key = user_key.ToString(); std::string res_key(""); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 7054a2dd438..55ef76c45fb 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -21,6 +21,7 @@ #include "rocksdb/table_properties.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" +#include "table/block_based/block_cache.h" #include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" #include "table/block_based/filter_block.h" @@ -315,22 +316,6 @@ class BlockBasedTable : public TableReader { void UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const; - Cache::Handle* 
GetEntryFromCache(const CacheTier& cache_tier, - Cache* block_cache, const Slice& key, - BlockType block_type, const bool wait, - GetContext* get_context, - const Cache::CacheItemHelper* cache_helper, - const Cache::CreateCallback& create_cb, - Cache::Priority priority) const; - - template - Status InsertEntryToCache(const CacheTier& cache_tier, Cache* block_cache, - const Slice& key, - const Cache::CacheItemHelper* cache_helper, - std::unique_ptr&& block_holder, - size_t charge, Cache::Handle** cache_handle, - Cache::Priority priority) const; - // Either Block::NewDataIterator() or Block::NewIndexIterator(). template static TBlockIter* InitBlockIterator(const Rep* rep, Block* block, @@ -348,26 +333,24 @@ class BlockBasedTable : public TableReader { // in uncompressed block cache, also sets cache_handle to reference that // block. template - Status MaybeReadBlockAndLoadToCache( + WithBlocklikeCheck MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, const bool wait, const bool for_compaction, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - BlockContents* contents, bool async_read) const; + CachableEntry* block_entry, GetContext* get_context, + BlockCacheLookupContext* lookup_context, BlockContents* contents, + bool async_read) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the // read options allow I/O). template - Status RetrieveBlock(FilePrefetchBuffer* prefetch_buffer, - const ReadOptions& ro, const BlockHandle& handle, - const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, - BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache, bool wait_for_cache, - bool async_read) const; + WithBlocklikeCheck RetrieveBlock( + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, GetContext* get_context, + BlockCacheLookupContext* lookup_context, bool for_compaction, + bool use_cache, bool wait_for_cache, bool async_read) const; DECLARE_SYNC_AND_ASYNC_CONST( void, RetrieveMultipleBlocks, const ReadOptions& options, @@ -403,13 +386,12 @@ class BlockBasedTable : public TableReader { // @param uncompression_dict Data for presetting the compression library's // dictionary. template - Status GetDataBlockFromCache(const Slice& cache_key, Cache* block_cache, - Cache* block_cache_compressed, - const ReadOptions& read_options, - CachableEntry* block, - const UncompressionDict& uncompression_dict, - BlockType block_type, const bool wait, - GetContext* get_context) const; + WithBlocklikeCheck GetDataBlockFromCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CompressedBlockCacheInterface block_cache_compressed, + const ReadOptions& read_options, CachableEntry* block, + const UncompressionDict& uncompression_dict, const bool wait, + GetContext* get_context) const; // Put a maybe compressed block to the corresponding block caches. // This method will perform decompression against block_contents if needed @@ -422,15 +404,13 @@ class BlockBasedTable : public TableReader { // @param uncompression_dict Data for presetting the compression library's // dictionary. 
template - Status PutDataBlockToCache(const Slice& cache_key, Cache* block_cache, - Cache* block_cache_compressed, - CachableEntry* cached_block, - BlockContents&& block_contents, - CompressionType block_comp_type, - const UncompressionDict& uncompression_dict, - MemoryAllocator* memory_allocator, - BlockType block_type, - GetContext* get_context) const; + WithBlocklikeCheck PutDataBlockToCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CompressedBlockCacheInterface block_cache_compressed, + CachableEntry* cached_block, BlockContents&& block_contents, + CompressionType block_comp_type, + const UncompressionDict& uncompression_dict, + MemoryAllocator* memory_allocator, GetContext* get_context) const; // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -597,7 +577,14 @@ struct BlockBasedTable::Rep { bool prefix_filtering; std::shared_ptr table_prefix_extractor; - std::shared_ptr fragmented_range_dels; + std::shared_ptr fragmented_range_dels; + + // FIXME + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. + BlockCreateContext create_context; // If global_seqno is used, all Keys in this file will have the same // seqno with value `global_seqno`. @@ -617,12 +604,6 @@ struct BlockBasedTable::Rep { // before reading individual blocks enables certain optimizations. bool blocks_maybe_compressed = true; - // If true, data blocks in this file are definitely ZSTD compressed. If false - // they might not be. When false we skip creating a ZSTD digested - // uncompression dictionary. Even if we get a false negative, things should - // still work, just not as quickly. - bool blocks_definitely_zstd_compressed = false; - // These describe how index is encoded. bool index_has_first_key = false; bool index_key_includes_seq = true; diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h index dc321567a36..105a479f36c 100644 --- a/table/block_based/block_based_table_reader_impl.h +++ b/table/block_based/block_based_table_reader_impl.h @@ -7,8 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "table/block_based/block_based_table_reader.h" +#include +#include "block.h" +#include "block_cache.h" +#include "table/block_based/block_based_table_reader.h" #include "table/block_based/reader_common.h" // The file contains some member functions of BlockBasedTable that @@ -17,6 +20,25 @@ // are templates. namespace ROCKSDB_NAMESPACE { +namespace { +using IterPlaceholderCacheInterface = + PlaceholderCacheInterface; + +template +struct IterTraits {}; + +template <> +struct IterTraits { + using IterBlocklike = Block_kData; +}; + +template <> +struct IterTraits { + using IterBlocklike = Block_kIndex; +}; + +} // namespace + // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. 
// If input_iter is null, new a iterator @@ -28,6 +50,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( BlockCacheLookupContext* lookup_context, FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read, Status& s) const { + using IterBlocklike = typename IterTraits::IterBlocklike; PERF_TIMER_GUARD(new_table_block_iter_nanos); TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter; @@ -40,9 +63,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( if (rep_->uncompression_dict_reader && block_type == BlockType::kData) { CachableEntry uncompression_dict; const bool no_io = (ro.read_tier == kBlockCacheTier); + // For async scans, don't use the prefetch buffer since an async prefetch + // might already be under way and this would invalidate it. Also, the + // uncompression dict is typically at the end of the file and would + // most likely break the sequentiality of the access pattern. s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - prefetch_buffer, no_io, ro.verify_checksums, get_context, - lookup_context, &uncompression_dict); + ro.async_io ? nullptr : prefetch_buffer, no_io, ro.verify_checksums, + get_context, lookup_context, &uncompression_dict); if (!s.ok()) { iter->Invalidate(s); return iter; @@ -50,14 +77,14 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( const UncompressionDict& dict = uncompression_dict.GetValue() ? *uncompression_dict.GetValue() : UncompressionDict::GetEmptyDict(); - s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, - get_context, lookup_context, for_compaction, - /* use_cache */ true, /* wait_for_cache */ true, - async_read); + s = RetrieveBlock( + prefetch_buffer, ro, handle, dict, &block.As(), + get_context, lookup_context, for_compaction, + /* use_cache */ true, /* wait_for_cache */ true, async_read); } else { s = RetrieveBlock( - prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), &block, - block_type, get_context, lookup_context, for_compaction, + prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), + &block.As(), get_context, lookup_context, for_compaction, /* use_cache */ true, /* wait_for_cache */ true, async_read); } @@ -88,18 +115,20 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( if (!block.IsCached()) { if (!ro.fill_cache) { - Cache* const block_cache = rep_->table_options.block_cache.get(); + IterPlaceholderCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; if (block_cache) { // insert a dummy record to block cache to track the memory usage Cache::Handle* cache_handle = nullptr; - CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); - s = block_cache->Insert(key.AsSlice(), nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); + CacheKey key = + CacheKey::CreateUniqueForCacheLifetime(block_cache.get()); + s = block_cache.Insert(key.AsSlice(), + block.GetValue()->ApproximateMemoryUsage(), + &cache_handle); if (s.ok()) { assert(cache_handle != nullptr); - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache.get(), cache_handle); } } @@ -146,18 +175,20 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(const ReadOptions& ro, if (!block.IsCached()) { if (!ro.fill_cache) { - Cache* const block_cache = rep_->table_options.block_cache.get(); + IterPlaceholderCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; if (block_cache) { // insert a dummy record to block cache to track the memory 
usage Cache::Handle* cache_handle = nullptr; - CacheKey key = CacheKey::CreateUniqueForCacheLifetime(block_cache); - s = block_cache->Insert(key.AsSlice(), nullptr, - block.GetValue()->ApproximateMemoryUsage(), - nullptr, &cache_handle); + CacheKey key = + CacheKey::CreateUniqueForCacheLifetime(block_cache.get()); + s = block_cache.Insert(key.AsSlice(), + block.GetValue()->ApproximateMemoryUsage(), + &cache_handle); if (s.ok()) { assert(cache_handle != nullptr); - iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache.get(), cache_handle); } } diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index 8c7547a2a44..ea75f631d97 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -54,7 +54,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) (*statuses)[idx_in_batch] = RetrieveBlock(nullptr, options, handle, uncompression_dict, - &(*results)[idx_in_batch], BlockType::kData, + &(*results)[idx_in_batch].As(), mget_iter->get_context, &lookup_data_block_context, /* for_compaction */ false, /* use_cache */ true, /* wait_for_cache */ true, /* async_read */ false); @@ -269,7 +269,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // will avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( nullptr, options, handle, uncompression_dict, /*wait=*/true, - /*for_compaction=*/false, block_entry, BlockType::kData, + /*for_compaction=*/false, &block_entry->As(), mget_iter->get_context, &lookup_data_block_context, &serialized_block, /*async_read=*/false); @@ -441,7 +441,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) ? *uncompression_dict.GetValue() : UncompressionDict::GetEmptyDict(); Status s = RetrieveBlock( - nullptr, ro, handle, dict, &(results.back()), BlockType::kData, + nullptr, ro, handle, dict, &(results.back()).As(), miter->get_context, &lookup_data_block_context, /* for_compaction */ false, /* use_cache */ true, /* wait_for_cache */ false, /* async_read */ false); diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index c5a615dfc66..4a2ef7ed5fc 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -50,7 +50,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test { // Internal key is constructed directly from this key, // and internal key size is required to be >= 8 bytes, // so use %08u as the format string. - sprintf(k, "%08u", key); + snprintf(k, sizeof(k), "%08u", key); std::string v; if (mixed_with_human_readable_string_value) { v = (block % 2) ? rnd.HumanReadableString(256) diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc index 850693454bd..92702b17d0b 100644 --- a/table/block_based/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -34,7 +34,9 @@ #include "table/block_based/block_builder.h" #include + #include + #include "db/dbformat.h" #include "rocksdb/comparator.h" #include "table/block_based/data_block_footer.h" diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h index 7702ef1172e..5f68b449bf3 100644 --- a/table/block_based/block_builder.h +++ b/table/block_based/block_builder.h @@ -8,9 +8,10 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #pragma once +#include + #include -#include #include "rocksdb/slice.h" #include "rocksdb/table.h" #include "table/block_based/data_block_hash_index.h" diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc new file mode 100644 index 00000000000..86a3918448d --- /dev/null +++ b/table/block_based/block_cache.cc @@ -0,0 +1,96 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "table/block_based/block_cache.h" + +namespace ROCKSDB_NAMESPACE { + +void BlockCreateContext::Create(std::unique_ptr* parsed_out, + BlockContents&& block) { + parsed_out->reset(new Block_kData( + std::move(block), table_options->read_amp_bytes_per_bit, statistics)); +} +void BlockCreateContext::Create(std::unique_ptr* parsed_out, + BlockContents&& block) { + parsed_out->reset(new Block_kIndex(std::move(block), + /*read_amp_bytes_per_bit*/ 0, statistics)); +} +void BlockCreateContext::Create( + std::unique_ptr* parsed_out, + BlockContents&& block) { + parsed_out->reset(new Block_kFilterPartitionIndex( + std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); +} +void BlockCreateContext::Create( + std::unique_ptr* parsed_out, BlockContents&& block) { + parsed_out->reset(new Block_kRangeDeletion( + std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); +} +void BlockCreateContext::Create(std::unique_ptr* parsed_out, + BlockContents&& block) { + parsed_out->reset(new Block_kMetaIndex( + std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); +} + +void BlockCreateContext::Create( + std::unique_ptr* parsed_out, BlockContents&& block) { + parsed_out->reset(new ParsedFullFilterBlock( + table_options->filter_policy.get(), std::move(block))); +} + +void BlockCreateContext::Create(std::unique_ptr* parsed_out, + BlockContents&& block) { + parsed_out->reset(new UncompressionDict( + block.data, std::move(block.allocation), using_zstd)); +} + +namespace { +// For getting SecondaryCache-compatible helpers from a BlockType. This is +// useful for accessing block cache in untyped contexts, such as for generic +// cache warming in table builder. 
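// Aside — the BlockCreateContext::Create() overload set defined above,
// reduced to a standalone sketch (toy types, not RocksDB code): the
// out-parameter's pointee type selects the overload at compile time, so one
// generic call site can build any supported "blocklike" type without
// switching on a runtime tag.
#include <memory>

struct Apple { int a = 1; };
struct Pear  { int p = 2; };

struct ToyFactory {
  void Create(std::unique_ptr<Apple>* out) { out->reset(new Apple()); }
  void Create(std::unique_ptr<Pear>* out) { out->reset(new Pear()); }

  template <typename T>
  std::unique_ptr<T> Make() {
    std::unique_ptr<T> result;
    Create(&result);  // overload picked by T; unsupported T fails to compile
    return result;
  }
};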
+constexpr std::array(BlockType::kInvalid) + 1> + kCacheItemFullHelperForBlockType{{ + &BlockCacheInterface::kFullHelper, + &BlockCacheInterface::kFullHelper, + &BlockCacheInterface::kFullHelper, + nullptr, // kProperties + &BlockCacheInterface::kFullHelper, + &BlockCacheInterface::kFullHelper, + nullptr, // kHashIndexPrefixes + nullptr, // kHashIndexMetadata + nullptr, // kMetaIndex (not yet stored in block cache) + &BlockCacheInterface::kFullHelper, + nullptr, // kInvalid + }}; + +// For getting basic helpers from a BlockType (no SecondaryCache support) +constexpr std::array(BlockType::kInvalid) + 1> + kCacheItemBasicHelperForBlockType{{ + &BlockCacheInterface::kBasicHelper, + &BlockCacheInterface::kBasicHelper, + &BlockCacheInterface::kBasicHelper, + nullptr, // kProperties + &BlockCacheInterface::kBasicHelper, + &BlockCacheInterface::kBasicHelper, + nullptr, // kHashIndexPrefixes + nullptr, // kHashIndexMetadata + nullptr, // kMetaIndex (not yet stored in block cache) + &BlockCacheInterface::kBasicHelper, + nullptr, // kInvalid + }}; +} // namespace + +const Cache::CacheItemHelper* GetCacheItemHelper( + BlockType block_type, CacheTier lowest_used_cache_tier) { + if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) { + return kCacheItemFullHelperForBlockType[static_cast(block_type)]; + } else { + return kCacheItemBasicHelperForBlockType[static_cast(block_type)]; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h new file mode 100644 index 00000000000..8a881595baf --- /dev/null +++ b/table/block_based/block_cache.h @@ -0,0 +1,132 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Code supporting block cache (Cache) access for block-based table, based on +// the convenient APIs in typed_cache.h + +#pragma once + +#include + +#include "cache/typed_cache.h" +#include "port/lang.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +// Metaprogramming wrappers for Block, to give each type a single role when +// used with FullTypedCacheInterface. 
+// (NOTE: previous attempts to create actual derived classes of Block with +// virtual calls resulted in performance regression) + +class Block_kData : public Block { + public: + using Block::Block; + + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kDataBlock; + static constexpr BlockType kBlockType = BlockType::kData; +}; + +class Block_kIndex : public Block { + public: + using Block::Block; + + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kIndexBlock; + static constexpr BlockType kBlockType = BlockType::kIndex; +}; + +class Block_kFilterPartitionIndex : public Block { + public: + using Block::Block; + + static constexpr CacheEntryRole kCacheEntryRole = + CacheEntryRole::kFilterMetaBlock; + static constexpr BlockType kBlockType = BlockType::kFilterPartitionIndex; +}; + +class Block_kRangeDeletion : public Block { + public: + using Block::Block; + + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock; + static constexpr BlockType kBlockType = BlockType::kRangeDeletion; +}; + +// Useful for creating the Block even though meta index blocks are not +// yet stored in block cache +class Block_kMetaIndex : public Block { + public: + using Block::Block; + + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock; + static constexpr BlockType kBlockType = BlockType::kMetaIndex; +}; + +struct BlockCreateContext : public Cache::CreateContext { + BlockCreateContext() {} + BlockCreateContext(const BlockBasedTableOptions* _table_options, + Statistics* _statistics, bool _using_zstd) + : table_options(_table_options), + statistics(_statistics), + using_zstd(_using_zstd) {} + + const BlockBasedTableOptions* table_options = nullptr; + Statistics* statistics = nullptr; + bool using_zstd = false; + + // For TypedCacheInterface + template + inline void Create(std::unique_ptr* parsed_out, + size_t* charge_out, const Slice& data, + MemoryAllocator* alloc) { + Create(parsed_out, + BlockContents(AllocateAndCopyBlock(data, alloc), data.size())); + *charge_out = parsed_out->get()->ApproximateMemoryUsage(); + } + + void Create(std::unique_ptr* parsed_out, BlockContents&& block); + void Create(std::unique_ptr* parsed_out, BlockContents&& block); + void Create(std::unique_ptr* parsed_out, + BlockContents&& block); + void Create(std::unique_ptr* parsed_out, + BlockContents&& block); + void Create(std::unique_ptr* parsed_out, + BlockContents&& block); + void Create(std::unique_ptr* parsed_out, + BlockContents&& block); + void Create(std::unique_ptr* parsed_out, + BlockContents&& block); +}; + +// Convenient cache interface to use with block_cache_compressed +using CompressedBlockCacheInterface = + BasicTypedCacheInterface; + +// Convenient cache interface to use for block_cache, with support for +// SecondaryCache. +template +using BlockCacheInterface = + FullTypedCacheInterface; + +// Shortcut name for cache handles under BlockCacheInterface +template +using BlockCacheTypedHandle = + typename BlockCacheInterface::TypedHandle; + +// Selects the right helper based on BlockType and CacheTier +const Cache::CacheItemHelper* GetCacheItemHelper( + BlockType block_type, + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier); + +// For SFINAE check that a type is "blocklike" with a kCacheEntryRole member. +// Can get difficult compiler/linker errors without a good check like this. 
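// Aside — the WithBlocklikeCheck alias declared just below, as a standalone
// sketch (toy names, not RocksDB code). The "|| true" keeps the condition
// always satisfiable while still requiring T::kCacheEntryRole to exist, so a
// misuse surfaces as a clear "no member" compile error instead of a cryptic
// linker failure against a missing template instantiation.
#include <type_traits>

enum class ToyRole { kData, kMisc };

template <typename T, typename TUse>
using WithRoleCheck =
    std::enable_if_t<T::kCacheEntryRole == ToyRole::kMisc || true, TUse>;

struct Tagged { static constexpr ToyRole kCacheEntryRole = ToyRole::kData; };
struct Untagged {};

template <typename T>
WithRoleCheck<T, int> UseBlock(const T&) { return 0; }

// UseBlock(Tagged{}) compiles; UseBlock(Untagged{}) is rejected with a
// diagnostic naming the missing kCacheEntryRole member.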
+template +using WithBlocklikeCheck = std::enable_if_t< + TBlocklike::kCacheEntryRole == CacheEntryRole::kMisc || true, TUse>; + +} // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_like_traits.h b/table/block_based/block_like_traits.h deleted file mode 100644 index aeb2551147a..00000000000 --- a/table/block_based/block_like_traits.h +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include "cache/cache_entry_roles.h" -#include "port/lang.h" -#include "table/block_based/block.h" -#include "table/block_based/block_type.h" -#include "table/block_based/parsed_full_filter_block.h" -#include "table/format.h" - -namespace ROCKSDB_NAMESPACE { - -template -class BlocklikeTraits; - -template -Cache::CacheItemHelper* GetCacheItemHelperForRole(); - -template -Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit, - Statistics* statistics, bool using_zstd, - const FilterPolicy* filter_policy) { - return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy]( - const void* buf, size_t size, void** out_obj, - size_t* charge) -> Status { - assert(buf != nullptr); - std::unique_ptr buf_data(new char[size]()); - memcpy(buf_data.get(), buf, size); - BlockContents bc = BlockContents(std::move(buf_data), size); - TBlocklike* ucd_ptr = BlocklikeTraits::Create( - std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd, - filter_policy); - *out_obj = reinterpret_cast(ucd_ptr); - *charge = size; - return Status::OK(); - }; -} - -template <> -class BlocklikeTraits { - public: - static BlockContents* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new BlockContents(std::move(contents)); - } - - static uint32_t GetNumRestarts(const BlockContents& /* contents */) { - return 0; - } - - static size_t SizeCallback(void* obj) { - assert(obj != nullptr); - BlockContents* ptr = static_cast(obj); - return ptr->data.size(); - } - - static Status SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out) { - assert(from_obj != nullptr); - BlockContents* ptr = static_cast(from_obj); - const char* buf = ptr->data.data(); - assert(length == ptr->data.size()); - (void)from_offset; - memcpy(out, buf, length); - return Status::OK(); - } - - static Cache::CacheItemHelper* GetCacheItemHelper(BlockType /*block_type*/) { - // E.g. 
compressed cache - return GetCacheItemHelperForRole(); - } -}; - -template <> -class BlocklikeTraits { - public: - static ParsedFullFilterBlock* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* filter_policy) { - return new ParsedFullFilterBlock(filter_policy, std::move(contents)); - } - - static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { - return 0; - } - - static size_t SizeCallback(void* obj) { - assert(obj != nullptr); - ParsedFullFilterBlock* ptr = static_cast(obj); - return ptr->GetBlockContentsData().size(); - } - - static Status SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out) { - assert(from_obj != nullptr); - ParsedFullFilterBlock* ptr = static_cast(from_obj); - const char* buf = ptr->GetBlockContentsData().data(); - assert(length == ptr->GetBlockContentsData().size()); - (void)from_offset; - memcpy(out, buf, length); - return Status::OK(); - } - - static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { - (void)block_type; - assert(block_type == BlockType::kFilter); - return GetCacheItemHelperForRole(); - } -}; - -template <> -class BlocklikeTraits { - public: - static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, - Statistics* statistics, bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); - } - - static uint32_t GetNumRestarts(const Block& block) { - return block.NumRestarts(); - } - - static size_t SizeCallback(void* obj) { - assert(obj != nullptr); - Block* ptr = static_cast(obj); - return ptr->size(); - } - - static Status SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out) { - assert(from_obj != nullptr); - Block* ptr = static_cast(from_obj); - const char* buf = ptr->data(); - assert(length == ptr->size()); - (void)from_offset; - memcpy(out, buf, length); - return Status::OK(); - } - - static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { - switch (block_type) { - case BlockType::kData: - return GetCacheItemHelperForRole(); - case BlockType::kIndex: - return GetCacheItemHelperForRole(); - case BlockType::kFilterPartitionIndex: - return GetCacheItemHelperForRole(); - default: - // Not a recognized combination - assert(false); - FALLTHROUGH_INTENDED; - case BlockType::kRangeDeletion: - return GetCacheItemHelperForRole(); - } - } -}; - -template <> -class BlocklikeTraits { - public: - static UncompressionDict* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool using_zstd, - const FilterPolicy* /* filter_policy */) { - return new UncompressionDict(contents.data, std::move(contents.allocation), - using_zstd); - } - - static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { - return 0; - } - - static size_t SizeCallback(void* obj) { - assert(obj != nullptr); - UncompressionDict* ptr = static_cast(obj); - return ptr->slice_.size(); - } - - static Status SaveToCallback(void* from_obj, size_t from_offset, - size_t length, void* out) { - assert(from_obj != nullptr); - UncompressionDict* ptr = static_cast(from_obj); - const char* buf = ptr->slice_.data(); - assert(length == ptr->slice_.size()); - (void)from_offset; - memcpy(out, buf, length); - return Status::OK(); - } - - static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { - (void)block_type; - 
-    assert(block_type == BlockType::kCompressionDictionary);
-    return GetCacheItemHelperForRole<UncompressionDict,
-                                     CacheEntryRole::kOtherBlock>();
-  }
-};
-
-// Get an CacheItemHelper pointer for value type T and role R.
-template <typename T, CacheEntryRole R>
-Cache::CacheItemHelper* GetCacheItemHelperForRole() {
-  static Cache::CacheItemHelper cache_helper(
-      BlocklikeTraits<T>::SizeCallback, BlocklikeTraits<T>::SaveToCallback,
-      GetCacheEntryDeleterForRole<T, R>());
-  return &cache_helper;
-}
-
-}  // namespace ROCKSDB_NAMESPACE
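The deleted GetCacheItemHelperForRole() above relied on an idiom worth noting: a function-local static inside a function template yields exactly one object per instantiation, constructed lazily and thread-safely since C++11. A minimal sketch under simplified stand-in types (Role and Helper are not RocksDB types):

    #include <cstdio>

    enum class Role { kData, kFilter };

    struct Helper {
      Role role;
    };

    template <typename T, Role R>
    Helper* HelperForRole() {
      static Helper helper{R};  // one distinct object per <T, R> pair
      return &helper;
    }

    int main() {
      // Same instantiation -> same address; different role -> different one.
      std::printf("%d\n", HelperForRole<int, Role::kData>() ==
                              HelperForRole<int, Role::kData>());    // 1
      std::printf("%d\n", HelperForRole<int, Role::kData>() ==
                              HelperForRole<int, Role::kFilter>());  // 0
      return 0;
    }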
diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h
index d21385f03ce..464dc8ebaf4 100644
--- a/table/block_based/cachable_entry.h
+++ b/table/block_based/cachable_entry.h
@@ -10,6 +10,8 @@
 #pragma once
 
 #include <cassert>
+#include <type_traits>
+
 #include "port/likely.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/cleanable.h"
@@ -40,18 +42,17 @@ namespace ROCKSDB_NAMESPACE {
 
 template <class T>
 class CachableEntry {
-public:
+ public:
   CachableEntry() = default;
 
   CachableEntry(T* value, Cache* cache, Cache::Handle* cache_handle,
-    bool own_value)
-    : value_(value)
-    , cache_(cache)
-    , cache_handle_(cache_handle)
-    , own_value_(own_value)
-  {
+                bool own_value)
+      : value_(value),
+        cache_(cache),
+        cache_handle_(cache_handle),
+        own_value_(own_value) {
     assert(value_ != nullptr ||
-      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+           (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
     assert(!!cache_ == !!cache_handle_);
     assert(!cache_handle_ || !own_value_);
   }
@@ -65,7 +66,7 @@ class CachableEntry {
         cache_handle_(rhs.cache_handle_),
         own_value_(rhs.own_value_) {
     assert(value_ != nullptr ||
-      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+           (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
     assert(!!cache_ == !!cache_handle_);
     assert(!cache_handle_ || !own_value_);
 
@@ -85,7 +86,7 @@ class CachableEntry {
     own_value_ = rhs.own_value_;
 
     assert(value_ != nullptr ||
-      (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
+           (cache_ == nullptr && cache_handle_ == nullptr && !own_value_));
     assert(!!cache_ == !!cache_handle_);
     assert(!cache_handle_ || !own_value_);
 
@@ -94,13 +95,11 @@ class CachableEntry {
     return *this;
   }
 
-  ~CachableEntry() {
-    ReleaseResource();
-  }
+  ~CachableEntry() { ReleaseResource(); }
 
   bool IsEmpty() const {
     return value_ == nullptr && cache_ == nullptr && cache_handle_ == nullptr &&
-      !own_value_;
+           !own_value_;
   }
 
   bool IsCached() const {
@@ -193,22 +192,45 @@ class CachableEntry {
     return true;
   }
 
-private:
-  void ReleaseResource() noexcept {
-    if (LIKELY(cache_handle_ != nullptr)) {
-      assert(cache_ != nullptr);
-      cache_->Release(cache_handle_);
-    } else if (own_value_) {
-      delete value_;
-    }
-  }
+  // Since this class is essentially an elaborate pointer, it's sometimes
+  // useful to be able to upcast or downcast the base type of the pointer,
+  // especially when interacting with typed_cache.h.
+  template <class TWrapper>
+  std::enable_if_t<sizeof(TWrapper) == sizeof(T) &&
+                       (std::is_base_of_v<TWrapper, T> ||
+                        std::is_base_of_v<T, TWrapper>),
+                   /* Actual return type */
+                   CachableEntry<TWrapper>&>
+  As() {
+    CachableEntry<TWrapper>* result_ptr =
+        reinterpret_cast<CachableEntry<TWrapper>*>(this);
+    // Ensure no weirdness in template instantiations
+    assert(static_cast<void*>(&this->value_) ==
+           static_cast<void*>(&result_ptr->value_));
+    assert(&this->cache_handle_ == &result_ptr->cache_handle_);
+    // This function depends on no arithmetic involved in the pointer
+    // conversion, which is not statically checkable.
+    assert(static_cast<void*>(this->value_) ==
+           static_cast<void*>(result_ptr->value_));
+    return *result_ptr;
+  }
+
+ private:
+  void ReleaseResource() noexcept {
+    if (LIKELY(cache_handle_ != nullptr)) {
+      assert(cache_ != nullptr);
+      cache_->Release(cache_handle_);
+    } else if (own_value_) {
+      delete value_;
+    }
+  }
 
-  void ResetFields() noexcept {
-    value_ = nullptr;
-    cache_ = nullptr;
-    cache_handle_ = nullptr;
-    own_value_ = false;
-  }
+  void ResetFields() noexcept {
+    value_ = nullptr;
+    cache_ = nullptr;
+    cache_handle_ = nullptr;
+    own_value_ = false;
+  }
 
   static void ReleaseCacheHandle(void* arg1, void* arg2) {
     Cache* const cache = static_cast<Cache*>(arg1);
@@ -224,7 +246,11 @@ class CachableEntry {
     delete static_cast<T*>(arg1);
   }
 
-private:
+ private:
+  // Have to be your own best friend
+  template <class TWrapper>
+  friend class CachableEntry;
+
   T* value_ = nullptr;
   Cache* cache_ = nullptr;
   Cache::Handle* cache_handle_ = nullptr;
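The new CachableEntry<T>::As() above is a reinterpret_cast in disguise; it is only sound when the two wrapper instantiations have identical layout and the pointer conversion needs no address adjustment, which the asserts spot-check at runtime. A minimal sketch of the same trick with hypothetical Wrapper/Base/Derived types (not RocksDB code):

    #include <cassert>

    struct Base { virtual ~Base() = default; };
    struct Derived : Base {};

    template <class T>
    struct Wrapper {
      T* value = nullptr;

      template <class U>
      Wrapper<U>& As() {
        static_assert(sizeof(Wrapper<U>) == sizeof(Wrapper<T>), "same layout");
        Wrapper<U>* result = reinterpret_cast<Wrapper<U>*>(this);
        // Holds only if Derived* -> Base* involves no offset (single
        // inheritance, no virtual bases), the same precondition the real
        // CachableEntry::As() asserts.
        assert(static_cast<void*>(value) == static_cast<void*>(result->value));
        return *result;
      }
    };

    int main() {
      Derived d;
      Wrapper<Derived> wd;
      wd.value = &d;
      Wrapper<Base>& wb = wd.As<Base>();
      assert(wb.value == &d);
      return 0;
    }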
diff --git a/table/block_based/data_block_hash_index.cc b/table/block_based/data_block_hash_index.cc
index 22247583432..c579dcc43dc 100644
--- a/table/block_based/data_block_hash_index.cc
+++ b/table/block_based/data_block_hash_index.cc
@@ -2,11 +2,12 @@
 // This source code is licensed under both the GPLv2 (found in the
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
+#include "table/block_based/data_block_hash_index.h"
+
 #include <string>
 #include <vector>
 
 #include "rocksdb/slice.h"
-#include "table/block_based/data_block_hash_index.h"
 #include "util/coding.h"
 #include "util/hash.h"
diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc
index b3106634b39..cd2e30833dd 100644
--- a/table/block_based/data_block_hash_index_test.cc
+++ b/table/block_based/data_block_hash_index_test.cc
@@ -130,7 +130,7 @@ TEST(DataBlockHashIndex, DataBlockHashTest) {
 
   ASSERT_EQ(buffer.size(), estimated_size);
 
-  buffer2 = buffer; // test for the correctness of relative offset
+  buffer2 = buffer;  // test for the correctness of relative offset
 
   Slice s(buffer2);
   DataBlockHashIndex index;
@@ -167,7 +167,7 @@ TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
 
   ASSERT_EQ(buffer.size(), estimated_size);
 
-  buffer2 = buffer; // test for the correctness of relative offset
+  buffer2 = buffer;  // test for the correctness of relative offset
 
   Slice s(buffer2);
   DataBlockHashIndex index;
@@ -208,7 +208,7 @@ TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
 
   ASSERT_EQ(buffer.size(), estimated_size);
 
-  buffer2 = buffer; // test for the correctness of relative offset
+  buffer2 = buffer;  // test for the correctness of relative offset
 
   Slice s(buffer2);
   DataBlockHashIndex index;
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index 804d6a354ab..e1e20699064 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -53,11 +53,11 @@ class FilterBlockBuilder {
   virtual ~FilterBlockBuilder() {}
 
   virtual void Add(
-      const Slice& key_without_ts) = 0;    // Add a key to current filter
-  virtual bool IsEmpty() const = 0;        // Empty == none added
+      const Slice& key_without_ts) = 0;  // Add a key to current filter
+  virtual bool IsEmpty() const = 0;      // Empty == none added
   // For reporting stats on how many entries the builder considered unique
   virtual size_t EstimateEntriesAdded() = 0;
-  Slice Finish() {                         // Generate Filter
+  Slice Finish() {  // Generate Filter
     const BlockHandle empty_handle;
     Status dont_care_status;
     auto ret = Finish(empty_handle, &dont_care_status);
diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc
index 6eb3c05fc3d..838fb5296a1 100644
--- a/table/block_based/filter_block_reader_common.cc
+++ b/table/block_based/filter_block_reader_common.cc
@@ -5,6 +5,8 @@
 //
 
 #include "table/block_based/filter_block_reader_common.h"
+
+#include "block_cache.h"
 #include "monitoring/perf_context_imp.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/parsed_full_filter_block.h"
@@ -16,7 +18,7 @@ Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock(
     const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
     const ReadOptions& read_options, bool use_cache, GetContext* get_context,
     BlockCacheLookupContext* lookup_context,
-    CachableEntry<TBlocklike>* filter_block, BlockType block_type) {
+    CachableEntry<TBlocklike>* filter_block) {
   PERF_TIMER_GUARD(read_filter_block_nanos);
 
   assert(table);
@@ -29,7 +31,7 @@ Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock(
   const Status s =
       table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle,
                            UncompressionDict::GetEmptyDict(), filter_block,
-                           block_type, get_context, lookup_context,
+                           get_context, lookup_context,
                            /* for_compaction */ false, use_cache,
                            /* wait_for_cache */ true, /* async_read */ false);
 
@@ -67,7 +69,7 @@ template <typename TBlocklike>
 Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock(
     bool no_io, GetContext* get_context,
     BlockCacheLookupContext* lookup_context,
-    CachableEntry<TBlocklike>* filter_block, BlockType block_type,
+    CachableEntry<TBlocklike>* filter_block,
     Env::IOPriority rate_limiter_priority) const {
   assert(filter_block);
 
@@ -84,7 +86,7 @@ Status FilterBlockReaderCommon<TBlocklike>::GetOrReadFilterBlock(
 
   return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options,
                          cache_filter_blocks(), get_context, lookup_context,
-                         filter_block, block_type);
+                         filter_block);
 }
 
 template <typename TBlocklike>
@@ -157,8 +159,7 @@ bool FilterBlockReaderCommon<TBlocklike>::IsFilterCompatible(
 
 // Explicitly instantiate templates for both "blocklike" types we use.
 // This makes it possible to keep the template definitions in the .cc file.
-template class FilterBlockReaderCommon<BlockContents>;
-template class FilterBlockReaderCommon<Block>;
+template class FilterBlockReaderCommon<Block_kFilterPartitionIndex>;
 template class FilterBlockReaderCommon<ParsedFullFilterBlock>;
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h
index ca07f505085..5c2fbdcea7b 100644
--- a/table/block_based/filter_block_reader_common.h
+++ b/table/block_based/filter_block_reader_common.h
@@ -8,7 +8,6 @@
 
 #include <stddef.h>
 
-#include "block_type.h"
 #include "table/block_based/cachable_entry.h"
 #include "table/block_based/filter_block.h"
 
@@ -49,8 +48,7 @@ class FilterBlockReaderCommon : public FilterBlockReader {
                                const ReadOptions& read_options, bool use_cache,
                                GetContext* get_context,
                                BlockCacheLookupContext* lookup_context,
-                               CachableEntry<TBlocklike>* filter_block,
-                               BlockType block_type);
+                               CachableEntry<TBlocklike>* filter_block);
 
   const BlockBasedTable* table() const { return table_; }
   const SliceTransform* table_prefix_extractor() const;
@@ -60,7 +58,6 @@ class FilterBlockReaderCommon : public FilterBlockReader {
   Status GetOrReadFilterBlock(bool no_io, GetContext* get_context,
                               BlockCacheLookupContext* lookup_context,
                               CachableEntry<TBlocklike>* filter_block,
-                              BlockType block_type,
                               Env::IOPriority rate_limiter_priority) const;
 
   size_t ApproximateFilterBlockMemoryUsage() const;
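The explicit instantiations kept above use a standard technique for class templates: member definitions stay in the .cc file, and a "template class" line emits object code for each type actually used. A minimal single-file sketch (ReaderCommon is a hypothetical name; everything is in one file for brevity, whereas RocksDB splits it across the .h and .cc):

    #include <cstdio>

    template <typename TBlocklike>
    class ReaderCommon {
     public:
      int Size() const;  // declared here, defined out of line below
    };

    // Out-of-line definition (in RocksDB this lives in the .cc file).
    template <typename TBlocklike>
    int ReaderCommon<TBlocklike>::Size() const {
      return static_cast<int>(sizeof(TBlocklike));
    }

    // Explicit instantiations: code is generated only for these types;
    // any other instantiation would fail to link, which keeps the list honest.
    template class ReaderCommon<int>;
    template class ReaderCommon<double>;

    int main() {
      ReaderCommon<int> r;
      std::printf("%d\n", r.Size());  // prints sizeof(int)
      return 0;
    }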
diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index 2c220da98a6..f84f804dd6c 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -1422,10 +1422,9 @@ FilterBitsBuilder* BloomLikeFilterPolicy::GetFastLocalBloomBuilderWithContext(
         CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>>(
         context.table_options.block_cache);
   }
-  return new FastLocalBloomBitsBuilder(
-      millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr,
-      cache_res_mgr,
-      context.table_options.detect_filter_construct_corruption);
+  return new FastLocalBloomBitsBuilder(
+      millibits_per_key_, offm ? &aggregate_rounding_balance_ : nullptr,
+      cache_res_mgr, context.table_options.detect_filter_construct_corruption);
 }
 
 FilterBitsBuilder* BloomLikeFilterPolicy::GetLegacyBloomBuilderWithContext(
@@ -1788,7 +1787,7 @@ FilterBuildingContext::FilterBuildingContext(
     const BlockBasedTableOptions& _table_options)
     : table_options(_table_options) {}
 
-FilterPolicy::~FilterPolicy() { }
+FilterPolicy::~FilterPolicy() {}
 
 std::shared_ptr<const FilterPolicy> BloomLikeFilterPolicy::Create(
     const std::string& name, double bits_per_key) {
diff --git a/table/block_based/flush_block_policy.cc b/table/block_based/flush_block_policy.cc
index f5fbe76fd49..9bb1f334b34 100644
--- a/table/block_based/flush_block_policy.cc
+++ b/table/block_based/flush_block_policy.cc
@@ -16,7 +16,6 @@
 #include "table/block_based/flush_block_policy.h"
 #include "table/format.h"
 
-
 namespace ROCKSDB_NAMESPACE {
 
 // Flush block by size
@@ -27,8 +26,7 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
   // @params block_size_deviation: This is used to close a block before it
   //         reaches the configured
   FlushBlockBySizePolicy(const uint64_t block_size,
-                         const uint64_t block_size_deviation,
-                         const bool align,
+                         const uint64_t block_size_deviation, const bool align,
                          const BlockBuilder& data_block_builder)
       : block_size_(block_size),
         block_size_deviation_limit_(
diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc
index a953559b0a9..a7680e494de 100644
--- a/table/block_based/full_filter_block.cc
+++ b/table/block_based/full_filter_block.cc
@@ -121,8 +121,7 @@ Slice FullFilterBlockBuilder::Finish(
 
 FullFilterBlockReader::FullFilterBlockReader(
     const BlockBasedTable* t, CachableEntry<ParsedFullFilterBlock>&& filter_block)
-    : FilterBlockReaderCommon(t, std::move(filter_block)) {
-}
+    : FilterBlockReaderCommon(t, std::move(filter_block)) {}
 
 bool FullFilterBlockReader::KeyMayMatch(const Slice& key, const bool no_io,
                                         const Slice* const /*const_ikey_ptr*/,
@@ -148,7 +147,7 @@ std::unique_ptr<FilterBlockReader> FullFilterBlockReader::Create(
   if (prefetch || !use_cache) {
     const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
                                      nullptr /* get_context */, lookup_context,
-                                     &filter_block, BlockType::kFilter);
+                                     &filter_block);
     if (!s.ok()) {
       IGNORE_STATUS_IF_ERROR(s);
       return std::unique_ptr<FilterBlockReader>();
@@ -178,9 +177,8 @@ bool FullFilterBlockReader::MayMatch(
     Env::IOPriority rate_limiter_priority) const {
   CachableEntry<ParsedFullFilterBlock> filter_block;
 
-  const Status s =
-      GetOrReadFilterBlock(no_io, get_context, lookup_context, &filter_block,
-                           BlockType::kFilter, rate_limiter_priority);
+  const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context,
+                                        &filter_block, rate_limiter_priority);
   if (!s.ok()) {
     IGNORE_STATUS_IF_ERROR(s);
     return true;
@@ -229,9 +227,9 @@ void FullFilterBlockReader::MayMatch(
     Env::IOPriority rate_limiter_priority) const {
   CachableEntry<ParsedFullFilterBlock> filter_block;
 
-  const Status s = GetOrReadFilterBlock(
-      no_io, range->begin()->get_context, lookup_context, &filter_block,
-      BlockType::kFilter, rate_limiter_priority);
+  const Status s =
+      GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context,
+                           &filter_block, rate_limiter_priority);
   if (!s.ok()) {
     IGNORE_STATUS_IF_ERROR(s);
     return;
diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h
index 25f0fbe0928..cd1771a388c 100644
--- a/table/block_based/full_filter_block.h
+++ b/table/block_based/full_filter_block.h
@@ -133,6 +133,7 @@ class FullFilterBlockReader
                 BlockCacheLookupContext* lookup_context,
                 Env::IOPriority rate_limiter_priority) override;
   size_t ApproximateMemoryUsage() const override;
+
  private:
   bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context,
                 BlockCacheLookupContext* lookup_context,
diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc
index 37a9a5085f2..bd98638e5b6 100644
--- a/table/block_based/full_filter_block_test.cc
+++ b/table/block_based/full_filter_block_test.cc
@@ -80,7 +80,6 @@ class TestFilterBitsReader : public FilterBitsReader {
   uint32_t len_;
 };
 
-
 class TestHashFilter : public FilterPolicy {
  public:
   const char* Name() const override { return "TestHashFilter"; }
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index 55a79729a3a..dd3be03316a 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -10,8 +10,8 @@
 #pragma once
 
 #include <assert.h>
-#include <inttypes.h>
 
+#include <inttypes.h>
 #include <list>
 #include <string>
 #include <unordered_map>
diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc
index 6584586c9b1..46c276e6be0 100644
--- a/table/block_based/index_reader_common.cc
+++ b/table/block_based/index_reader_common.cc
@@ -8,6 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "table/block_based/index_reader_common.h"
 
+#include "block_cache.h"
+
 namespace ROCKSDB_NAMESPACE {
 
 Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
     const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
@@ -25,7 +27,7 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
 
   const Status s = table->RetrieveBlock(
       prefetch_buffer, read_options, rep->footer.index_handle(),
-      UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex,
+      UncompressionDict::GetEmptyDict(), &index_block->As<Block_kIndex>(),
       get_context, lookup_context, /* for_compaction */ false, use_cache,
       /* wait_for_cache */ true, /* async_read */ false);
 
diff --git a/table/block_based/index_reader_common.h b/table/block_based/index_reader_common.h
index a0f268ad8f9..5627b0eeb37 100644
--- a/table/block_based/index_reader_common.h
+++ b/table/block_based/index_reader_common.h
@@ -9,7 +9,6 @@
 #pragma once
 
 #include "table/block_based/block_based_table_reader.h"
-
 #include "table/block_based/reader_common.h"
 
 namespace ROCKSDB_NAMESPACE {
diff --git a/table/block_based/parsed_full_filter_block.h b/table/block_based/parsed_full_filter_block.h
index 95d7b520871..8d81868d1bc 100644
--- a/table/block_based/parsed_full_filter_block.h
+++ b/table/block_based/parsed_full_filter_block.h
@@ -7,6 +7,7 @@
 
 #include <memory>
 
+#include "table/block_based/block_type.h"
 #include "table/format.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -32,7 +33,11 @@ class ParsedFullFilterBlock {
 
   bool own_bytes() const { return block_contents_.own_bytes(); }
 
-  const Slice GetBlockContentsData() const { return block_contents_.data; }
+  // For TypedCacheInterface
+  const Slice& ContentSlice() const { return block_contents_.data; }
+  static constexpr CacheEntryRole kCacheEntryRole =
+      CacheEntryRole::kFilterBlock;
+  static constexpr BlockType kBlockType = BlockType::kFilter;
 
  private:
   BlockContents block_contents_;
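The kCacheEntryRole/kBlockType constants introduced above let a templated cache interface derive per-type metadata from the value type itself, instead of threading a BlockType argument through every call site. A minimal sketch, assuming simplified enums and a hypothetical TypedCacheInterface shape (the real one lives in typed_cache.h):

    #include <cstdio>

    enum class CacheEntryRole { kFilterBlock, kOtherBlock };
    enum class BlockType { kFilter, kData };

    struct ParsedFilter {
      static constexpr CacheEntryRole kCacheEntryRole =
          CacheEntryRole::kFilterBlock;
      static constexpr BlockType kBlockType = BlockType::kFilter;
    };

    template <typename TBlocklike>
    struct TypedCacheInterface {
      // No role/type parameters needed; they come from the value type.
      static constexpr CacheEntryRole kRole = TBlocklike::kCacheEntryRole;
      static constexpr BlockType kType = TBlocklike::kBlockType;
    };

    int main() {
      using FilterCache = TypedCacheInterface<ParsedFilter>;
      std::printf("%d %d\n", static_cast<int>(FilterCache::kRole),
                  static_cast<int>(FilterCache::kType));
      return 0;
    }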
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index af30925b73a..092446f022d 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -7,6 +7,7 @@
 
 #include <utility>
 
+#include "block_cache.h"
 #include "block_type.h"
 #include "file/random_access_file_reader.h"
 #include "logging/logging.h"
@@ -185,7 +186,8 @@ Slice PartitionedFilterBlockBuilder::Finish(
 }
 
 PartitionedFilterBlockReader::PartitionedFilterBlockReader(
-    const BlockBasedTable* t, CachableEntry<Block>&& filter_block)
+    const BlockBasedTable* t,
+    CachableEntry<Block_kFilterPartitionIndex>&& filter_block)
     : FilterBlockReaderCommon(t, std::move(filter_block)) {}
 
 std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create(
@@ -196,11 +198,11 @@ std::unique_ptr<FilterBlockReader> PartitionedFilterBlockReader::Create(
   assert(table->get_rep());
   assert(!pin || prefetch);
 
-  CachableEntry<Block> filter_block;
+  CachableEntry<Block_kFilterPartitionIndex> filter_block;
   if (prefetch || !use_cache) {
-    const Status s = ReadFilterBlock(
-        table, prefetch_buffer, ro, use_cache, nullptr /* get_context */,
-        lookup_context, &filter_block, BlockType::kFilterPartitionIndex);
+    const Status s = ReadFilterBlock(table, prefetch_buffer, ro, use_cache,
+                                     nullptr /* get_context */, lookup_context,
+                                     &filter_block);
     if (!s.ok()) {
       IGNORE_STATUS_IF_ERROR(s);
       return std::unique_ptr<FilterBlockReader>();
@@ -260,7 +262,8 @@ void PartitionedFilterBlockReader::PrefixesMayMatch(
 }
 
 BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle(
-    const CachableEntry<Block>& filter_block, const Slice& entry) const {
+    const CachableEntry<Block_kFilterPartitionIndex>& filter_block,
+    const Slice& entry) const {
   IndexBlockIter iter;
   const InternalKeyComparator* const comparator = internal_comparator();
   Statistics* kNullStats = nullptr;
@@ -313,7 +316,7 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
   const Status s =
       table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle,
                              UncompressionDict::GetEmptyDict(), filter_block,
-                             BlockType::kFilter, get_context, lookup_context,
+                             get_context, lookup_context,
                              /* for_compaction */ false, /* use_cache */ true,
                              /* wait_for_cache */ true, /* async_read */ false);
 
@@ -325,10 +328,9 @@ bool PartitionedFilterBlockReader::MayMatch(
     GetContext* get_context, BlockCacheLookupContext* lookup_context,
     Env::IOPriority rate_limiter_priority,
    FilterFunction filter_function) const {
-  CachableEntry<Block> filter_block;
-  Status s = GetOrReadFilterBlock(
-      no_io, get_context, lookup_context, &filter_block,
-      BlockType::kFilterPartitionIndex, rate_limiter_priority);
+  CachableEntry<Block_kFilterPartitionIndex> filter_block;
+  Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context,
+                                  &filter_block, rate_limiter_priority);
   if (UNLIKELY(!s.ok())) {
     IGNORE_STATUS_IF_ERROR(s);
     return true;
@@ -364,10 +366,10 @@ void PartitionedFilterBlockReader::MayMatch(
     BlockCacheLookupContext* lookup_context,
     Env::IOPriority rate_limiter_priority,
     FilterManyFunction filter_function) const {
-  CachableEntry<Block> filter_block;
-  Status s = GetOrReadFilterBlock(
-      no_io, range->begin()->get_context, lookup_context, &filter_block,
-      BlockType::kFilterPartitionIndex, rate_limiter_priority);
+  CachableEntry<Block_kFilterPartitionIndex> filter_block;
+  Status s =
+      GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context,
+                           &filter_block, rate_limiter_priority);
   if (UNLIKELY(!s.ok())) {
     IGNORE_STATUS_IF_ERROR(s);
     return;  // Any/all may match
@@ -455,11 +457,10 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
 
   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
 
-  CachableEntry<Block> filter_block;
+  CachableEntry<Block_kFilterPartitionIndex> filter_block;
   Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */,
                                   &lookup_context, &filter_block,
-                                  BlockType::kFilterPartitionIndex,
                                   ro.rate_limiter_priority);
   if (!s.ok()) {
     ROCKS_LOG_ERROR(rep->ioptions.logger,
@@ -517,7 +518,7 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro,
     // filter blocks
     s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
-        /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter,
+        /* wait */ true, /* for_compaction */ false, &block,
         nullptr /* get_context */, &lookup_context, nullptr /* contents */,
         false);
     if (!s.ok()) {
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 955b5073963..e810c01eeb3 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -10,6 +10,7 @@
 #include <string>
 #include <unordered_map>
 
+#include "block_cache.h"
 #include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
@@ -99,10 +100,12 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   BlockHandle last_encoded_handle_;
 };
 
-class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
+class PartitionedFilterBlockReader
+    : public FilterBlockReaderCommon<Block_kFilterPartitionIndex> {
  public:
-  PartitionedFilterBlockReader(const BlockBasedTable* t,
-                               CachableEntry<Block>&& filter_block);
+  PartitionedFilterBlockReader(
+      const BlockBasedTable* t,
+      CachableEntry<Block_kFilterPartitionIndex>&& filter_block);
 
   static std::unique_ptr<FilterBlockReader> Create(
       const BlockBasedTable* table, const ReadOptions& ro,
@@ -131,8 +134,9 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon<Block> {
   size_t ApproximateMemoryUsage() const override;
 
  private:
-  BlockHandle GetFilterPartitionHandle(const CachableEntry<Block>& filter_block,
-                                       const Slice& entry) const;
+  BlockHandle GetFilterPartitionHandle(
+      const CachableEntry<Block_kFilterPartitionIndex>& filter_block,
+      const Slice& entry) const;
   Status GetFilterPartitionBlock(
       FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle,
       bool no_io, GetContext* get_context,
diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc
index 21e07336503..59445c45e0c 100644
--- a/table/block_based/partitioned_filter_block_test.cc
+++ b/table/block_based/partitioned_filter_block_test.cc
@@ -7,6 +7,7 @@
 
 #include <map>
 
+#include "block_cache.h"
 #include "index_builder.h"
 #include "rocksdb/filter_policy.h"
 #include "table/block_based/block_based_table_reader.h"
@@ -35,7 +36,8 @@ class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader {
  public:
   MyPartitionedFilterBlockReader(BlockBasedTable* t,
                                  CachableEntry<Block>&& filter_block)
-      : PartitionedFilterBlockReader(t, std::move(filter_block)) {
+      : PartitionedFilterBlockReader(
+            t, std::move(filter_block.As<Block_kFilterPartitionIndex>())) {
     for (const auto& pair : blooms) {
      const uint64_t offset = pair.first;
       const std::string& bloom = pair.second;
@@ -86,7 +88,8 @@ class PartitionedFilterBlockTest
     int num_keys = sizeof(keys) / sizeof(*keys);
     uint64_t max_key_size = 0;
     for (int i = 1; i < num_keys; i++) {
-      max_key_size = std::max(max_key_size, static_cast<uint64_t>(keys[i].size()));
+      max_key_size =
+          std::max(max_key_size, static_cast<uint64_t>(keys[i].size()));
     }
     uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/);
     return max_index_size;
@@ -116,11 +119,11 @@ class PartitionedFilterBlockTest
       PartitionedIndexBuilder* const p_index_builder,
       const SliceTransform* prefix_extractor = nullptr) {
     assert(table_options_.block_size_deviation <= 100);
-    auto partition_size = static_cast<uint32_t>(
-        ((table_options_.metadata_block_size *
-          (100 - table_options_.block_size_deviation)) +
-         99) /
-        100);
+    auto partition_size =
+        static_cast<uint32_t>(((table_options_.metadata_block_size *
+                                (100 - table_options_.block_size_deviation)) +
+                               99) /
+                              100);
     partition_size = std::max(partition_size, static_cast<uint32_t>(1));
     const bool kValueDeltaEncoded = true;
     return new PartitionedFilterBlockBuilder(
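A worked example of the partition-size arithmetic in the test above: adding 99 before the integer division by 100 turns it into a ceiling of the percentage computation, with no floating point involved.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Example values; the real ones come from BlockBasedTableOptions.
      uint64_t metadata_block_size = 4096;
      uint64_t block_size_deviation = 10;  // percent
      auto partition_size = static_cast<uint32_t>(
          ((metadata_block_size * (100 - block_size_deviation)) + 99) / 100);
      // 4096 * 90 = 368640; (368640 + 99) / 100 = 3687, i.e. ceil(3686.4).
      std::printf("%u\n", partition_size);  // 3687
      return 0;
    }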
diff --git a/table/block_based/partitioned_index_iterator.h b/table/block_based/partitioned_index_iterator.h
index 6532edc4b76..6412fe2399b 100644
--- a/table/block_based/partitioned_index_iterator.h
+++ b/table/block_based/partitioned_index_iterator.h
@@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once
 #include "table/block_based/block_based_table_reader.h"
-
 #include "table/block_based/block_based_table_reader_impl.h"
 #include "table/block_based/block_prefetcher.h"
 #include "table/block_based/reader_common.h"
diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc
index 017ea4a3a2a..705223c90ac 100644
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "table/block_based/partitioned_index_reader.h"
 
+#include "block_cache.h"
 #include "file/random_access_file_reader.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/partitioned_index_iterator.h"
@@ -186,7 +187,7 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro,
     // filter blocks
     Status s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(),
-        /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex,
+        /*wait=*/true, /*for_compaction=*/false, &block.As<Block_kIndex>(),
         /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr,
         /*async_read=*/false);
 
diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc
index dc9a47ec73f..7b0b7c94352 100644
--- a/table/block_based/uncompression_dict_reader.cc
+++ b/table/block_based/uncompression_dict_reader.cc
@@ -60,8 +60,8 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
 
   const Status s = table->RetrieveBlock(
       prefetch_buffer, read_options, rep->compression_dict_handle,
-      UncompressionDict::GetEmptyDict(), uncompression_dict,
-      BlockType::kCompressionDictionary, get_context, lookup_context,
+      UncompressionDict::GetEmptyDict(), uncompression_dict, get_context,
+      lookup_context,
       /* for_compaction */ false, use_cache, /* wait_for_cache */ true,
       /* async_read */ false);
 
diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h
index daac82cfbcb..416d25e2d96 100644
--- a/table/block_based/uncompression_dict_reader.h
+++ b/table/block_based/uncompression_dict_reader.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <cassert>
+
 #include "table/block_based/cachable_entry.h"
 #include "table/format.h"
 
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index ac0f005701a..8df0850b3df 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -147,12 +147,11 @@ inline void BlockFetcher::PrepareBufferForBlockFromFile() {
     // file reader that does not implement mmap reads properly.
     used_buf_ = &stack_buf_[0];
   } else if (maybe_compressed_ && !do_uncompress_) {
-    compressed_buf_ = AllocateBlock(block_size_with_trailer_,
-                                    memory_allocator_compressed_);
+    compressed_buf_ =
+        AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_);
     used_buf_ = compressed_buf_.get();
   } else {
-    heap_buf_ =
-        AllocateBlock(block_size_with_trailer_, memory_allocator_);
+    heap_buf_ = AllocateBlock(block_size_with_trailer_, memory_allocator_);
     used_buf_ = heap_buf_.get();
   }
 }
@@ -187,8 +186,8 @@ inline void BlockFetcher::CopyBufferToHeapBuf() {
 
 inline void BlockFetcher::CopyBufferToCompressedBuf() {
   assert(used_buf_ != compressed_buf_.get());
-  compressed_buf_ = AllocateBlock(block_size_with_trailer_,
-                                  memory_allocator_compressed_);
+  compressed_buf_ =
+      AllocateBlock(block_size_with_trailer_, memory_allocator_compressed_);
   memcpy(compressed_buf_.get(), used_buf_, block_size_with_trailer_);
 #ifndef NDEBUG
   num_compressed_buf_memcpy_++;
diff --git a/table/block_fetcher.h b/table/block_fetcher.h
index 4871e81e8c6..72adced30e3 100644
--- a/table/block_fetcher.h
+++ b/table/block_fetcher.h
@@ -19,8 +19,8 @@ namespace ROCKSDB_NAMESPACE {
 // Retrieves a single block of a given file. Utilizes the prefetch buffer and/or
 // persistent cache provided (if any) to try to avoid reading from the file
 // directly. Note that both the prefetch buffer and the persistent cache are
-// optional; also, note that the persistent cache may be configured to store either
-// compressed or uncompressed blocks.
+// optional; also, note that the persistent cache may be configured to store
+// either compressed or uncompressed blocks.
 //
 // If the retrieved block is compressed and the do_uncompress flag is set,
 // BlockFetcher uncompresses the block (using the uncompression dictionary,
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 82caee28257..f87b23c3a4c 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -282,9 +282,9 @@ class BlockFetcherTest : public testing::Test {
     uint64_t file_size = 0;
     ASSERT_OK(env_->GetFileSize(file->file_name(), &file_size));
     IOOptions opts;
-    ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */,
-                                 file_size, footer,
-                                 kBlockBasedTableMagicNumber));
+    ASSERT_OK(ReadFooterFromFile(opts, file, *fs_,
+                                 nullptr /* prefetch_buffer */, file_size,
+                                 footer, kBlockBasedTableMagicNumber));
   }
 
   // NOTE: compression_type returns the compression type of the fetched block
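A guess at the intent of PrepareBufferForBlockFromFile above, restated as a tiny decision function; the threshold constant and the exact mmap-related conditions are simplifications of the real logic:

    #include <cstddef>
    #include <cstdio>

    constexpr size_t kStackBufSize = 2048;  // hypothetical threshold

    enum class Buf { kStack, kCompressedHeap, kHeap };

    Buf ChooseBuffer(size_t block_size_with_trailer, bool maybe_compressed,
                     bool do_uncompress) {
      if (block_size_with_trailer <= kStackBufSize) {
        return Buf::kStack;  // small block: no allocation needed
      } else if (maybe_compressed && !do_uncompress) {
        return Buf::kCompressedHeap;  // caller keeps the raw compressed bytes
      } else {
        return Buf::kHeap;  // block is (or will be) uncompressed
      }
    }

    int main() {
      std::printf("%d\n", static_cast<int>(ChooseBuffer(1024, true, true)));
      std::printf("%d\n", static_cast<int>(ChooseBuffer(8192, true, false)));
      std::printf("%d\n", static_cast<int>(ChooseBuffer(8192, false, true)));
      return 0;
    }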
"rocksdb.cuckoo.hash.cuckooblocksize"; + "rocksdb.cuckoo.hash.cuckooblocksize"; const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = - "rocksdb.cuckoo.hash.identityfirst"; + "rocksdb.cuckoo.hash.identityfirst"; const std::string CuckooTablePropertyNames::kUseModuleHash = - "rocksdb.cuckoo.hash.usemodule"; + "rocksdb.cuckoo.hash.usemodule"; const std::string CuckooTablePropertyNames::kUserKeyLength = - "rocksdb.cuckoo.hash.userkeylength"; + "rocksdb.cuckoo.hash.userkeylength"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; @@ -174,9 +175,12 @@ bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { Slice CuckooTableBuilder::GetKey(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - return Slice(&deleted_keys_[static_cast((idx - num_values_) * key_size_)], static_cast(key_size_)); + return Slice( + &deleted_keys_[static_cast((idx - num_values_) * key_size_)], + static_cast(key_size_)); } - return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_))], static_cast(key_size_)); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_))], + static_cast(key_size_)); } Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { @@ -190,11 +194,14 @@ Slice CuckooTableBuilder::GetValue(uint64_t idx) const { static std::string empty_value(static_cast(value_size_), 'a'); return Slice(empty_value); } - return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_) + key_size_)], static_cast(value_size_)); + return Slice( + &kvs_[static_cast(idx * (key_size_ + value_size_) + key_size_)], + static_cast(value_size_)); } Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - buckets->resize(static_cast(hash_table_size_ + cuckoo_block_size_ - 1)); + buckets->resize( + static_cast(hash_table_size_ + cuckoo_block_size_ - 1)); uint32_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { uint64_t bucket_id = 0; @@ -202,29 +209,33 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { autovector hash_vals; Slice user_key = GetUserKey(vector_idx); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; - ++hash_cnt) { - uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_, - hash_table_size_, identity_as_first_hash_, get_slice_hash_); + ++hash_cnt) { + uint64_t hash_val = + CuckooHash(user_key, hash_cnt, use_module_hash_, hash_table_size_, + identity_as_first_hash_, get_slice_hash_); // If there is a collision, check next cuckoo_block_size_ locations for // empty locations. While checking, if we reach end of the hash table, // stop searching and proceed for next hash function. 
       for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
-          ++block_idx, ++hash_val) {
-        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
+           ++block_idx, ++hash_val) {
+        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+            kMaxVectorIdx) {
           bucket_id = hash_val;
           bucket_found = true;
           break;
         } else {
-          if (ucomp_->Compare(user_key,
-                GetUserKey((*buckets)[static_cast<size_t>(hash_val)].vector_idx)) == 0) {
+          if (ucomp_->Compare(
+                  user_key, GetUserKey((*buckets)[static_cast<size_t>(hash_val)]
+                                           .vector_idx)) == 0) {
            return Status::NotSupported("Same key is being inserted again.");
           }
           hash_vals.push_back(hash_val);
         }
       }
     }
-    while (!bucket_found && !MakeSpaceForKey(hash_vals,
-          ++make_space_for_key_call_id, buckets, &bucket_id)) {
+    while (!bucket_found &&
+           !MakeSpaceForKey(hash_vals, ++make_space_for_key_call_id, buckets,
+                            &bucket_id)) {
       // Rehash by increashing number of hash tables.
       if (num_hash_func_ >= max_num_hash_func_) {
         return Status::NotSupported("Too many collisions. Unable to hash.");
@@ -232,11 +243,13 @@ Status CuckooTableBuilder::MakeHashTable(std::vector<CuckooBucket>* buckets) {
       // We don't really need to rehash the entire table because old hashes are
       // still valid and we only increased the number of hash functions.
       uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_,
-          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
+                                     hash_table_size_, identity_as_first_hash_,
+                                     get_slice_hash_);
       ++num_hash_func_;
       for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
-          ++block_idx, ++hash_val) {
-        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx == kMaxVectorIdx) {
+           ++block_idx, ++hash_val) {
+        if ((*buckets)[static_cast<size_t>(hash_val)].vector_idx ==
+            kMaxVectorIdx) {
           bucket_found = true;
           bucket_id = hash_val;
           break;
@@ -259,7 +272,7 @@ Status CuckooTableBuilder::Finish() {
   // Calculate the real hash size if module hash is enabled.
   if (use_module_hash_) {
     hash_table_size_ =
-      static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
+        static_cast<uint64_t>(num_entries_ / max_hash_table_ratio_);
   }
   status_ = MakeHashTable(&buckets);
   if (!status_.ok()) {
@@ -300,9 +313,8 @@ Status CuckooTableBuilder::Finish() {
   properties_.num_entries = num_entries_;
   properties_.num_deletions = num_entries_ - num_values_;
   properties_.fixed_key_len = key_size_;
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kValueLength].assign(
-      reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kValueLength]
+      .assign(reinterpret_cast<const char*>(&value_size_), sizeof(value_size_));
 
   uint64_t bucket_size = key_size_ + value_size_;
   unused_bucket.resize(static_cast<size_t>(bucket_size), 'a');
@@ -332,37 +344,35 @@ Status CuckooTableBuilder::Finish() {
   uint64_t offset = buckets.size() * bucket_size;
   properties_.data_size = offset;
   unused_bucket.resize(static_cast<size_t>(properties_.fixed_key_len));
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kEmptyKey] = unused_bucket;
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kNumHashFunc].assign(
-      reinterpret_cast<const char*>(&num_hash_func_), sizeof(num_hash_func_));
-
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kHashTableSize].assign(
-      reinterpret_cast<const char*>(&hash_table_size_),
-      sizeof(hash_table_size_));
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kIsLastLevel].assign(
-      reinterpret_cast<const char*>(&is_last_level_file_),
-      sizeof(is_last_level_file_));
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kCuckooBlockSize].assign(
-      reinterpret_cast<const char*>(&cuckoo_block_size_),
-      sizeof(cuckoo_block_size_));
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kIdentityAsFirstHash].assign(
-      reinterpret_cast<const char*>(&identity_as_first_hash_),
-      sizeof(identity_as_first_hash_));
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kUseModuleHash].assign(
-      reinterpret_cast<const char*>(&use_module_hash_),
-      sizeof(use_module_hash_));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kEmptyKey] =
+      unused_bucket;
+  properties_.user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+      .assign(reinterpret_cast<const char*>(&num_hash_func_),
+              sizeof(num_hash_func_));
+
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kHashTableSize]
+      .assign(reinterpret_cast<const char*>(&hash_table_size_),
+              sizeof(hash_table_size_));
+  properties_.user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+      .assign(reinterpret_cast<const char*>(&is_last_level_file_),
+              sizeof(is_last_level_file_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kCuckooBlockSize]
+      .assign(reinterpret_cast<const char*>(&cuckoo_block_size_),
+              sizeof(cuckoo_block_size_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kIdentityAsFirstHash]
+      .assign(reinterpret_cast<const char*>(&identity_as_first_hash_),
+              sizeof(identity_as_first_hash_));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kUseModuleHash]
+      .assign(reinterpret_cast<const char*>(&use_module_hash_),
+              sizeof(use_module_hash_));
   uint32_t user_key_len = static_cast<uint32_t>(smallest_user_key_.size());
-  properties_.user_collected_properties[
-      CuckooTablePropertyNames::kUserKeyLength].assign(
-      reinterpret_cast<const char*>(&user_key_len),
-      sizeof(user_key_len));
+  properties_
+      .user_collected_properties[CuckooTablePropertyNames::kUserKeyLength]
+      .assign(reinterpret_cast<const char*>(&user_key_len),
+              sizeof(user_key_len));
 
   // Write meta blocks.
   MetaIndexBuilder meta_index_builder;
 
@@ -406,9 +416,7 @@ void CuckooTableBuilder::Abandon() {
   closed_ = true;
 }
 
-uint64_t CuckooTableBuilder::NumEntries() const {
-  return num_entries_;
-}
+uint64_t CuckooTableBuilder::NumEntries() const { return num_entries_; }
 
 uint64_t CuckooTableBuilder::FileSize() const {
   if (closed_) {
@@ -418,8 +426,8 @@ uint64_t CuckooTableBuilder::FileSize() const {
   }
 
   if (use_module_hash_) {
-    return static_cast<uint64_t>((key_size_ + value_size_) *
-        num_entries_ / max_hash_table_ratio_);
+    return static_cast<uint64_t>((key_size_ + value_size_) * num_entries_ /
+                                 max_hash_table_ratio_);
   } else {
     // Account for buckets being a power of two.
     // As elements are added, file size remains constant for a while and
@@ -468,7 +476,8 @@ bool CuckooTableBuilder::MakeSpaceForKey(
   // no. of times this will be called is <= max_num_hash_func_ + num_entries_.
   for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
     uint64_t bid = hash_vals[hash_cnt];
-    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id = make_space_for_key_call_id;
+    (*buckets)[static_cast<size_t>(bid)].make_space_for_key_call_id =
+        make_space_for_key_call_id;
     tree.push_back(CuckooNode(bid, 0, 0));
   }
   bool null_found = false;
@@ -479,24 +488,25 @@ bool CuckooTableBuilder::MakeSpaceForKey(
     if (curr_depth >= max_search_depth_) {
       break;
     }
-    CuckooBucket& curr_bucket = (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
-    for (uint32_t hash_cnt = 0;
-        hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) {
-      uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx),
-          hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_,
-          get_slice_hash_);
+    CuckooBucket& curr_bucket =
+        (*buckets)[static_cast<size_t>(curr_node.bucket_id)];
+    for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found;
+         ++hash_cnt) {
+      uint64_t child_bucket_id = CuckooHash(
+          GetUserKey(curr_bucket.vector_idx), hash_cnt, use_module_hash_,
+          hash_table_size_, identity_as_first_hash_, get_slice_hash_);
       // Iterate inside Cuckoo Block.
       for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
-          ++block_idx, ++child_bucket_id) {
-        if ((*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id ==
-            make_space_for_key_call_id) {
+           ++block_idx, ++child_bucket_id) {
+        if ((*buckets)[static_cast<size_t>(child_bucket_id)]
+                .make_space_for_key_call_id == make_space_for_key_call_id) {
           continue;
         }
-        (*buckets)[static_cast<size_t>(child_bucket_id)].make_space_for_key_call_id =
-          make_space_for_key_call_id;
-        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1,
-            curr_pos));
-        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx == kMaxVectorIdx) {
+        (*buckets)[static_cast<size_t>(child_bucket_id)]
+            .make_space_for_key_call_id = make_space_for_key_call_id;
+        tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos));
+        if ((*buckets)[static_cast<size_t>(child_bucket_id)].vector_idx ==
+            kMaxVectorIdx) {
          null_found = true;
           break;
         }
@@ -515,7 +525,7 @@ bool CuckooTableBuilder::MakeSpaceForKey(
   while (bucket_to_replace_pos >= num_hash_func_) {
     CuckooNode& curr_node = tree[bucket_to_replace_pos];
     (*buckets)[static_cast<size_t>(curr_node.bucket_id)] =
-      (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
+        (*buckets)[static_cast<size_t>(tree[curr_node.parent_pos].bucket_id)];
     bucket_to_replace_pos = curr_node.parent_pos;
   }
   *bucket_id = tree[bucket_to_replace_pos].bucket_id;
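MakeSpaceForKey above is a breadth-first search over buckets reachable through the keys' alternate hashes, followed by a walk back up the parent chain that shifts each displaced entry into the slot freed below it. A minimal sketch with a toy 8-bucket table and a stand-in alternate-hash function (not RocksDB code):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Node {
      uint64_t bucket;
      int parent;  // index into the BFS tree, -1 for roots
    };

    int main() {
      std::vector<int> table(8, -1);  // -1 == empty; values are "keys"
      table[1] = 10;
      table[2] = 20;

      // Insert key 30 whose candidate buckets {1, 2} are both occupied.
      std::vector<Node> tree = {{1, -1}, {2, -1}};
      size_t empty_pos = 0;
      for (size_t pos = 0; pos < tree.size(); ++pos) {
        uint64_t alt = (tree[pos].bucket + 3) % 8;  // stand-in alternate hash
        tree.push_back({alt, static_cast<int>(pos)});
        if (table[alt] == -1) {
          empty_pos = tree.size() - 1;
          break;
        }
      }
      // Walk the parent chain, moving each displaced key into the freed slot.
      size_t cur = empty_pos;
      while (tree[cur].parent != -1) {
        table[tree[cur].bucket] = table[tree[tree[cur].parent].bucket];
        cur = static_cast<size_t>(tree[cur].parent);
      }
      table[tree[cur].bucket] = 30;  // the freed root bucket takes the new key

      for (int v : table) std::printf("%d ", v);
      std::printf("\n");
      return 0;
    }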
diff --git a/table/cuckoo/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h
index 20ed71bfc22..a125e1f4c5e 100644
--- a/table/cuckoo/cuckoo_table_builder.h
+++ b/table/cuckoo/cuckoo_table_builder.h
@@ -6,10 +6,12 @@
 #pragma once
 #ifndef ROCKSDB_LITE
 #include <stdint.h>
+
 #include <limits>
 #include <string>
 #include <utility>
 #include <vector>
+
 #include "db/version_edit.h"
 #include "port/port.h"
 #include "rocksdb/status.h"
@@ -20,7 +22,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-class CuckooTableBuilder: public TableBuilder {
+class CuckooTableBuilder : public TableBuilder {
  public:
   CuckooTableBuilder(
       WritableFileWriter* file, double max_hash_table_ratio,
@@ -78,8 +80,7 @@ class CuckooTableBuilder : public TableBuilder {
 
  private:
   struct CuckooBucket {
-    CuckooBucket()
-        : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
+    CuckooBucket() : vector_idx(kMaxVectorIdx), make_space_for_key_call_id(0) {}
     uint32_t vector_idx;
     // This number will not exceed kvs_.size() + max_num_hash_func_.
     // We assume number of items is <= 2^32.
@@ -125,7 +126,7 @@ class CuckooTableBuilder : public TableBuilder {
   bool use_module_hash_;
   bool identity_as_first_hash_;
   uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
-      uint64_t max_num_buckets);
+                              uint64_t max_num_buckets);
 
   std::string largest_user_key_ = "";
   std::string smallest_user_key_ = "";
diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc
index 07b42cdf8c2..be1c62117da 100644
--- a/table/cuckoo/cuckoo_table_builder_test.cc
+++ b/table/cuckoo/cuckoo_table_builder_test.cc
@@ -42,11 +42,13 @@ class CuckooBuilderTest : public testing::Test {
   }
 
   void CheckFileContents(const std::vector<std::string>& keys,
-      const std::vector<std::string>& values,
-      const std::vector<uint64_t>& expected_locations,
-      std::string expected_unused_bucket, uint64_t expected_table_size,
-      uint32_t expected_num_hash_func, bool expected_is_last_level,
-      uint32_t expected_cuckoo_block_size = 1) {
+                         const std::vector<std::string>& values,
+                         const std::vector<uint64_t>& expected_locations,
+                         std::string expected_unused_bucket,
+                         uint64_t expected_table_size,
+                         uint32_t expected_num_hash_func,
+                         bool expected_is_last_level,
+                         uint32_t expected_cuckoo_block_size = 1) {
     uint64_t num_deletions = 0;
     for (const auto& key : keys) {
       ParsedInternalKey parsed;
@@ -72,39 +74,44 @@ class CuckooBuilderTest : public testing::Test {
     ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size,
                                   kCuckooTableMagicNumber, ioptions, &props));
     // Check unused bucket.
-    std::string unused_key = props->user_collected_properties[
-        CuckooTablePropertyNames::kEmptyKey];
-    ASSERT_EQ(expected_unused_bucket.substr(0,
-        props->fixed_key_len), unused_key);
-
-    uint64_t value_len_found =
-        *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
-            CuckooTablePropertyNames::kValueLength].data());
+    std::string unused_key =
+        props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey];
+    ASSERT_EQ(expected_unused_bucket.substr(0, props->fixed_key_len),
+              unused_key);
+
+    uint64_t value_len_found = *reinterpret_cast<const uint64_t*>(
+        props->user_collected_properties[CuckooTablePropertyNames::kValueLength]
+            .data());
     ASSERT_EQ(values.empty() ? 0 : values[0].size(), value_len_found);
-    ASSERT_EQ(props->raw_value_size, values.size()*value_len_found);
-    const uint64_t table_size =
-        *reinterpret_cast<const uint64_t*>(props->user_collected_properties[
-            CuckooTablePropertyNames::kHashTableSize].data());
+    ASSERT_EQ(props->raw_value_size, values.size() * value_len_found);
+    const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
+        props
+            ->user_collected_properties
+                [CuckooTablePropertyNames::kHashTableSize]
+            .data());
     ASSERT_EQ(expected_table_size, table_size);
-    const uint32_t num_hash_func_found =
-        *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
-            CuckooTablePropertyNames::kNumHashFunc].data());
+    const uint32_t num_hash_func_found = *reinterpret_cast<const uint32_t*>(
+        props->user_collected_properties[CuckooTablePropertyNames::kNumHashFunc]
+            .data());
     ASSERT_EQ(expected_num_hash_func, num_hash_func_found);
-    const uint32_t cuckoo_block_size =
-        *reinterpret_cast<const uint32_t*>(props->user_collected_properties[
-            CuckooTablePropertyNames::kCuckooBlockSize].data());
+    const uint32_t cuckoo_block_size = *reinterpret_cast<const uint32_t*>(
+        props
+            ->user_collected_properties
+                [CuckooTablePropertyNames::kCuckooBlockSize]
+            .data());
     ASSERT_EQ(expected_cuckoo_block_size, cuckoo_block_size);
-    const bool is_last_level_found =
-        *reinterpret_cast<const bool*>(props->user_collected_properties[
-            CuckooTablePropertyNames::kIsLastLevel].data());
+    const bool is_last_level_found = *reinterpret_cast<const bool*>(
+        props->user_collected_properties[CuckooTablePropertyNames::kIsLastLevel]
+            .data());
     ASSERT_EQ(expected_is_last_level, is_last_level_found);
 
     ASSERT_EQ(props->num_entries, keys.size());
     ASSERT_EQ(props->num_deletions, num_deletions);
     ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size());
-    ASSERT_EQ(props->data_size, expected_unused_bucket.size() *
-        (expected_table_size + expected_cuckoo_block_size - 1));
-    ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len);
+    ASSERT_EQ(props->data_size,
+              expected_unused_bucket.size() *
+                  (expected_table_size + expected_cuckoo_block_size - 1));
+    ASSERT_EQ(props->raw_key_size, keys.size() * props->fixed_key_len);
     ASSERT_EQ(props->column_family_id, 0);
     ASSERT_EQ(props->column_family_name, kDefaultColumnFamilyName);
@@ -156,7 +163,6 @@ class CuckooBuilderTest : public testing::Test {
     return NextPowOf2(static_cast<uint64_t>(num / kHashTableRatio));
   }
 
-
   Env* env_;
   FileOptions file_options_;
   std::string fname;
@@ -276,8 +282,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
   expected_unused_bucket += std::string(values[0].size(), 'a');
-  CheckFileContents(keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 4, false);
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 4, false);
 }
 
 TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
@@ -324,8 +330,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
   expected_unused_bucket += std::string(values[0].size(), 'a');
-  CheckFileContents(keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 3, false, cuckoo_block_size);
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 3, false, cuckoo_block_size);
 }
 
 TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
@@ -333,17 +339,14 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
   // Finally insert an element with hash value somewhere in the middle
   // so that it displaces all the elements after that.
   uint32_t num_hash_fun = 2;
-  std::vector<std::string> user_keys = {"key01", "key02", "key03",
-      "key04", "key05"};
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
   // Need to have a temporary variable here as VS compiler does not currently
   // support operator= with initializer_list as a parameter
   std::unordered_map<std::string, std::vector<uint64_t>> hm = {
-      {user_keys[0], {0, 1}},
-      {user_keys[1], {1, 2}},
-      {user_keys[2], {2, 3}},
-      {user_keys[3], {3, 4}},
-      {user_keys[4], {0, 2}},
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}},
   };
   hash_map = std::move(hm);
 
@@ -376,23 +379,20 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
   expected_unused_bucket += std::string(values[0].size(), 'a');
-  CheckFileContents(keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 2, false);
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 2, false);
 }
 
 TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
   uint32_t num_hash_fun = 2;
-  std::vector<std::string> user_keys = {"key01", "key02", "key03",
-      "key04", "key05"};
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
   // Need to have a temporary variable here as VS compiler does not currently
   // support operator= with initializer_list as a parameter
   std::unordered_map<std::string, std::vector<uint64_t>> hm = {
-      {user_keys[0], {0, 1}},
-      {user_keys[1], {1, 2}},
-      {user_keys[2], {3, 4}},
-      {user_keys[3], {4, 5}},
-      {user_keys[4], {0, 3}},
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {3, 4}},
+      {user_keys[3], {4, 5}}, {user_keys[4], {0, 3}},
   };
   hash_map = std::move(hm);
 
@@ -425,8 +425,8 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
   expected_unused_bucket += std::string(values[0].size(), 'a');
-  CheckFileContents(keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 2, false, 2);
+  CheckFileContents(keys, values, expected_locations, expected_unused_bucket,
+                    expected_table_size, 2, false, 2);
 }
 
 TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
@@ -469,7 +469,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
   std::string expected_unused_bucket = "key00";
   expected_unused_bucket += std::string(values[0].size(), 'a');
   CheckFileContents(user_keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 2, true);
+                    expected_unused_bucket, expected_table_size, 2, true);
 }
 
 TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
@@ -513,22 +513,19 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
   std::string expected_unused_bucket = "key00";
   expected_unused_bucket += std::string(values[0].size(), 'a');
   CheckFileContents(user_keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 4, true);
+                    expected_unused_bucket, expected_table_size, 4, true);
 }
 
 TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
   uint32_t num_hash_fun = 2;
-  std::vector<std::string> user_keys = {"key01", "key02", "key03",
-      "key04", "key05"};
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
   // Need to have a temporary variable here as VS compiler does not currently
   // support operator= with initializer_list as a parameter
   std::unordered_map<std::string, std::vector<uint64_t>> hm = {
-      {user_keys[0], {0, 1}},
-      {user_keys[1], {1, 2}},
-      {user_keys[2], {2, 3}},
-      {user_keys[3], {3, 4}},
-      {user_keys[4], {0, 2}},
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}}, {user_keys[4], {0, 2}},
   };
   hash_map = std::move(hm);
 
@@ -559,7 +556,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
   std::string expected_unused_bucket = "key00";
   expected_unused_bucket += std::string(values[0].size(), 'a');
   CheckFileContents(user_keys, values, expected_locations,
-      expected_unused_bucket, expected_table_size, 2, true);
+                    expected_unused_bucket, expected_table_size, 2, true);
 }
 
 TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
@@ -567,16 +564,13 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
   // Finally try inserting an element with hash value somewhere in the middle
   // and it should fail because the no. of elements to displace is too high.
   uint32_t num_hash_fun = 2;
-  std::vector<std::string> user_keys = {"key01", "key02", "key03",
-      "key04", "key05"};
+  std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04",
+                                        "key05"};
   // Need to have a temporary variable here as VS compiler does not currently
   // support operator= with initializer_list as a parameter
   std::unordered_map<std::string, std::vector<uint64_t>> hm = {
-      {user_keys[0], {0, 1}},
-      {user_keys[1], {1, 2}},
-      {user_keys[2], {2, 3}},
-      {user_keys[3], {3, 4}},
-      {user_keys[4], {0, 1}},
+      {user_keys[0], {0, 1}}, {user_keys[1], {1, 2}}, {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}}, {user_keys[4], {0, 1}},
   };
   hash_map = std::move(hm);
 
diff --git a/table/cuckoo/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h
index a51f23e532a..9937c28dd29 100644
--- a/table/cuckoo/cuckoo_table_factory.h
+++ b/table/cuckoo/cuckoo_table_factory.h
@@ -7,9 +7,10 @@
 #ifndef ROCKSDB_LITE
 
 #include <string>
+
+#include "rocksdb/options.h"
 #include "rocksdb/table.h"
 #include "util/murmurhash.h"
-#include "rocksdb/options.h"
 
 namespace ROCKSDB_NAMESPACE {
 
diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc
index 69c93d76e81..1d70909a601 100644
--- a/table/cuckoo/cuckoo_table_reader.cc
+++ b/table/cuckoo/cuckoo_table_reader.cc
@@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE {
 namespace {
 const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
 const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
-}
+}  // namespace
 
 extern const uint64_t kCuckooTableMagicNumber;
 
@@ -87,26 +87,26 @@ CuckooTableReader::CuckooTableReader(
     status_ = Status::Corruption("User key length not found");
     return;
   }
-  user_key_length_ = *reinterpret_cast<const uint32_t*>(
-      user_key_len->second.data());
+  user_key_length_ =
+      *reinterpret_cast<const uint32_t*>(user_key_len->second.data());
 
   auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
   if (value_length == user_props.end()) {
     status_ = Status::Corruption("Value length not found");
     return;
   }
-  value_length_ = *reinterpret_cast<const uint32_t*>(
-      value_length->second.data());
+  value_length_ =
+      *reinterpret_cast<const uint32_t*>(value_length->second.data());
   bucket_length_ = key_length_ + value_length_;
 
-  auto hash_table_size = user_props.find(
-      CuckooTablePropertyNames::kHashTableSize);
+  auto hash_table_size =
+      user_props.find(CuckooTablePropertyNames::kHashTableSize);
   if (hash_table_size == user_props.end()) {
     status_ = Status::Corruption("Hash table size not found");
     return;
   }
-  table_size_ = *reinterpret_cast<const uint64_t*>(
-      hash_table_size->second.data());
+  table_size_ =
*reinterpret_cast(hash_table_size->second.data()); auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); if (is_last_level == user_props.end()) { @@ -115,31 +115,31 @@ CuckooTableReader::CuckooTableReader( } is_last_level_ = *reinterpret_cast(is_last_level->second.data()); - auto identity_as_first_hash = user_props.find( - CuckooTablePropertyNames::kIdentityAsFirstHash); + auto identity_as_first_hash = + user_props.find(CuckooTablePropertyNames::kIdentityAsFirstHash); if (identity_as_first_hash == user_props.end()) { status_ = Status::Corruption("identity as first hash not found"); return; } - identity_as_first_hash_ = *reinterpret_cast( - identity_as_first_hash->second.data()); + identity_as_first_hash_ = + *reinterpret_cast(identity_as_first_hash->second.data()); - auto use_module_hash = user_props.find( - CuckooTablePropertyNames::kUseModuleHash); + auto use_module_hash = + user_props.find(CuckooTablePropertyNames::kUseModuleHash); if (use_module_hash == user_props.end()) { status_ = Status::Corruption("hash type is not found"); return; } - use_module_hash_ = *reinterpret_cast( - use_module_hash->second.data()); - auto cuckoo_block_size = user_props.find( - CuckooTablePropertyNames::kCuckooBlockSize); + use_module_hash_ = + *reinterpret_cast(use_module_hash->second.data()); + auto cuckoo_block_size = + user_props.find(CuckooTablePropertyNames::kCuckooBlockSize); if (cuckoo_block_size == user_props.end()) { status_ = Status::Corruption("Cuckoo block size not found"); return; } - cuckoo_block_size_ = *reinterpret_cast( - cuckoo_block_size->second.data()); + cuckoo_block_size_ = + *reinterpret_cast(cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; // TODO: rate limit reads of whole cuckoo tables. status_ = @@ -154,9 +154,10 @@ Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { - uint64_t offset = bucket_length_ * CuckooHash( - user_key, hash_cnt, use_module_hash_, table_size_, - identity_as_first_hash_, get_slice_hash_); + uint64_t offset = + bucket_length_ * CuckooHash(user_key, hash_cnt, use_module_hash_, + table_size_, identity_as_first_hash_, + get_slice_hash_); const char* bucket = &file_data_.data()[offset]; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, bucket += bucket_length_) { @@ -195,9 +196,10 @@ Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, void CuckooTableReader::Prepare(const Slice& key) { // Prefetch the first Cuckoo Block. 
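This Prepare() hunk is the reader's read-ahead path: it computes the byte address of the first candidate bucket (hash function 0), rounds it down to a cache-line boundary, and prefetches every cache line the cuckoo block may touch. A minimal self-contained sketch of that address arithmetic, assuming a 64-byte cache line and using the GCC/Clang builtin in place of RocksDB's PREFETCH macro:

    #include <cstdint>

    constexpr uint64_t kCacheLineSize = 64;
    constexpr uint64_t kCacheLineMask = ~(kCacheLineSize - 1);

    // Prefetch every cache line overlapping [start, start + len); len must be > 0.
    inline void PrefetchRange(const char* start, uint64_t len) {
      uint64_t addr = reinterpret_cast<uint64_t>(start);
      const uint64_t last_byte = addr + len - 1;
      for (addr &= kCacheLineMask; addr <= last_byte; addr += kCacheLineSize) {
        __builtin_prefetch(reinterpret_cast<const char*>(addr), /*rw=*/0,
                           /*locality=*/3);
      }
    }

In the reader itself the "- 1" is pre-baked into cuckoo_block_bytes_minus_one_, which is why its loop bound looks slightly different from this sketch.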
   Slice user_key = ExtractUserKey(key);
-  uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
-      bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
-                                  identity_as_first_hash_, nullptr);
+  uint64_t addr =
+      reinterpret_cast<uint64_t>(file_data_.data()) +
+      bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
+                                  identity_as_first_hash_, nullptr);
   uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
   for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
     PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
@@ -228,21 +230,22 @@ class CuckooTableIterator : public InternalIterator {
     BucketComparator(const Slice& file_data, const Comparator* ucomp,
                      uint32_t bucket_len, uint32_t user_key_len,
                      const Slice& target = Slice())
-      : file_data_(file_data),
-        ucomp_(ucomp),
-        bucket_len_(bucket_len),
-        user_key_len_(user_key_len),
-        target_(target) {}
+        : file_data_(file_data),
+          ucomp_(ucomp),
+          bucket_len_(bucket_len),
+          user_key_len_(user_key_len),
+          target_(target) {}
     bool operator()(const uint32_t first, const uint32_t second) const {
-      const char* first_bucket =
-        (first == kInvalidIndex) ? target_.data() :
-        &file_data_.data()[first * bucket_len_];
+      const char* first_bucket = (first == kInvalidIndex)
+                                     ? target_.data()
+                                     : &file_data_.data()[first * bucket_len_];
       const char* second_bucket =
-        (second == kInvalidIndex) ? target_.data() :
-        &file_data_.data()[second * bucket_len_];
+          (second == kInvalidIndex) ? target_.data()
+                                    : &file_data_.data()[second * bucket_len_];
       return ucomp_->Compare(Slice(first_bucket, user_key_len_),
                              Slice(second_bucket, user_key_len_)) < 0;
     }
+
    private:
     const Slice file_data_;
     const Comparator* ucomp_;
@@ -264,11 +267,11 @@ class CuckooTableIterator : public InternalIterator {
 };
 
 CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
-  : bucket_comparator_(reader->file_data_, reader->ucomp_,
-                       reader->bucket_length_, reader->user_key_length_),
-    reader_(reader),
-    initialized_(false),
-    curr_key_idx_(kInvalidIndex) {
+    : bucket_comparator_(reader->file_data_, reader->ucomp_,
+                         reader->bucket_length_, reader->user_key_length_),
+      reader_(reader),
+      initialized_(false),
+      curr_key_idx_(kInvalidIndex) {
   sorted_bucket_ids_.clear();
   curr_value_.clear();
   curr_key_.Clear();
@@ -278,7 +281,8 @@ void CuckooTableIterator::InitIfNeeded() {
   if (initialized_) {
     return;
   }
-  sorted_bucket_ids_.reserve(static_cast<size_t>(reader_->GetTableProperties()->num_entries));
+  sorted_bucket_ids_.reserve(
+      static_cast<size_t>(reader_->GetTableProperties()->num_entries));
   uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
   assert(num_buckets < kInvalidIndex);
   const char* bucket = reader_->file_data_.data();
@@ -289,7 +293,7 @@ void CuckooTableIterator::InitIfNeeded() {
     bucket += reader_->bucket_length_;
   }
   assert(sorted_bucket_ids_.size() ==
-      reader_->GetTableProperties()->num_entries);
+         reader_->GetTableProperties()->num_entries);
   std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
             bucket_comparator_);
   curr_key_idx_ = kInvalidIndex;
@@ -311,13 +315,11 @@ void CuckooTableIterator::SeekToLast() {
 void CuckooTableIterator::Seek(const Slice& target) {
   InitIfNeeded();
   const BucketComparator seek_comparator(
-      reader_->file_data_, reader_->ucomp_,
-      reader_->bucket_length_, reader_->user_key_length_,
-      ExtractUserKey(target));
-  auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
-                                  sorted_bucket_ids_.end(),
-                                  kInvalidIndex,
-                                  seek_comparator);
+      reader_->file_data_, reader_->ucomp_, reader_->bucket_length_,
+      reader_->user_key_length_, ExtractUserKey(target));
+  auto seek_it =
+      std::lower_bound(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
+                       kInvalidIndex, seek_comparator);
   curr_key_idx_ =
       static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
   PrepareKVAtCurrIdx();
@@ -339,12 +341,12 @@ void CuckooTableIterator::PrepareKVAtCurrIdx() {
     return;
   }
   uint32_t id = sorted_bucket_ids_[curr_key_idx_];
-  const char* offset = reader_->file_data_.data() +
-      id * reader_->bucket_length_;
+  const char* offset =
+      reader_->file_data_.data() + id * reader_->bucket_length_;
   if (reader_->is_last_level_) {
     // Always return internal key.
-    curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
-                             0, kTypeValue);
+    curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), 0,
+                             kTypeValue);
   } else {
     curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
   }
@@ -388,8 +390,7 @@ InternalIterator* CuckooTableReader::NewIterator(
     const ReadOptions& /*read_options*/,
     const SliceTransform* /* prefix_extractor */, Arena* arena,
     bool /*skip_filters*/, TableReaderCaller /*caller*/,
-    size_t /*compaction_readahead_size*/,
-    bool /* allow_unprepared_value */) {
+    size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
   if (!status().ok()) {
     return NewErrorInternalIterator<Slice>(
         Status::Corruption("CuckooTableReader status is not okay."), arena);
diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h
index fb30e92cc35..f6c599ae808 100644
--- a/table/cuckoo/cuckoo_table_reader.h
+++ b/table/cuckoo/cuckoo_table_reader.h
@@ -9,8 +9,8 @@
 #pragma once
 #ifndef ROCKSDB_LITE
-#include <string>
 #include <memory>
+#include <string>
 #include <utility>
 #include <vector>
 
@@ -25,7 +25,7 @@ class Arena;
 class TableReader;
 struct ImmutableOptions;
 
-class CuckooTableReader: public TableReader {
+class CuckooTableReader : public TableReader {
  public:
   CuckooTableReader(const ImmutableOptions& ioptions,
                     std::unique_ptr<RandomAccessFileReader>&& file,
@@ -93,7 +93,7 @@ class CuckooTableReader: public TableReader {
   uint64_t table_size_;
   const Comparator* ucomp_;
   uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index,
-                             uint64_t max_num_buckets);
+                              uint64_t max_num_buckets);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc
index 6203a42d412..d3d1490c6ef 100644
--- a/table/cuckoo/cuckoo_table_reader_test.cc
+++ b/table/cuckoo/cuckoo_table_reader_test.cc
@@ -33,11 +33,12 @@ int main() {
 using GFLAGS_NAMESPACE::ParseCommandLineFlags;
 
-DEFINE_string(file_dir, "", "Directory where the files will be created"
-              " for benchmark. Added for using tmpfs.");
+DEFINE_string(file_dir, "",
+              "Directory where the files will be created"
+              " for benchmark. Added for using tmpfs.");
 DEFINE_bool(enable_perf, false, "Run Benchmark Tests too.");
 DEFINE_bool(write, false,
-    "Should write new values to file in performance tests?");
+            "Should write new values to file in performance tests?");
 DEFINE_bool(identity_as_first_hash, true, "use identity as first hash");
 
 namespace ROCKSDB_NAMESPACE {
@@ -45,10 +46,10 @@ namespace {
 const uint32_t kNumHashFunc = 10;
 // Methods, variables related to Hash functions.
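The declarations that follow replace the real hash with a deterministic lookup table so the tests can stage exact collisions. A self-contained sketch of the pattern (simplified to std::string keys; the real callback takes a Slice):

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Per-key list of bucket ids, one entry per mock hash function.
    static std::unordered_map<std::string, std::vector<uint64_t>> mock_hash;

    static void AddMockLookups(const std::string& key, uint64_t first_bucket,
                               uint32_t num_hash_fun) {
      std::vector<uint64_t> buckets;
      for (uint32_t i = 0; i < num_hash_fun; ++i) {
        buckets.push_back(first_bucket + i);  // consecutive candidate buckets
      }
      mock_hash[key] = buckets;
    }

    // Same shape as the get_slice_hash callback the reader accepts.
    static uint64_t MockSliceHash(const std::string& key, uint32_t index,
                                  uint64_t max_num_buckets) {
      return mock_hash[key][index] % max_num_buckets;
    }

Handing each key a run of consecutive buckets is what lets tests like CheckIterator assign disjoint, reverse-ordered buckets and lets the builder tests above force precise collision paths.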
-std::unordered_map<std::string, std::vector<uint64_t>> hash_map;
+std::unordered_map<std::string, std::vector<uint64_t> > hash_map;
 
 void AddHashLookups(const std::string& s, uint64_t bucket_id,
-    uint32_t num_hash_fun) {
+                    uint32_t num_hash_fun) {
   std::vector<uint64_t> v;
   for (uint32_t i = 0; i < num_hash_fun; i++) {
     v.push_back(bucket_id + i);
@@ -128,8 +129,8 @@ class CuckooReaderTest : public testing::Test {
   }
   void UpdateKeys(bool with_zero_seqno) {
     for (uint32_t i = 0; i < num_items; i++) {
-      ParsedInternalKey ikey(user_keys[i],
-                             with_zero_seqno ? 0 : i + 1000, kTypeValue);
+      ParsedInternalKey ikey(user_keys[i], with_zero_seqno ? 0 : i + 1000,
+                             kTypeValue);
       keys[i].clear();
       AppendInternalKey(&keys[i], ikey);
     }
@@ -189,11 +190,11 @@ class CuckooReaderTest : public testing::Test {
                                  TableReaderCaller::kUncategorized);
     ASSERT_OK(it->status());
     ASSERT_TRUE(!it->Valid());
-    it->Seek(keys[num_items/2]);
+    it->Seek(keys[num_items / 2]);
     ASSERT_TRUE(it->Valid());
     ASSERT_OK(it->status());
-    ASSERT_TRUE(keys[num_items/2] == it->key());
-    ASSERT_TRUE(values[num_items/2] == it->value());
+    ASSERT_TRUE(keys[num_items / 2] == it->key());
+    ASSERT_TRUE(values[num_items / 2] == it->value());
     ASSERT_OK(it->status());
     it->~InternalIterator();
   }
@@ -273,7 +274,7 @@ TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) {
 }
 
 TEST_F(CuckooReaderTest, CheckIterator) {
-  SetUp(2*kNumHashFunc);
+  SetUp(2 * kNumHashFunc);
   fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
   for (uint64_t i = 0; i < num_items; i++) {
     user_keys[i] = "key" + NumToStr(i);
@@ -281,7 +282,7 @@ TEST_F(CuckooReaderTest, CheckIterator) {
     AppendInternalKey(&keys[i], ikey);
     values[i] = "value" + NumToStr(i);
     // Give disjoint hash values, in reverse order.
-    AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc);
+    AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc);
   }
   CreateCuckooFileAndCheckReader();
   CheckIterator();
@@ -292,7 +293,7 @@ TEST_F(CuckooReaderTest, CheckIterator) {
 }
 
 TEST_F(CuckooReaderTest, CheckIteratorUint64) {
-  SetUp(2*kNumHashFunc);
+  SetUp(2 * kNumHashFunc);
   fname = test::PerThreadDBPath("CuckooReader_CheckIterator");
   for (uint64_t i = 0; i < num_items; i++) {
     user_keys[i].resize(8);
@@ -301,7 +302,7 @@ TEST_F(CuckooReaderTest, CheckIteratorUint64) {
     AppendInternalKey(&keys[i], ikey);
     values[i] = "value" + NumToStr(i);
     // Give disjoint hash values, in reverse order.
-    AddHashLookups(user_keys[i], num_items-i-1, kNumHashFunc);
+    AddHashLookups(user_keys[i], num_items - i - 1, kNumHashFunc);
   }
   CreateCuckooFileAndCheckReader(test::Uint64Comparator());
   CheckIterator(test::Uint64Comparator());
@@ -366,11 +367,11 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) {
 
   // Test read when key is unused key.
   std::string unused_key =
-      reader.GetTableProperties()->user_collected_properties.at(
-      CuckooTablePropertyNames::kEmptyKey);
+      reader.GetTableProperties()->user_collected_properties.at(
+          CuckooTablePropertyNames::kEmptyKey);
   // Add hash values that map to empty buckets.
-  AddHashLookups(ExtractUserKey(unused_key).ToString(),
-      kNumHashFunc, kNumHashFunc);
+  AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc,
+                 kNumHashFunc);
   value.Reset();
   GetContext get_context3(
       ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key),
@@ -407,8 +408,8 @@ std::string GetFileName(uint64_t num) {
 
 // Create last level file as we are interested in measuring performance of
 // last level file only.
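For the benchmark hunks below, two numbers are worth pinning down. ReadKeys prints utilization as num * 100.0 / table_size, and TestReadPerformance picks its item counts against the 128 M buckets noted in its comment, so the advertised utilizations check out as:

    120 * 1024 * 1024 / (128 * 1024 * 1024) = 0.9375    (~0.9)
    100 * 1024 * 1024 / (128 * 1024 * 1024) = 0.78125   (~0.75)
     80 * 1024 * 1024 / (128 * 1024 * 1024) = 0.625     (~0.6)
     70 * 1024 * 1024 / (128 * 1024 * 1024) = 0.546875  (~0.5)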
-void WriteFile(const std::vector<std::string>& keys,
-               const uint64_t num, double hash_ratio) {
+void WriteFile(const std::vector<std::string>& keys, const uint64_t num,
+               double hash_ratio) {
   Options options;
   options.allow_mmap_reads = true;
   const auto& fs = options.env->GetFileSystem();
@@ -478,13 +479,16 @@ void ReadKeys(uint64_t num, uint32_t batch_size) {
                             test::Uint64Comparator(), nullptr);
   ASSERT_OK(reader.status());
   const UserCollectedProperties user_props =
-    reader.GetTableProperties()->user_collected_properties;
+      reader.GetTableProperties()->user_collected_properties;
   const uint32_t num_hash_fun = *reinterpret_cast<const uint32_t*>(
       user_props.at(CuckooTablePropertyNames::kNumHashFunc).data());
   const uint64_t table_size = *reinterpret_cast<const uint64_t*>(
       user_props.at(CuckooTablePropertyNames::kHashTableSize).data());
-  fprintf(stderr, "With %" PRIu64 " items, utilization is %.2f%%, number of"
-      " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun);
+  fprintf(stderr,
+          "With %" PRIu64
+          " items, utilization is %.2f%%, number of"
+          " hash functions: %u.\n",
+          num, num * 100.0 / (table_size), num_hash_fun);
   ReadOptions r_options;
 
   std::vector<uint64_t> keys;
@@ -502,10 +506,10 @@ void ReadKeys(uint64_t num, uint32_t batch_size) {
   uint64_t start_time = env->NowMicros();
   if (batch_size > 0) {
     for (uint64_t i = 0; i < num; i += batch_size) {
-      for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
+      for (uint64_t j = i; j < i + batch_size && j < num; ++j) {
        reader.Prepare(Slice(reinterpret_cast<char*>(&keys[j]), 16));
       }
-      for (uint64_t j = i; j < i+batch_size && j < num; ++j) {
+      for (uint64_t j = i; j < i + batch_size && j < num; ++j) {
         reader.Get(r_options, Slice(reinterpret_cast<char*>(&keys[j]), 16),
                    &get_context, nullptr);
       }
@@ -518,8 +522,8 @@ void ReadKeys(uint64_t num, uint32_t batch_size) {
   }
   float time_per_op = (env->NowMicros() - start_time) * 1.0f / num;
   fprintf(stderr,
-      "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n",
-      time_per_op, 1.0 / time_per_op, batch_size);
+          "Time taken per op is %.3fus (%.1f Mqps) with batch size of %u\n",
+          time_per_op, 1.0 / time_per_op, batch_size);
 }
 }  // namespace.
 
@@ -531,10 +535,11 @@ TEST_F(CuckooReaderTest, TestReadPerformance) {
   // These numbers are chosen to have a hash utilization % close to
   // 0.9, 0.75, 0.6 and 0.5 respectively.
   // They all create 128 M buckets.
-  std::vector<uint64_t> nums = {120*1024*1024, 100*1024*1024, 80*1024*1024,
-    70*1024*1024};
+  std::vector<uint64_t> nums = {120 * 1024 * 1024, 100 * 1024 * 1024,
+                                80 * 1024 * 1024, 70 * 1024 * 1024};
 #ifndef NDEBUG
-  fprintf(stdout,
+  fprintf(
+      stdout,
       "WARNING: Not compiled with DNDEBUG.
Performance tests may be slow.\n"); #endif for (uint64_t num : nums) { diff --git a/table/format.cc b/table/format.cc index efde5e16903..d3347cdb8c1 100644 --- a/table/format.cc +++ b/table/format.cc @@ -264,7 +264,8 @@ void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, } } -Status Footer::DecodeFrom(Slice input, uint64_t input_offset) { +Status Footer::DecodeFrom(Slice input, uint64_t input_offset, + uint64_t enforce_table_magic_number) { (void)input_offset; // Future use // Only decode to unused Footer @@ -280,6 +281,11 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset) { if (legacy) { magic = UpconvertLegacyFooterFormat(magic); } + if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) { + return Status::Corruption("Bad table magic number: expected " + + std::to_string(enforce_table_magic_number) + + ", found " + std::to_string(magic)); + } table_magic_number_ = magic; block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic); @@ -346,7 +352,7 @@ std::string Footer::ToString() const { } Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, + FileSystem& fs, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { @@ -390,29 +396,27 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, // Check that we actually read the whole footer from the file. It may be // that size isn't correct. if (footer_input.size() < Footer::kMinEncodedLength) { - // FIXME: this error message is bad. We should be checking whether the - // provided file_size matches what's on disk, at least in this case. - // Unfortunately FileSystem/Env does not provide a way to get the size - // of an open file, so getting file size requires a full path seek. - return Status::Corruption("file is too short (" + - std::to_string(file_size) + - " bytes) to be an " - "sstable" + - file->file_name()); + uint64_t size_on_disk = 0; + if (fs.GetFileSize(file->file_name(), IOOptions(), &size_on_disk, nullptr) + .ok()) { + // Similar to CheckConsistency message, but not completely sure the + // expected size always came from manifest. + return Status::Corruption("Sst file size mismatch: " + file->file_name() + + ". Expected " + std::to_string(file_size) + + ", actual size " + + std::to_string(size_on_disk) + "\n"); + } else { + return Status::Corruption( + "Missing SST footer data in file " + file->file_name() + + " File too short? Expected size: " + std::to_string(file_size)); + } } - s = footer->DecodeFrom(footer_input, read_offset); + s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number); if (!s.ok()) { + s = Status::CopyAppendMessage(s, " in ", file->file_name()); return s; } - if (enforce_table_magic_number != 0 && - enforce_table_magic_number != footer->table_magic_number()) { - return Status::Corruption("Bad table magic number: expected " + - std::to_string(enforce_table_magic_number) + - ", found " + - std::to_string(footer->table_magic_number()) + - " in " + file->file_name()); - } return Status::OK(); } diff --git a/table/format.h b/table/format.h index ffb9fb0ca43..71d3706c42d 100644 --- a/table/format.h +++ b/table/format.h @@ -138,7 +138,10 @@ class Footer { // Deserialize a footer (populate fields) from `input` and check for various // corruptions. `input_offset` is the offset within the target file of // `input` buffer (future use). 
- Status DecodeFrom(Slice input, uint64_t input_offset); + // If enforce_table_magic_number != 0, will return corruption if table magic + // number is not equal to enforce_table_magic_number. + Status DecodeFrom(Slice input, uint64_t input_offset, + uint64_t enforce_table_magic_number = 0); // Table magic number identifies file as RocksDB SST file and which kind of // SST format is use. @@ -238,7 +241,7 @@ class FooterBuilder { // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, + FileSystem& fs, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); @@ -273,7 +276,7 @@ uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data, // decompression function. // * "Parsed block" - an in-memory form of a block in block cache, as it is // used by the table reader. Different C++ types are used depending on the -// block type (see block_like_traits.h). Only trivially parsable block types +// block type (see block_cache.h). Only trivially parsable block types // use BlockContents as the parsed form. // struct BlockContents { diff --git a/table/get_context.cc b/table/get_context.cc index fca809cc3bc..2b5a7ae6596 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -351,9 +351,17 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, Slice blob_value(pin_val); push_operand(blob_value, nullptr); } else if (type == kTypeWideColumnEntity) { - // TODO: support wide-column entities - state_ = kUnexpectedWideColumnEntity; - return false; + Slice value_copy = value; + Slice value_of_default; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_of_default) + .ok()) { + state_ = kCorrupt; + return false; + } + + push_operand(value_of_default, value_pinner); } else { assert(type == kTypeValue); push_operand(value, value_pinner); @@ -377,9 +385,26 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, push_operand(blob_value, nullptr); } } else if (type == kTypeWideColumnEntity) { - // TODO: support wide-column entities - state_ = kUnexpectedWideColumnEntity; - return false; + state_ = kFound; + + if (do_merge_) { + MergeWithEntity(value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + Slice value_copy = value; + Slice value_of_default; + + if (!WideColumnSerialization::GetValueOfDefaultColumn( + value_copy, value_of_default) + .ok()) { + state_ = kCorrupt; + return false; + } + + push_operand(value_of_default, value_pinner); + } } else { assert(type == kTypeValue); @@ -407,7 +432,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kDeleted; } else if (kMerge == state_) { state_ = kFound; - Merge(nullptr); + if (do_merge_) { + Merge(nullptr); + } // If do_merge_ = false then the current value shouldn't be part of // merge_context_->operand_list } @@ -438,16 +465,89 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } void GetContext::Merge(const Slice* value) { + assert(do_merge_); + assert(!pinnable_val_ || !columns_); + + std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. 
+ const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), &result, + logger_, statistics_, clock_, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + if (LIKELY(pinnable_val_ != nullptr)) { - if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, value, merge_context_->GetOperands(), - pinnable_val_->GetSelf(), logger_, statistics_, clock_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { + *(pinnable_val_->GetSelf()) = std::move(result); + pinnable_val_->PinSelf(); + return; + } + + assert(columns_); + columns_->SetPlainValue(result); +} + +void GetContext::MergeWithEntity(Slice entity) { + assert(do_merge_); + assert(!pinnable_val_ || !columns_); + + if (LIKELY(pinnable_val_ != nullptr)) { + Slice value_of_default; + + { + const Status s = WideColumnSerialization::GetValueOfDefaultColumn( + entity, value_of_default); + if (!s.ok()) { state_ = kCorrupt; + return; } } + + { + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, &value_of_default, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), logger_, + statistics_, clock_, /* result_operand */ nullptr, + /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } + + pinnable_val_->PinSelf(); + return; + } + + std::string result; + + { + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. 
+ const Status s = MergeHelper::TimedFullMergeWithEntity( + merge_operator_, user_key_, entity, merge_context_->GetOperands(), + &result, logger_, statistics_, clock_, /* update_num_ops_stats */ true, + /* op_failure_scope */ nullptr); + if (!s.ok()) { + state_ = kCorrupt; + return; + } + } + + { + assert(columns_); + const Status s = columns_->SetWideColumnValue(result); + if (!s.ok()) { + state_ = kCorrupt; + return; + } } } @@ -472,6 +572,7 @@ bool GetContext::GetBlobValue(const Slice& blob_index, } void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { + // TODO(yanqin) preserve timestamps information in merge_context if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && value_pinner != nullptr) { value_pinner->DelegateCleanupsTo(pinned_iters_mgr()); diff --git a/table/get_context.h b/table/get_context.h index 57f8b7eea0c..dcc7ab8d60a 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -75,8 +75,6 @@ class GetContext { kCorrupt, kMerge, // saver contains the current merge result (the operands) kUnexpectedBlobIndex, - // TODO: remove once wide-column entities are supported by Get/MultiGet - kUnexpectedWideColumnEntity, }; GetContextStats get_context_stats_; @@ -185,6 +183,7 @@ class GetContext { private: void Merge(const Slice* value); + void MergeWithEntity(Slice entity); bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); const Comparator* ucmp_; diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 945dec80695..8015ed63511 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -203,8 +203,6 @@ class InternalIteratorBase : public Cleanable { Prev(); } } - - bool is_mutable_; }; using InternalIterator = InternalIteratorBase; diff --git a/table/iter_heap.h b/table/iter_heap.h index f6812fa03a0..6ad94be9b69 100644 --- a/table/iter_heap.h +++ b/table/iter_heap.h @@ -21,6 +21,7 @@ class MaxIteratorComparator { bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { return comparator_->Compare(a->key(), b->key()) < 0; } + private: const InternalKeyComparator* comparator_; }; @@ -35,6 +36,7 @@ class MinIteratorComparator { bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { return comparator_->Compare(a->key(), b->key()) > 0; } + private: const InternalKeyComparator* comparator_; }; diff --git a/table/iterator.cc b/table/iterator.cc index f66afc862a2..14e280a07b6 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -29,7 +29,7 @@ Status Iterator::GetProperty(std::string prop_name, std::string* prop) { namespace { class EmptyIterator : public Iterator { public: - explicit EmptyIterator(const Status& s) : status_(s) { } + explicit EmptyIterator(const Status& s) : status_(s) {} bool Valid() const override { return false; } void Seek(const Slice& /*target*/) override {} void SeekForPrev(const Slice& /*target*/) override {} diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index d10330e0673..17abef4ac79 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -145,9 +145,7 @@ class IteratorWrapperBase { return iter_->IsValuePinned(); } - bool IsValuePrepared() const { - return result_.value_prepared; - } + bool IsValuePrepared() const { return result_.value_prepared; } Slice user_key() const { assert(Valid()); diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index 9d42768017b..309ae69c5e8 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -43,7 +43,7 @@ struct HeapItem { enum Type { ITERATOR, 
DELETE_RANGE_START, DELETE_RANGE_END }; IteratorWrapper iter; size_t level = 0; - std::string pinned_key; + ParsedInternalKey parsed_ikey; // Will be overwritten before use, initialize here so compiler does not // complain. Type type = ITERATOR; @@ -54,26 +54,14 @@ struct HeapItem { } void SetTombstoneKey(ParsedInternalKey&& pik) { - pinned_key.clear(); - // Range tombstone end key is exclusive. If a point internal key has the - // same user key and sequence number as the start or end key of a range - // tombstone, the order will be start < end key < internal key with the - // following op_type change. This is helpful to ensure keys popped from - // heap are in expected order since range tombstone start/end keys will - // be distinct from point internal keys. Strictly speaking, this is only - // needed for tombstone end points that are truncated in - // TruncatedRangeDelIterator since untruncated tombstone end points always - // have kMaxSequenceNumber and kTypeRangeDeletion (see - // TruncatedRangeDelIterator::start_key()/end_key()). - ParsedInternalKey p(pik.user_key, pik.sequence, kTypeMaxValid); - AppendInternalKey(&pinned_key, p); + // op_type is already initialized in MergingIterator::Finish(). + parsed_ikey.user_key = pik.user_key; + parsed_ikey.sequence = pik.sequence; } Slice key() const { - if (type == Type::ITERATOR) { - return iter.key(); - } - return pinned_key; + assert(type == ITERATOR); + return iter.key(); } bool IsDeleteRangeSentinelKey() const { @@ -89,7 +77,19 @@ class MinHeapItemComparator { MinHeapItemComparator(const InternalKeyComparator* comparator) : comparator_(comparator) {} bool operator()(HeapItem* a, HeapItem* b) const { - return comparator_->Compare(a->key(), b->key()) > 0; + if (LIKELY(a->type == HeapItem::ITERATOR)) { + if (LIKELY(b->type == HeapItem::ITERATOR)) { + return comparator_->Compare(a->key(), b->key()) > 0; + } else { + return comparator_->Compare(a->key(), b->parsed_ikey) > 0; + } + } else { + if (LIKELY(b->type == HeapItem::ITERATOR)) { + return comparator_->Compare(a->parsed_ikey, b->key()) > 0; + } else { + return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0; + } + } } private: @@ -101,7 +101,19 @@ class MaxHeapItemComparator { MaxHeapItemComparator(const InternalKeyComparator* comparator) : comparator_(comparator) {} bool operator()(HeapItem* a, HeapItem* b) const { - return comparator_->Compare(a->key(), b->key()) < 0; + if (LIKELY(a->type == HeapItem::ITERATOR)) { + if (LIKELY(b->type == HeapItem::ITERATOR)) { + return comparator_->Compare(a->key(), b->key()) < 0; + } else { + return comparator_->Compare(a->key(), b->parsed_ikey) < 0; + } + } else { + if (LIKELY(b->type == HeapItem::ITERATOR)) { + return comparator_->Compare(a->parsed_ikey, b->key()) < 0; + } else { + return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0; + } + } } private: @@ -177,6 +189,17 @@ class MergingIterator : public InternalIterator { pinned_heap_item_.resize(range_tombstone_iters_.size()); for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { pinned_heap_item_[i].level = i; + // Range tombstone end key is exclusive. If a point internal key has the + // same user key and sequence number as the start or end key of a range + // tombstone, the order will be start < end key < internal key with the + // following op_type change. This is helpful to ensure keys popped from + // heap are in expected order since range tombstone start/end keys will + // be distinct from point internal keys. 
Strictly speaking, this is only + // needed for tombstone end points that are truncated in + // TruncatedRangeDelIterator since untruncated tombstone end points + // always have kMaxSequenceNumber and kTypeRangeDeletion (see + // TruncatedRangeDelIterator::start_key()/end_key()). + pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid; } } } @@ -824,14 +847,18 @@ bool MergingIterator::SkipNextDeleted() { // SetTombstoneKey()). assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion || active_.count(current->level) == 0); - // LevelIterator enters a new SST file - current->iter.Next(); - if (current->iter.Valid()) { - assert(current->iter.status().ok()); - minHeap_.replace_top(current); - } else { - minHeap_.pop(); - } + // When entering a new file, old range tombstone iter is freed, + // but the last key from that range tombstone iter may still be in the heap. + // We need to ensure the data underlying its corresponding key Slice is + // still alive. We do so by popping the range tombstone key from heap before + // calling iter->Next(). Technically, this change is not needed: if there is + // a range tombstone end key that is after file boundary sentinel key in + // minHeap_, the range tombstone end key must have been truncated at file + // boundary. The underlying data of the range tombstone end key Slice is the + // SST file's largest internal key stored as file metadata in Version. + // However, since there are too many implicit assumptions made, it is safer + // to just ensure range tombstone iter is still alive. + minHeap_.pop(); // Remove last SST file's range tombstone end key if there is one. // This means file boundary is before range tombstone end key, // which could happen when a range tombstone and a user key @@ -842,6 +869,12 @@ bool MergingIterator::SkipNextDeleted() { minHeap_.pop(); active_.erase(current->level); } + // LevelIterator enters a new SST file + current->iter.Next(); + if (current->iter.Valid()) { + assert(current->iter.status().ok()); + minHeap_.push(current); + } if (range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()) { InsertRangeTombstoneToMinHeap(current->level); @@ -852,7 +885,8 @@ bool MergingIterator::SkipNextDeleted() { // Point key case: check active_ for range tombstone coverage. ParsedInternalKey pik; ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); - for (auto& i : active_) { + if (!active_.empty()) { + auto i = *active_.begin(); if (i < current->level) { // range tombstone is from a newer level, definitely covers assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), @@ -1037,18 +1071,19 @@ bool MergingIterator::SkipPrevDeleted() { } if (current->iter.IsDeleteRangeSentinelKey()) { // LevelIterator enters a new SST file - current->iter.Prev(); - if (current->iter.Valid()) { - assert(current->iter.status().ok()); - maxHeap_->replace_top(current); - } else { - maxHeap_->pop(); - } + maxHeap_->pop(); + // Remove last SST file's range tombstone key if there is one. 
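Both reordered hunks here (SkipNextDeleted() above, SkipPrevDeleted() in this hunk) apply the same fix: pop the heap entry before advancing the LevelIterator, because Next()/Prev() can free the old file's range-tombstone iterator while the heap still holds a key Slice backed by it. A condensed sketch of the safe ordering; TopIsOldFilesTombstoneEnd is a hypothetical stand-in for the level/type check spelled out in the surrounding code:

    // Unsafe (previous) ordering: advance first, then fix up the heap.
    // Any heap entry still pointing into the old file's tombstone data
    // dangles once Next() switches files.
    //   current->iter.Next();
    //   minHeap_.replace_top(current);

    // Safe (patched) ordering:
    minHeap_.pop();                      // remove the sentinel entry first
    if (!minHeap_.empty() && TopIsOldFilesTombstoneEnd(minHeap_, current)) {
      minHeap_.pop();                    // drop the stale tombstone end key too
      active_.erase(current->level);
    }
    current->iter.Next();                // only now cross into the next file
    if (current->iter.Valid()) {
      minHeap_.push(current);            // re-insert instead of replace_top
    }

The pop/push pair costs one extra heap operation versus replace_top, but it removes the implicit lifetime assumption the new comment calls out.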
if (!maxHeap_->empty() && maxHeap_->top()->level == current->level && maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) { maxHeap_->pop(); active_.erase(current->level); } + current->iter.Prev(); + if (current->iter.Valid()) { + assert(current->iter.status().ok()); + maxHeap_->push(current); + } + if (range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()) { InsertRangeTombstoneToMaxHeap(current->level); @@ -1059,7 +1094,8 @@ bool MergingIterator::SkipPrevDeleted() { // Point key case: check active_ for range tombstone coverage. ParsedInternalKey pik; ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); - for (auto& i : active_) { + if (!active_.empty()) { + auto i = *active_.begin(); if (i < current->level) { // range tombstone is from a newer level, definitely covers assert(comparator_->Compare(range_tombstone_iters_[i]->start_key(), @@ -1098,6 +1134,7 @@ bool MergingIterator::SkipPrevDeleted() { return false /* current key not deleted */; } } + assert(active_.empty()); assert(maxHeap_->top()->type == HeapItem::ITERATOR); return false /* current key not deleted */; diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 78402482b7f..6530f6a80c4 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -35,8 +35,7 @@ const std::string kRangeDelBlockName = "rocksdb.range_del"; MetaIndexBuilder::MetaIndexBuilder() : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} -void MetaIndexBuilder::Add(const std::string& key, - const BlockHandle& handle) { +void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) { std::string handle_encoding; handle.EncodeTo(&handle_encoding); meta_block_handles_.insert({key, handle_encoding}); @@ -173,8 +172,8 @@ void LogPropertiesCollectionError(Logger* info_log, const std::string& method, assert(method == "Add" || method == "Finish"); std::string msg = - "Encountered error when calling TablePropertiesCollector::" + - method + "() with collector name: " + name; + "Encountered error when calling TablePropertiesCollector::" + method + + "() with collector name: " + name; ROCKS_LOG_ERROR(info_log, "%s", msg.c_str()); } @@ -346,8 +345,9 @@ Status ReadTablePropertiesHelper( if (!GetVarint64(&raw_val, &val)) { // skip malformed value auto error_msg = - "Detect malformed value in properties meta-block:" - "\tkey: " + key + "\tval: " + raw_val.ToString(); + "Detect malformed value in properties meta-block:" + "\tkey: " + + key + "\tval: " + raw_val.ToString(); ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); continue; } @@ -479,8 +479,8 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, Footer* footer_out) { Footer footer; IOOptions opts; - auto s = ReadFooterFromFile(opts, file, prefetch_buffer, file_size, &footer, - table_magic_number); + auto s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, + file_size, &footer, table_magic_number); if (!s.ok()) { return s; } diff --git a/table/multiget_context.h b/table/multiget_context.h index 933f4e17de8..76027a9520f 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -123,8 +123,7 @@ class MultiGetContext { assert(num_keys <= MAX_BATCH_SIZE); if (num_keys > MAX_LOOKUP_KEYS_ON_STACK) { lookup_key_heap_buf.reset(new char[sizeof(LookupKey) * num_keys]); - lookup_key_ptr_ = reinterpret_cast( - lookup_key_heap_buf.get()); + lookup_key_ptr_ = reinterpret_cast(lookup_key_heap_buf.get()); } for (size_t iter = 0; iter != num_keys_; ++iter) { @@ -157,8 +156,9 @@ class 
MultiGetContext { private: static const int MAX_LOOKUP_KEYS_ON_STACK = 16; - alignas(alignof(LookupKey)) - char lookup_key_stack_buf[sizeof(LookupKey) * MAX_LOOKUP_KEYS_ON_STACK]; + alignas( + alignof(LookupKey)) char lookup_key_stack_buf[sizeof(LookupKey) * + MAX_LOOKUP_KEYS_ON_STACK]; std::array sorted_keys_; size_t num_keys_; Mask value_mask_; @@ -250,8 +250,7 @@ class MultiGetContext { size_t index_; }; - Range(const Range& mget_range, - const Iterator& first, + Range(const Range& mget_range, const Iterator& first, const Iterator& last) { ctx_ = mget_range.ctx_; if (first == last) { diff --git a/table/persistent_cache_helper.cc b/table/persistent_cache_helper.cc index 8435b294e4d..eece8100e6c 100644 --- a/table/persistent_cache_helper.cc +++ b/table/persistent_cache_helper.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include "table/persistent_cache_helper.h" + #include "table/block_based/block_based_table_reader.h" #include "table/format.h" diff --git a/table/plain/plain_table_bloom.cc b/table/plain/plain_table_bloom.cc index 7b683352417..21441f6161b 100644 --- a/table/plain/plain_table_bloom.cc +++ b/table/plain/plain_table_bloom.cc @@ -7,9 +7,9 @@ #include #include -#include "util/dynamic_bloom.h" #include "memory/allocator.h" +#include "util/dynamic_bloom.h" namespace ROCKSDB_NAMESPACE { diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index a93cf4c4917..04723955cf8 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -82,8 +82,9 @@ PlainTableBuilder::PlainTableBuilder( index_builder_.reset(new PlainTableIndexBuilder( &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness, hash_table_ratio, huge_page_tlb_size_)); - properties_.user_collected_properties - [PlainTablePropertyNames::kBloomVersion] = "1"; // For future use + properties_ + .user_collected_properties[PlainTablePropertyNames::kBloomVersion] = + "1"; // For future use } properties_.fixed_key_len = user_key_len; @@ -112,8 +113,8 @@ PlainTableBuilder::PlainTableBuilder( std::string val; PutFixed32(&val, static_cast(encoder_.GetEncodingType())); - properties_.user_collected_properties - [PlainTablePropertyNames::kEncodingType] = val; + properties_ + .user_collected_properties[PlainTablePropertyNames::kEncodingType] = val; assert(int_tbl_prop_collector_factories); for (auto& factory : *int_tbl_prop_collector_factories) { @@ -303,17 +304,13 @@ Status PlainTableBuilder::Finish() { return status_; } -void PlainTableBuilder::Abandon() { - closed_ = true; -} +void PlainTableBuilder::Abandon() { closed_ = true; } uint64_t PlainTableBuilder::NumEntries() const { return properties_.num_entries; } -uint64_t PlainTableBuilder::FileSize() const { - return offset_; -} +uint64_t PlainTableBuilder::FileSize() const { return offset_; } std::string PlainTableBuilder::GetFileChecksum() const { if (file_ != nullptr) { diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h index 0903b96e87e..445491c2ab4 100644 --- a/table/plain/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -7,8 +7,10 @@ #ifndef ROCKSDB_LITE #include + #include #include + #include "db/version_edit.h" #include "rocksdb/options.h" #include "rocksdb/status.h" @@ -29,7 +31,7 @@ class TableBuilder; // The builder class of PlainTable. For description of PlainTable format // See comments of class PlainTableFactory, where instances of // PlainTableReader are created. 
-class PlainTableBuilder: public TableBuilder { +class PlainTableBuilder : public TableBuilder { public: // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the diff --git a/table/plain/plain_table_factory.h b/table/plain/plain_table_factory.h index e482403277d..ce60b9d1990 100644 --- a/table/plain/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -6,9 +6,10 @@ #pragma once #ifndef ROCKSDB_LITE +#include + #include #include -#include #include "rocksdb/table.h" @@ -177,6 +178,5 @@ class PlainTableFactory : public TableFactory { PlainTableOptions table_options_; }; - } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_index.cc b/table/plain/plain_table_index.cc index 35b5fc8cb21..b7e07cfb224 100644 --- a/table/plain/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -19,7 +19,7 @@ inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) { assert(num_buckets > 0); return hash % num_buckets; } -} +} // namespace Status PlainTableIndex::InitFromRawData(Slice data) { if (!GetVarint32(&data, &index_size_)) { @@ -114,7 +114,7 @@ void PlainTableIndexBuilder::AllocateIndex() { } else { double hash_table_size_multipier = 1.0 / hash_table_ratio_; index_size_ = - static_cast(num_prefixes_ * hash_table_size_multipier) + 1; + static_cast(num_prefixes_ * hash_table_size_multipier) + 1; assert(index_size_ > 0); } } @@ -180,7 +180,8 @@ Slice PlainTableIndexBuilder::FillIndexes( break; default: // point to second level indexes. - PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask); + PutUnaligned(index + i, + sub_index_offset | PlainTableIndex::kSubIndexMask); char* prev_ptr = &sub_index[sub_index_offset]; char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket); sub_index_offset += static_cast(cur_ptr - prev_ptr); diff --git a/table/plain/plain_table_index.h b/table/plain/plain_table_index.h index 565a4e0f686..9f5f0eeff1e 100644 --- a/table/plain/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -188,8 +188,8 @@ class PlainTableIndexBuilder { num_records_in_current_group_; } IndexRecord* At(size_t index) { - return &(groups_[index / kNumRecordsPerGroup] - [index % kNumRecordsPerGroup]); + return &( + groups_[index / kNumRecordsPerGroup][index % kNumRecordsPerGroup]); } private: diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 93b198fc5b7..800d8d76fbc 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -8,6 +8,7 @@ #include #include + #include "db/dbformat.h" #include "file/writable_file_writer.h" #include "table/plain/plain_table_factory.h" diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 5e04f37995f..6ce3d0ab994 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -11,14 +11,15 @@ #include #include "db/dbformat.h" - +#include "memory/arena.h" +#include "monitoring/histogram.h" +#include "monitoring/perf_context_imp.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" - #include "table/block_based/block.h" #include "table/block_based/filter_block.h" #include "table/format.h" @@ -29,10 +30,6 @@ #include "table/plain/plain_table_factory.h" #include "table/plain/plain_table_key_coding.h" #include 
"table/two_level_iterator.h" - -#include "memory/arena.h" -#include "monitoring/histogram.h" -#include "monitoring/perf_context_imp.h" #include "util/coding.h" #include "util/dynamic_bloom.h" #include "util/hash.h" @@ -194,14 +191,12 @@ Status PlainTableReader::Open( return s; } -void PlainTableReader::SetupForCompaction() { -} +void PlainTableReader::SetupForCompaction() {} InternalIterator* PlainTableReader::NewIterator( const ReadOptions& options, const SliceTransform* /* prefix_extractor */, Arena* arena, bool /*skip_filters*/, TableReaderCaller /*caller*/, - size_t /*compaction_readahead_size*/, - bool /* allow_unprepared_value */) { + size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) { // Not necessarily used here, but make sure this has been initialized assert(table_properties_); @@ -640,8 +635,7 @@ PlainTableIterator::PlainTableIterator(PlainTableReader* table, next_offset_ = offset_ = table_->file_info_.data_end_offset; } -PlainTableIterator::~PlainTableIterator() { -} +PlainTableIterator::~PlainTableIterator() {} bool PlainTableIterator::Valid() const { return offset_ < table_->file_info_.data_end_offset && @@ -671,9 +665,8 @@ void PlainTableIterator::Seek(const Slice& target) { // it. This is needed for compaction: it creates iterator with // total_order_seek = true but usually never does Seek() on it, // only SeekToFirst(). - status_ = - Status::InvalidArgument( - "total_order_seek not implemented for PlainTable."); + status_ = Status::InvalidArgument( + "total_order_seek not implemented for PlainTable."); offset_ = next_offset_ = table_->file_info_.data_end_offset; return; } @@ -754,9 +747,7 @@ void PlainTableIterator::Next() { } } -void PlainTableIterator::Prev() { - assert(false); -} +void PlainTableIterator::Prev() { assert(false); } Slice PlainTableIterator::key() const { assert(Valid()); @@ -768,9 +759,7 @@ Slice PlainTableIterator::value() const { return value_; } -Status PlainTableIterator::status() const { - return status_; -} +Status PlainTableIterator::status() const { return status_; } } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 168beac4860..62bda693aeb 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -6,11 +6,12 @@ #pragma once #ifndef ROCKSDB_LITE -#include +#include + #include -#include #include -#include +#include +#include #include "file/random_access_file_reader.h" #include "memory/arena.h" @@ -58,14 +59,14 @@ struct PlainTableReaderFileInfo { // The reader class of PlainTable. For description of PlainTable format // See comments of class PlainTableFactory, where instances of // PlainTableReader are created. -class PlainTableReader: public TableReader { +class PlainTableReader : public TableReader { public: -// Based on following output file format shown in plain_table_factory.h -// When opening the output file, PlainTableReader creates a hash table -// from key prefixes to offset of the output file. PlainTable will decide -// whether it points to the data offset of the first key with the key prefix -// or the offset of it. If there are too many keys share this prefix, it will -// create a binary search-able index from the suffix to offset on disk. + // Based on following output file format shown in plain_table_factory.h + // When opening the output file, PlainTableReader creates a hash table + // from key prefixes to offset of the output file. 
PlainTable will decide + // whether it points to the data offset of the first key with the key prefix + // or the offset of it. If there are too many keys share this prefix, it will + // create a binary search-able index from the suffix to offset on disk. static Status Open(const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -165,10 +166,11 @@ class PlainTableReader: public TableReader { const ImmutableOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; - protected: // for testing + + protected: // for testing std::shared_ptr table_properties_; - private: + private: bool IsFixedLength() const { return user_key_len_ != kPlainTableVariableLength; } diff --git a/table/scoped_arena_iterator.h b/table/scoped_arena_iterator.h index 619b93c180d..2b8824d95e4 100644 --- a/table/scoped_arena_iterator.h +++ b/table/scoped_arena_iterator.h @@ -7,8 +7,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "table/internal_iterator.h" #include "port/port.h" +#include "table/internal_iterator.h" namespace ROCKSDB_NAMESPACE { class ScopedArenaIterator { @@ -20,7 +20,6 @@ class ScopedArenaIterator { } public: - explicit ScopedArenaIterator(InternalIterator* iter = nullptr) : iter_(iter) {} @@ -50,9 +49,7 @@ class ScopedArenaIterator { return res; } - ~ScopedArenaIterator() { - reset(nullptr); - } + ~ScopedArenaIterator() { reset(nullptr); } private: InternalIterator* iter_; diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index eefbaaeee1a..3357099e829 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -113,7 +113,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { static_cast(prefetch_size), Env::IO_TOTAL /* rate_limiter_priority */); - s = ReadFooterFromFile(opts, file_.get(), &prefetch_buffer, file_size, + s = ReadFooterFromFile(opts, file_.get(), *fs, &prefetch_buffer, file_size, &footer); } if (s.ok()) { @@ -223,9 +223,8 @@ Status SstFileDumper::CalculateCompressedTableSize( table_options.block_size = block_size; BlockBasedTableFactory block_based_tf(table_options); std::unique_ptr table_builder; - table_builder.reset(block_based_tf.NewTableBuilder( - tb_options, - dest_writer.get())); + table_builder.reset( + block_based_tf.NewTableBuilder(tb_options, dest_writer.get())); std::unique_ptr iter(table_reader_->NewIterator( read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool)); diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index e2241918b02..273c2fc4a7a 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -25,7 +25,7 @@ const std::string ExternalSstFilePropertyNames::kGlobalSeqno = #ifndef ROCKSDB_LITE -const size_t kFadviseTrigger = 1024 * 1024; // 1MB +const size_t kFadviseTrigger = 1024 * 1024; // 1MB struct SstFileWriter::Rep { Rep(const EnvOptions& _env_options, const Options& options, @@ -214,8 +214,7 @@ struct SstFileWriter::Rep { // Fadvise disabled return s; } - uint64_t bytes_since_last_fadvise = - builder->FileSize() - last_fadvise_size; + uint64_t bytes_since_last_fadvise = builder->FileSize() - last_fadvise_size; if (bytes_since_last_fadvise > kFadviseTrigger || closing) { TEST_SYNC_POINT_CALLBACK("SstFileWriter::Rep::InvalidatePageCache", &(bytes_since_last_fadvise)); @@ -430,9 +429,7 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { return s; } -uint64_t 
SstFileWriter::FileSize() { - return rep_->file_info.file_size; -} +uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; } #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_properties.cc b/table/table_properties.cc index a88686651b3..b382281f857 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -21,32 +21,25 @@ const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily = std::numeric_limits::max(); namespace { - void AppendProperty( - std::string& props, - const std::string& key, - const std::string& value, - const std::string& prop_delim, - const std::string& kv_delim) { - props.append(key); - props.append(kv_delim); - props.append(value); - props.append(prop_delim); - } +void AppendProperty(std::string& props, const std::string& key, + const std::string& value, const std::string& prop_delim, + const std::string& kv_delim) { + props.append(key); + props.append(kv_delim); + props.append(value); + props.append(prop_delim); +} - template - void AppendProperty( - std::string& props, - const std::string& key, - const TValue& value, - const std::string& prop_delim, - const std::string& kv_delim) { - AppendProperty(props, key, std::to_string(value), prop_delim, kv_delim); - } +template +void AppendProperty(std::string& props, const std::string& key, + const TValue& value, const std::string& prop_delim, + const std::string& kv_delim) { + AppendProperty(props, key, std::to_string(value), prop_delim, kv_delim); } +} // namespace -std::string TableProperties::ToString( - const std::string& prop_delim, - const std::string& kv_delim) const { +std::string TableProperties::ToString(const std::string& prop_delim, + const std::string& kv_delim) const { std::string result; result.reserve(1024); @@ -81,8 +74,8 @@ std::string TableProperties::ToString( if (index_partitions != 0) { AppendProperty(result, "# index partitions", index_partitions, prop_delim, kv_delim); - AppendProperty(result, "top-level index size", top_level_index_size, prop_delim, - kv_delim); + AppendProperty(result, "top-level index size", top_level_index_size, + prop_delim, kv_delim); } AppendProperty(result, "filter block size", filter_size, prop_delim, kv_delim); @@ -256,10 +249,8 @@ const std::string TablePropertiesNames::kDbHostId = "rocksdb.creating.host.identity"; const std::string TablePropertiesNames::kOriginalFileNumber = "rocksdb.original.file.number"; -const std::string TablePropertiesNames::kDataSize = - "rocksdb.data.size"; -const std::string TablePropertiesNames::kIndexSize = - "rocksdb.index.size"; +const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; +const std::string TablePropertiesNames::kIndexSize = "rocksdb.index.size"; const std::string TablePropertiesNames::kIndexPartitions = "rocksdb.index.partitions"; const std::string TablePropertiesNames::kTopLevelIndexSize = @@ -268,16 +259,13 @@ const std::string TablePropertiesNames::kIndexKeyIsUserKey = "rocksdb.index.key.is.user.key"; const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = "rocksdb.index.value.is.delta.encoded"; -const std::string TablePropertiesNames::kFilterSize = - "rocksdb.filter.size"; -const std::string TablePropertiesNames::kRawKeySize = - "rocksdb.raw.key.size"; +const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; +const std::string TablePropertiesNames::kRawKeySize = "rocksdb.raw.key.size"; const std::string TablePropertiesNames::kRawValueSize = "rocksdb.raw.value.size"; const std::string 
TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; -const std::string TablePropertiesNames::kNumEntries = - "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; const std::string TablePropertiesNames::kNumFilterEntries = "rocksdb.num.filter_entries"; const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; @@ -285,8 +273,7 @@ const std::string TablePropertiesNames::kMergeOperands = "rocksdb.merge.operands"; const std::string TablePropertiesNames::kNumRangeDeletions = "rocksdb.num.range-deletions"; -const std::string TablePropertiesNames::kFilterPolicy = - "rocksdb.filter.policy"; +const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; const std::string TablePropertiesNames::kFormatVersion = "rocksdb.format.version"; const std::string TablePropertiesNames::kFixedKeyLen = diff --git a/table/table_reader.h b/table/table_reader.h index 4a904991d2d..391072eec1b 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -9,6 +9,7 @@ #pragma once #include + #include "db/range_tombstone_fragmenter.h" #if USE_COROUTINES #include "folly/experimental/coro/Coroutine.h" @@ -161,8 +162,8 @@ class TableReader { // persists the data on a non volatile storage medium like disk/SSD virtual Status Prefetch(const Slice* begin = nullptr, const Slice* end = nullptr) { - (void) begin; - (void) end; + (void)begin; + (void)end; // Default implementation is NOOP. // The child class should implement functionality when applicable return Status::OK(); diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index 53d956b7926..b13caf68d52 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -224,9 +224,10 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } } if (count != r2_len) { - fprintf( - stderr, "Iterator cannot iterate expected number of entries. " - "Expected %d but got %d\n", r2_len, count); + fprintf(stderr, + "Iterator cannot iterate expected number of entries. " + "Expected %d but got %d\n", + r2_len, count); assert(false); } delete iter; @@ -261,16 +262,16 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } // namespace } // namespace ROCKSDB_NAMESPACE -DEFINE_bool(query_empty, false, "query non-existing keys instead of existing " - "ones."); +DEFINE_bool(query_empty, false, + "query non-existing keys instead of existing ones."); DEFINE_int32(num_keys1, 4096, "number of distinguish prefix of keys"); DEFINE_int32(num_keys2, 512, "number of distinguish keys for each prefix"); DEFINE_int32(iter, 3, "query non-existing keys instead of existing ones"); DEFINE_int32(prefix_len, 16, "Prefix length used for iterators and indexes"); DEFINE_bool(iterator, false, "For test iterator"); -DEFINE_bool(through_db, false, "If enable, a DB instance will be created and " - "the query will be against DB. Otherwise, will be directly against " - "a table reader."); +DEFINE_bool(through_db, false, + "If enable, a DB instance will be created and the query will be " + "against DB. 
Otherwise, will be directly against a table reader."); DEFINE_bool(mmap_read, true, "Whether use mmap read"); DEFINE_string(table_factory, "block_based", "Table factory to use: `block_based` (default), `plain_table` or " diff --git a/table/table_test.cc b/table/table_test.cc index 65b98ffba2d..d5fff82da4f 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -186,7 +186,7 @@ class Constructor { public: explicit Constructor(const Comparator* cmp) : data_(stl_wrappers::LessOfComparator(cmp)) {} - virtual ~Constructor() { } + virtual ~Constructor() {} void Add(const std::string& key, const Slice& value) { data_[key] = value.ToString(); @@ -492,7 +492,7 @@ class TableConstructor : public Constructor { }; uint64_t TableConstructor::cur_file_num_ = 1; -class MemTableConstructor: public Constructor { +class MemTableConstructor : public Constructor { public: explicit MemTableConstructor(const Comparator* cmp, WriteBufferManager* wb) : Constructor(cmp), @@ -566,11 +566,10 @@ class InternalIteratorFromIterator : public InternalIterator { std::unique_ptr it_; }; -class DBConstructor: public Constructor { +class DBConstructor : public Constructor { public: explicit DBConstructor(const Comparator* cmp) - : Constructor(cmp), - comparator_(cmp) { + : Constructor(cmp), comparator_(cmp) { db_ = nullptr; NewDB(); } @@ -654,15 +653,15 @@ std::ostream& operator<<(std::ostream& os, const TestArgs& args) { static std::vector GenerateArgList() { std::vector test_args; - std::vector test_types = { - BLOCK_BASED_TABLE_TEST, + std::vector test_types = {BLOCK_BASED_TABLE_TEST, #ifndef ROCKSDB_LITE - PLAIN_TABLE_SEMI_FIXED_PREFIX, - PLAIN_TABLE_FULL_STR_PREFIX, - PLAIN_TABLE_TOTAL_ORDER, + PLAIN_TABLE_SEMI_FIXED_PREFIX, + PLAIN_TABLE_FULL_STR_PREFIX, + PLAIN_TABLE_TOTAL_ORDER, #endif // !ROCKSDB_LITE - BLOCK_TEST, - MEMTABLE_TEST, DB_TEST}; + BLOCK_TEST, + MEMTABLE_TEST, + DB_TEST}; std::vector reverse_compare_types = {false, true}; std::vector restart_intervals = {16, 1, 1024}; std::vector compression_parallel_threads = {1, 4}; @@ -747,9 +746,8 @@ class FixedOrLessPrefixTransform : public SliceTransform { const size_t prefix_len_; public: - explicit FixedOrLessPrefixTransform(size_t prefix_len) : - prefix_len_(prefix_len) { - } + explicit FixedOrLessPrefixTransform(size_t prefix_len) + : prefix_len_(prefix_len) {} const char* Name() const override { return "rocksdb.FixedPrefix"; } @@ -964,8 +962,8 @@ class HarnessTest : public testing::Test { case 2: { std::string key = PickRandomKey(rnd, keys); model_iter = data.lower_bound(key); - if (kVerbose) fprintf(stderr, "Seek '%s'\n", - EscapeString(key).c_str()); + if (kVerbose) + fprintf(stderr, "Seek '%s'\n", EscapeString(key).c_str()); iter->Seek(Slice(key)); ASSERT_OK(iter->status()); ASSERT_EQ(ToString(data, model_iter), ToString(iter)); @@ -978,7 +976,7 @@ class HarnessTest : public testing::Test { iter->Prev(); ASSERT_OK(iter->status()); if (model_iter == data.begin()) { - model_iter = data.end(); // Wrap around to invalid value + model_iter = data.end(); // Wrap around to invalid value } else { --model_iter; } @@ -1047,14 +1045,14 @@ class HarnessTest : public testing::Test { break; case 1: { // Attempt to return something smaller than an existing key - if (result.size() > 0 && result[result.size() - 1] > '\0' - && (!only_support_prefix_seek_ - || options_.prefix_extractor->Transform(result).size() - < result.size())) { + if (result.size() > 0 && result[result.size() - 1] > '\0' && + (!only_support_prefix_seek_ || + 
options_.prefix_extractor->Transform(result).size() < + result.size())) { result[result.size() - 1]--; } break; - } + } case 2: { // Return something larger than an existing key Increment(options_.comparator, &result); @@ -1103,8 +1101,7 @@ static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), + (unsigned long long)(val), (unsigned long long)(low), (unsigned long long)(high)); } return result; @@ -1183,8 +1180,8 @@ class BlockBasedTableTest { std::unique_ptr trace_reader; - Status s = - NewFileTraceReader(env_, EnvOptions(), trace_file_path_, &trace_reader); + Status s = NewFileTraceReader(env_, EnvOptions(), trace_file_path_, + &trace_reader); EXPECT_OK(s); BlockCacheTraceReader reader(std::move(trace_reader)); BlockCacheTraceHeader header; @@ -1249,8 +1246,7 @@ class BBTTailPrefetchTest : public TableTest {}; class FileChecksumTestHelper { public: FileChecksumTestHelper(bool convert_to_internal_key = false) - : convert_to_internal_key_(convert_to_internal_key) { - } + : convert_to_internal_key_(convert_to_internal_key) {} ~FileChecksumTestHelper() {} void CreateWritableFile() { @@ -1368,22 +1364,18 @@ INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest, // This test serves as the living tutorial for the prefix scan of user collected // properties. TEST_F(TablePropertyTest, PrefixScanTest) { - UserCollectedProperties props{{"num.111.1", "1"}, - {"num.111.2", "2"}, - {"num.111.3", "3"}, - {"num.333.1", "1"}, - {"num.333.2", "2"}, - {"num.333.3", "3"}, - {"num.555.1", "1"}, - {"num.555.2", "2"}, - {"num.555.3", "3"}, }; + UserCollectedProperties props{ + {"num.111.1", "1"}, {"num.111.2", "2"}, {"num.111.3", "3"}, + {"num.333.1", "1"}, {"num.333.2", "2"}, {"num.333.3", "3"}, + {"num.555.1", "1"}, {"num.555.2", "2"}, {"num.555.3", "3"}, + }; // prefixes that exist for (const std::string prefix : {"num.111", "num.333", "num.555"}) { int num = 0; for (auto pos = props.lower_bound(prefix); pos != props.end() && - pos->first.compare(0, prefix.size(), prefix) == 0; + pos->first.compare(0, prefix.size(), prefix) == 0; ++pos) { ++num; auto key = prefix + "." 
+ std::to_string(num); @@ -2031,7 +2023,6 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { // [ k05 ] k05 // [ k06 k07 ] k07 - // Simple PrefetchRange(&c, &opt, &table_options, /*key_range=*/"k01", "k05", @@ -2069,35 +2060,35 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { // Make each key/value an individual block table_options.block_size = 64; switch (i) { - case 0: - // Binary search index - table_options.index_type = BlockBasedTableOptions::kBinarySearch; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - break; - case 1: - // Hash search index - table_options.index_type = BlockBasedTableOptions::kHashSearch; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - options.prefix_extractor.reset(NewFixedPrefixTransform(4)); - break; - case 2: - // Hash search index with filter policy - table_options.index_type = BlockBasedTableOptions::kHashSearch; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - options.prefix_extractor.reset(NewFixedPrefixTransform(4)); - break; - case 3: - // Two-level index - table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - break; - case 4: - // Binary search with first key - table_options.index_type = - BlockBasedTableOptions::kBinarySearchWithFirstKey; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - break; + case 0: + // Binary search index + table_options.index_type = BlockBasedTableOptions::kBinarySearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + case 1: + // Hash search index + table_options.index_type = BlockBasedTableOptions::kHashSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 2: + // Hash search index with filter policy + table_options.index_type = BlockBasedTableOptions::kHashSearch; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.prefix_extractor.reset(NewFixedPrefixTransform(4)); + break; + case 3: + // Two-level index + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; + case 4: + // Binary search with first key + table_options.index_type = + BlockBasedTableOptions::kBinarySearchWithFirstKey; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + break; } TableConstructor c(BytewiseComparator(), @@ -2265,8 +2256,9 @@ TEST_P(BlockBasedTableTest, BadChecksumType) { const MutableCFOptions new_moptions(options); Status s = c.Reopen(new_ioptions, new_moptions); ASSERT_NOK(s); + // "test" is file name ASSERT_EQ(s.ToString(), - "Corruption: Corrupt or unsupported checksum type: 123"); + "Corruption: Corrupt or unsupported checksum type: 123 in test"); } namespace { @@ -2452,7 +2444,12 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { } // find the upper bound of prefixes - std::vector upper_bound = {keys[1], keys[2], keys[7], keys[9], }; + std::vector upper_bound = { + keys[1], + keys[2], + keys[7], + keys[9], + }; // find existing keys for (const auto& item : kvmap) { @@ -3969,19 +3966,19 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { c.Finish(options, ioptions, moptions, table_options, 
internal_comparator, &keys, &kvmap); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000)); // k04 and k05 will be in two consecutive blocks, the index is // an arbitrary slice between k04 and k05, either before or after k04a ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000)); c.ResetTableReader(); } @@ -4045,8 +4042,7 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { if (!XPRESS_Supported()) { fprintf(stderr, "skipping xpress and xpress compression tests\n"); - } - else { + } else { compression_state.push_back(kXpressCompression); } @@ -4811,9 +4807,9 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { Footer footer; IOOptions opts; - ASSERT_OK(ReadFooterFromFile(opts, file, nullptr /* prefetch_buffer */, - file_size, &footer, - kBlockBasedTableMagicNumber)); + ASSERT_OK(ReadFooterFromFile(opts, file, *FileSystem::Default(), + nullptr /* prefetch_buffer */, file_size, + &footer, kBlockBasedTableMagicNumber)); auto BlockFetchHelper = [&](const BlockHandle& handle, BlockType block_type, BlockContents* contents) { @@ -4897,7 +4893,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // read footer Footer footer; IOOptions opts; - ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), *FileSystem::Default(), nullptr /* prefetch_buffer */, table_size, &footer, kBlockBasedTableMagicNumber)); @@ -4975,7 +4971,7 @@ TEST_P(BlockBasedTableTest, SeekMetaBlocks) { // read footer Footer footer; IOOptions opts; - ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), + ASSERT_OK(ReadFooterFromFile(opts, table_reader.get(), *FileSystem::Default(), nullptr /* prefetch_buffer */, table_size, &footer, kBlockBasedTableMagicNumber)); diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a744cf92100..4b6634e5cfe 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "table/two_level_iterator.h" + #include "db/pinned_iterators_manager.h" #include "memory/arena.h" #include "rocksdb/options.h" diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index 885dff84b78..1fed9341752 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -8,8 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "rocksdb/iterator.h" #include "rocksdb/env.h" +#include "rocksdb/iterator.h" #include "table/iterator_wrapper.h" namespace ROCKSDB_NAMESPACE { diff --git a/test_util/sync_point.cc b/test_util/sync_point.cc index 067fc8234fe..bec02d4f67a 100644 --- a/test_util/sync_point.cc +++ b/test_util/sync_point.cc @@ -21,22 +21,20 @@ SyncPoint* SyncPoint::GetInstance() { SyncPoint::SyncPoint() : impl_(new Data) {} -SyncPoint:: ~SyncPoint() { - delete impl_; -} +SyncPoint::~SyncPoint() { delete impl_; } void SyncPoint::LoadDependency(const std::vector& dependencies) { impl_->LoadDependency(dependencies); } void SyncPoint::LoadDependencyAndMarkers( - const std::vector& dependencies, - const std::vector& markers) { + const std::vector& dependencies, + const std::vector& markers) { impl_->LoadDependencyAndMarkers(dependencies, markers); } void SyncPoint::SetCallBack(const std::string& point, - const std::function& callback) { + const std::function& callback) { impl_->SetCallBack(point, callback); } @@ -44,21 +42,13 @@ void SyncPoint::ClearCallBack(const std::string& point) { impl_->ClearCallBack(point); } -void SyncPoint::ClearAllCallBacks() { - impl_->ClearAllCallBacks(); -} +void SyncPoint::ClearAllCallBacks() { impl_->ClearAllCallBacks(); } -void SyncPoint::EnableProcessing() { - impl_->EnableProcessing(); -} +void SyncPoint::EnableProcessing() { impl_->EnableProcessing(); } -void SyncPoint::DisableProcessing() { - impl_->DisableProcessing(); -} +void SyncPoint::DisableProcessing() { impl_->DisableProcessing(); } -void SyncPoint::ClearTrace() { - impl_->ClearTrace(); -} +void SyncPoint::ClearTrace() { impl_->ClearTrace(); } void SyncPoint::Process(const Slice& point, void* cb_arg) { impl_->Process(point, cb_arg); diff --git a/test_util/sync_point.h b/test_util/sync_point.h index 410176dc821..65f1239ec44 100644 --- a/test_util/sync_point.h +++ b/test_util/sync_point.h @@ -138,9 +138,9 @@ class SyncPoint { struct Data; private: - // Singleton + // Singleton SyncPoint(); - Data* impl_; + Data* impl_; }; // Sets up sync points to mock direct IO instead of actually issuing direct IO diff --git a/test_util/sync_point_impl.cc b/test_util/sync_point_impl.cc index 4add4988966..2a4bd3ccdf1 100644 --- a/test_util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -37,7 +37,8 @@ void KillPoint::TestKillRandom(std::string kill_point, int odds_weight, } } -void SyncPoint::Data::LoadDependency(const std::vector& dependencies) { +void SyncPoint::Data::LoadDependency( + const std::vector& dependencies) { std::lock_guard lock(mutex_); successors_.clear(); predecessors_.clear(); @@ -52,8 +53,8 @@ void SyncPoint::Data::LoadDependency(const std::vector& dependenc } void SyncPoint::Data::LoadDependencyAndMarkers( - const std::vector& dependencies, - const std::vector& markers) { + const std::vector& dependencies, + const std::vector& markers) { std::lock_guard lock(mutex_); successors_.clear(); predecessors_.clear(); diff --git a/test_util/sync_point_impl.h b/test_util/sync_point_impl.h index 52601804095..64cc0445e06 100644 --- a/test_util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -52,11 
+52,11 @@ struct SyncPoint::Data { // successor/predecessor map loaded from LoadDependency std::unordered_map> successors_; std::unordered_map> predecessors_; - std::unordered_map > callbacks_; - std::unordered_map > markers_; + std::unordered_map> callbacks_; + std::unordered_map> markers_; std::unordered_map marked_thread_id_; - std::mutex mutex_; + std::mutex mutex_; std::condition_variable cv_; // sync points that have been passed through std::unordered_set cleared_points_; @@ -68,29 +68,24 @@ struct SyncPoint::Data { void LoadDependency(const std::vector& dependencies); void LoadDependencyAndMarkers(const std::vector& dependencies, - const std::vector& markers); + const std::vector& markers); bool PredecessorsAllCleared(const std::string& point); void SetCallBack(const std::string& point, - const std::function& callback) { - std::lock_guard lock(mutex_); - callbacks_[point] = callback; - point_filter_.Add(point); -} + const std::function& callback) { + std::lock_guard lock(mutex_); + callbacks_[point] = callback; + point_filter_.Add(point); + } void ClearCallBack(const std::string& point); void ClearAllCallBacks(); - void EnableProcessing() { - enabled_ = true; - } - void DisableProcessing() { - enabled_ = false; - } + void EnableProcessing() { enabled_ = true; } + void DisableProcessing() { enabled_ = false; } void ClearTrace() { std::lock_guard lock(mutex_); cleared_points_.clear(); } - bool DisabledByMarker(const std::string& point, - std::thread::id thread_id) { + bool DisabledByMarker(const std::string& point, std::thread::id thread_id) { auto marked_point_iter = marked_thread_id_.find(point); return marked_point_iter != marked_thread_id_.end() && thread_id != marked_point_iter->second; @@ -98,4 +93,4 @@ struct SyncPoint::Data { void Process(const Slice& point, void* cb_arg); }; } // namespace ROCKSDB_NAMESPACE -#endif // NDEBUG +#endif // NDEBUG diff --git a/test_util/testharness.cc b/test_util/testharness.cc index 32d8a07d7f8..3c7b835d2f7 100644 --- a/test_util/testharness.cc +++ b/test_util/testharness.cc @@ -28,8 +28,7 @@ ::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { if (s.ok()) { return ::testing::AssertionSuccess(); } else { - return ::testing::AssertionFailure() << s_expr << std::endl - << s.ToString(); + return ::testing::AssertionFailure() << s_expr << std::endl << s.ToString(); } } diff --git a/test_util/testutil.h b/test_util/testutil.h index dc02b84b13c..c2289dd819f 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -144,9 +144,8 @@ class StringSink : public FSWritableFile { if (reader_contents_ != nullptr) { assert(reader_contents_->size() <= last_flush_); size_t offset = last_flush_ - reader_contents_->size(); - *reader_contents_ = Slice( - contents_.data() + offset, - contents_.size() - offset); + *reader_contents_ = + Slice(contents_.data() + offset, contents_.size() - offset); last_flush_ = contents_.size(); } @@ -165,8 +164,8 @@ class StringSink : public FSWritableFile { void Drop(size_t bytes) { if (reader_contents_ != nullptr) { contents_.resize(contents_.size() - bytes); - *reader_contents_ = Slice( - reader_contents_->data(), reader_contents_->size() - bytes); + *reader_contents_ = + Slice(reader_contents_->data(), reader_contents_->size() - bytes); last_flush_ = contents_.size(); } } @@ -282,7 +281,7 @@ class StringSource : public FSRandomAccessFile { mmap_(mmap), total_reads_(0) {} - virtual ~StringSource() { } + virtual ~StringSource() {} uint64_t Size() const { return contents_.size(); } @@ -324,7 +323,7 @@ 
class StringSource : public FSRandomAccessFile { char* rid = id; rid = EncodeVarint64(rid, uniq_id_); rid = EncodeVarint64(rid, 0); - return static_cast(rid-id); + return static_cast(rid - id); } int total_reads() const { return total_reads_; } @@ -364,6 +363,16 @@ class SleepingBackgroundTask { done_with_sleep_(false), sleeping_(false) {} + ~SleepingBackgroundTask() { + MutexLock l(&mutex_); + should_sleep_ = false; + while (sleeping_) { + assert(!should_sleep_); + bg_cv_.SignalAll(); + bg_cv_.Wait(); + } + } + bool IsSleeping() { MutexLock l(&mutex_); return sleeping_; diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index b90534341f1..99286d83617 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -13,14 +13,13 @@ #include #include +#include "db/dbformat.h" +#include "db/snapshot_impl.h" +#include "logging/logging.h" #include "rocksdb/db.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" - -#include "db/dbformat.h" -#include "db/snapshot_impl.h" -#include "logging/logging.h" #include "util/random.h" #include "util/string_util.h" diff --git a/test_util/transaction_test_util.h b/test_util/transaction_test_util.h index 175376f5f3a..7a38ab62681 100644 --- a/test_util/transaction_test_util.h +++ b/test_util/transaction_test_util.h @@ -7,8 +7,8 @@ #ifndef ROCKSDB_LITE -#include "rocksdb/options.h" #include "port/port.h" +#include "rocksdb/options.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction_db.h" diff --git a/tools/blob_dump.cc b/tools/blob_dump.cc index ab39b8513a8..1f75eb20d8a 100644 --- a/tools/blob_dump.cc +++ b/tools/blob_dump.cc @@ -5,6 +5,7 @@ #ifndef ROCKSDB_LITE #include + #include #include #include diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index 963719e95cc..f0bb6975bad 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -1175,7 +1175,8 @@ void BlockCacheTraceAnalyzer::WriteReuseLifetime( } void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline( - const uint64_t reuse_window, bool user_access_only, TraceType block_type) const { + const uint64_t reuse_window, bool user_access_only, + TraceType block_type) const { // A map from block key to an array of bools that states whether a block is // accessed in a time window. std::map> block_accessed; @@ -1214,7 +1215,8 @@ void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline( TraverseBlocks(block_callback); // A cell is the number of blocks accessed in a reuse window. - std::unique_ptr reuse_table(new uint64_t[reuse_vector_size * reuse_vector_size]); + std::unique_ptr reuse_table( + new uint64_t[reuse_vector_size * reuse_vector_size]); for (uint64_t start_time = 0; start_time < reuse_vector_size; start_time++) { // Initialize the reuse_table. 
for (uint64_t i = 0; i < reuse_vector_size; i++) { @@ -1255,8 +1257,9 @@ void BlockCacheTraceAnalyzer::WriteBlockReuseTimeline( if (j < start_time) { row += "100.0"; } else { - row += std::to_string(percent(reuse_table[start_time * reuse_vector_size + j], - reuse_table[start_time * reuse_vector_size + start_time])); + row += std::to_string( + percent(reuse_table[start_time * reuse_vector_size + j], + reuse_table[start_time * reuse_vector_size + start_time])); } } out << row << std::endl; @@ -1811,9 +1814,10 @@ void BlockCacheTraceAnalyzer::PrintDataBlockAccessStats() const { return; } // Use four decimal points. - uint64_t percent_referenced_for_existing_keys = (uint64_t)( - ((double)block.key_num_access_map.size() / (double)block.num_keys) * - 10000.0); + uint64_t percent_referenced_for_existing_keys = + (uint64_t)(((double)block.key_num_access_map.size() / + (double)block.num_keys) * + 10000.0); uint64_t percent_referenced_for_non_existing_keys = (uint64_t)(((double)block.non_exist_key_num_access_map.size() / (double)block.num_keys) * diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h index e5bc3da31f1..2f1ebd139ba 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.h +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.h @@ -106,7 +106,7 @@ struct BlockAccessInfo { ParsedInternalKey internal_key; Status s = ParseInternalKey(access.referenced_key, &internal_key, false /* log_err_key */); // TODO - assert(s.ok()); // TODO + assert(s.ok()); // TODO } } else { non_exist_key_num_access_map[access.referenced_key][access.caller]++; @@ -292,7 +292,8 @@ class BlockCacheTraceAnalyzer { // The file is named // "block_type_user_access_only_reuse_window_reuse_timeline". The file format // is start_time,0,1,...,N where N equals trace_duration / reuse_window. - void WriteBlockReuseTimeline(const uint64_t reuse_window, bool user_access_only, + void WriteBlockReuseTimeline(const uint64_t reuse_window, + bool user_access_only, TraceType block_type) const; // Write the Get spatical locality into csv files saved in 'output_dir'. 
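The WriteBlockReuseTimeline hunks above treat reuse_table as a flattened reuse_vector_size x reuse_vector_size matrix indexed by start_time * reuse_vector_size + j, and normalize each cell against the diagonal entry for its own start window. A minimal C++ sketch of that row emission follows; percent() and EmitReuseRow() are illustrative stand-ins for the analyzer's internals, not its exact API:

#include <cstdint>
#include <sstream>
#include <string>

// Hypothetical helper mirroring the analyzer's percent(part, whole).
static double percent(uint64_t part, uint64_t whole) {
  return whole == 0 ? 0.0
                    : 100.0 * static_cast<double>(part) /
                          static_cast<double>(whole);
}

// Emits one CSV row of the reuse timeline, as in the hunk above: windows
// before the row's start window print 100.0, later windows print the
// fraction of the start window's blocks that are accessed again.
std::string EmitReuseRow(const uint64_t* reuse_table,
                         uint64_t reuse_vector_size, uint64_t start_time) {
  std::ostringstream row;
  for (uint64_t j = 0; j < reuse_vector_size; j++) {
    if (j != 0) {
      row << ",";
    }
    if (j < start_time) {
      row << "100.0";
    } else {
      row << percent(reuse_table[start_time * reuse_vector_size + j],
                     reuse_table[start_time * reuse_vector_size + start_time]);
    }
  }
  return row.str();
}

The diagonal entry reuse_table[start_time * reuse_vector_size + start_time] serves as the denominator, so each row is normalized to the blocks alive at its own start window.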
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index c5d9b145215..60834480538 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -95,7 +95,8 @@ class BlockCacheTracerTest : public testing::Test { } void WriteBlockAccess(BlockCacheTraceWriter* writer, uint32_t from_key_id, - TraceType block_type, uint32_t nblocks) { + TraceType block_type, uint32_t nblocks, + bool is_referenced_key_null = false) { assert(writer); for (uint32_t i = 0; i < nblocks; i++) { uint32_t key_id = from_key_id + i; @@ -122,6 +123,11 @@ class BlockCacheTracerTest : public testing::Test { record.referenced_key = kRefKeyPrefix + std::to_string(key_id) + std::string(8, 0); record.referenced_key_exist_in_block = true; + if (is_referenced_key_null && + record.caller == TableReaderCaller::kUserMultiGet) { + record.referenced_key = ""; + record.get_from_user_specified_snapshot = true; + } record.num_keys_in_block = kNumKeysInBlock; ASSERT_OK(writer->WriteBlockAccess( record, record.block_key, record.cf_name, record.referenced_key)); @@ -717,6 +723,65 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { } } +TEST_F(BlockCacheTracerTest, MultiGetWithNullReferenceKey) { + { + // Generate a trace file containing MultiGet records with reference key + // being 0. + BlockCacheTraceWriterOptions trace_writer_opt; + std::unique_ptr<TraceWriter> trace_writer; + const auto& clock = env_->GetSystemClock(); + ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, + &trace_writer)); + std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer = + NewBlockCacheTraceWriter(clock.get(), trace_writer_opt, + std::move(trace_writer)); + ASSERT_NE(block_cache_trace_writer, nullptr); + ASSERT_OK(block_cache_trace_writer->WriteHeader()); + // Write blocks of different types. + + WriteBlockAccess(block_cache_trace_writer.get(), 0, + TraceType::kBlockTraceUncompressionDictBlock, 10, true); + WriteBlockAccess(block_cache_trace_writer.get(), 10, + TraceType::kBlockTraceDataBlock, 10, true); + WriteBlockAccess(block_cache_trace_writer.get(), 20, + TraceType::kBlockTraceFilterBlock, 10, true); + WriteBlockAccess(block_cache_trace_writer.get(), 30, + TraceType::kBlockTraceIndexBlock, 10, true); + WriteBlockAccess(block_cache_trace_writer.get(), 40, + TraceType::kBlockTraceRangeDeletionBlock, 10, true); + ASSERT_OK(env_->FileExists(trace_file_path_)); + } + + { + // Verify trace file is generated correctly. + std::unique_ptr<TraceReader> trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_options_, trace_file_path_, + &trace_reader)); + BlockCacheTraceReader reader(std::move(trace_reader)); + BlockCacheTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(static_cast<uint32_t>(kMajorVersion), + header.rocksdb_major_version); + ASSERT_EQ(static_cast<uint32_t>(kMinorVersion), + header.rocksdb_minor_version); + std::string human_readable_trace_file_path = + test_path_ + "/readable_block_cache_trace"; + // Read blocks. + BlockCacheTraceAnalyzer analyzer( + trace_file_path_, + /*output_dir=*/"", + /*human_readable_trace_file_path=*/human_readable_trace_file_path, + /*compute_reuse_distance=*/true, + /*mrc_only=*/false, + /*is_human_readable_trace_file=*/false, + /*cache_simulator=*/nullptr); + // The analyzer ends when it detects an incomplete access record. 
+ ASSERT_EQ(Status::Incomplete(""), analyzer.Analyze()); + + ASSERT_OK(env_->DeleteFile(human_readable_trace_file_path)); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index aaf21a0d7ec..e7efd4f3191 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb") +declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 739db1d4f26..dfe0f18cb9a 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -39,7 +39,6 @@ #include #include "cloud/aws/aws_file_system.h" -#include "cache/fast_lru_cache.h" #include "db/db_impl/db_impl.h" #include "db/malloc_stats.h" #include "db/version_set.h" @@ -179,8 +178,8 @@ IF_ROCKSDB_LITE("", " mode\n" "\tfilluniquerandomdeterministic -- write N values in a random" " key order and keep the shape of the LSM tree\n" - "\toverwrite -- overwrite N values in random key order in" - " async mode\n" + "\toverwrite -- overwrite N values in random key order in " + "async mode\n" "\tfillsync -- write N/1000 values in random key order in " "sync mode\n" "\tfill100K -- write N/1000 100K values in random order in" @@ -291,10 +290,12 @@ DEFINE_string(column_family_distribution, "", "and `num_hot_column_families=0`, a valid list could be " "\"10,20,30,40\"."); -DEFINE_int64(reads, -1, "Number of read operations to do. " +DEFINE_int64(reads, -1, + "Number of read operations to do. " "If negative, do FLAGS_num reads."); -DEFINE_int64(deletes, -1, "Number of delete operations to do. " +DEFINE_int64(deletes, -1, + "Number of delete operations to do. " "If negative, do FLAGS_num deletions."); DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); @@ -306,7 +307,8 @@ static int64_t seed_base; DEFINE_int32(threads, 1, "Number of concurrent threads to run."); -DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." +DEFINE_int32(duration, 0, + "Time in seconds for the random-ops tests to run." " When 0 then num & reads determine the test duration"); DEFINE_string(value_size_distribution_type, "fixed", @@ -359,8 +361,9 @@ DEFINE_int32(user_timestamp_size, 0, DEFINE_int32(num_multi_db, 0, "Number of DBs used in the benchmark. 
0 means single DB."); -DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink" - " to this fraction of their original size after compression"); +DEFINE_double(compression_ratio, 0.5, + "Arrange to generate values that shrink to this fraction of " + "their original size after compression"); DEFINE_double( overwrite_probability, 0.0, @@ -514,11 +517,12 @@ DEFINE_int32(max_background_compactions, " that can occur in parallel."); DEFINE_uint64(subcompactions, 1, + "For CompactRange, set max_subcompactions for each compaction " + "job in this CompactRange, for auto compactions, this is " "Maximum number of subcompactions to divide L0-L1 compactions " "into."); -static const bool FLAGS_subcompactions_dummy - __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, - &ValidateUint32Range); +static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = + RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); DEFINE_int32(max_background_flushes, ROCKSDB_NAMESPACE::Options().max_background_flushes, @@ -536,14 +540,16 @@ DEFINE_int32(compaction_pri, "priority of files to compaction: by size or by data age"); DEFINE_int32(universal_size_ratio, 0, - "Percentage flexibility while comparing file size" - " (for universal compaction only)."); + "Percentage flexibility while comparing file size " + "(for universal compaction only)."); -DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files in a" - " single compaction run (for universal compaction only)."); +DEFINE_int32(universal_min_merge_width, 0, + "The minimum number of files in a single compaction run " + "(for universal compaction only)."); -DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact" - " in universal style compaction"); +DEFINE_int32(universal_max_merge_width, 0, + "The max number of files to compact in universal style " + "compaction"); DEFINE_int32(universal_max_size_amplification_percent, 0, "The max size amplification for universal style compaction"); @@ -749,9 +755,10 @@ DEFINE_bool(whole_key_filtering, ROCKSDB_NAMESPACE::BlockBasedTableOptions().whole_key_filtering, "Use whole keys (in addition to prefixes) in SST bloom filter."); -DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing" - " database. If you set this flag and also specify a benchmark that" - " wants a fresh database, that benchmark will fail."); +DEFINE_bool(use_existing_db, false, + "If true, do not destroy the existing database. 
If you set this " + "flag and also specify a benchmark that wants a fresh database, " + "that benchmark will fail."); DEFINE_bool(use_existing_keys, false, "If true, uses existing keys in the DB, " @@ -789,16 +796,15 @@ DEFINE_bool(use_keep_filter, false, "Whether to use a noop compaction filter"); static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) { if (value >= 20) { - fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", - flagname, value); + fprintf(stderr, "Invalid value for --%s: %d, must be < 20\n", flagname, + value); return false; } return true; } DEFINE_bool(verify_checksum, true, - "Verify checksum for every block read" - " from storage"); + "Verify checksum for every block read from storage"); DEFINE_int32(checksum_type, ROCKSDB_NAMESPACE::BlockBasedTableOptions().checksum, @@ -810,10 +816,11 @@ DEFINE_int32(stats_level, ROCKSDB_NAMESPACE::StatsLevel::kExceptDetailedTimers, DEFINE_string(statistics_string, "", "Serialized statistics string"); static class std::shared_ptr dbstats; -DEFINE_int64(writes, -1, "Number of write operations to do. If negative, do" - " --num reads."); +DEFINE_int64(writes, -1, + "Number of write operations to do. If negative, do --num reads."); -DEFINE_bool(finish_after_writes, false, "Write thread terminates after all writes are finished"); +DEFINE_bool(finish_after_writes, false, + "Write thread terminates after all writes are finished"); DEFINE_bool(sync, false, "Sync all writes to disk"); @@ -878,25 +885,28 @@ DEFINE_uint64(periodic_compaction_seconds, DEFINE_uint64(ttl_seconds, ROCKSDB_NAMESPACE::Options().ttl, "Set options.ttl"); static bool ValidateInt32Percent(const char* flagname, int32_t value) { - if (value <= 0 || value>=100) { - fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", - flagname, value); + if (value <= 0 || value >= 100) { + fprintf(stderr, "Invalid value for --%s: %d, 0< pct <100 \n", flagname, + value); return false; } return true; } -DEFINE_int32(readwritepercent, 90, "Ratio of reads to reads/writes (expressed" - " as percentage) for the ReadRandomWriteRandom workload. The " - "default value 90 means 90% operations out of all reads and writes" - " operations are reads. In other words, 9 gets for every 1 put."); - -DEFINE_int32(mergereadpercent, 70, "Ratio of merges to merges&reads (expressed" - " as percentage) for the ReadRandomMergeRandom workload. The" - " default value 70 means 70% out of all read and merge operations" - " are merges. In other words, 7 merges for every 3 gets."); - -DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" - "deletes (used in RandomWithVerify only). RandomWithVerify " +DEFINE_int32(readwritepercent, 90, + "Ratio of reads to reads/writes (expressed as percentage) for " + "the ReadRandomWriteRandom workload. The default value 90 means " + "90% operations out of all reads and writes operations are " + "reads. In other words, 9 gets for every 1 put."); + +DEFINE_int32(mergereadpercent, 70, + "Ratio of merges to merges&reads (expressed as percentage) for " + "the ReadRandomMergeRandom workload. The default value 70 means " + "70% out of all read and merge operations are merges. In other " + "words, 7 merges for every 3 gets."); + +DEFINE_int32(deletepercent, 2, + "Percentage of deletes out of reads/writes/deletes (used in " + "RandomWithVerify only). 
RandomWithVerify " "calculates writepercent as (100 - FLAGS_readwritepercent - " "deletepercent), so deletepercent must be smaller than (100 - " "FLAGS_readwritepercent)"); @@ -1308,7 +1318,8 @@ DEFINE_int32(compression_zstd_max_train_bytes, "Maximum size of training data passed to zstd's dictionary " "trainer."); -DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" +DEFINE_int32(min_level_to_compress, -1, + "If non-negative, compression starts" " from this level. Levels with number < min_level_to_compress are" " not compressed. Otherwise, apply compression_type to " "all levels."); @@ -1351,8 +1362,8 @@ DEFINE_bool(keep_local_sst_files, true, #endif // ROCKSDB_LITE DEFINE_string(simulate_hybrid_fs_file, "", "File for Store Metadata for Simulate hybrid FS. Empty means " - "disable the feature. Now, if it is set, " - "last_level_temperature is set to kWarm."); + "disable the feature. Now, if it is set, last_level_temperature " + "is set to kWarm."); DEFINE_int32(simulate_hybrid_hdd_multipliers, 1, "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs " "are simulated."); @@ -1369,18 +1380,21 @@ static std::shared_ptr env_guard; static ROCKSDB_NAMESPACE::Env* FLAGS_env = ROCKSDB_NAMESPACE::Env::Default(); -DEFINE_int64(stats_interval, 0, "Stats are reported every N operations when " - "this is greater than zero. When 0 the interval grows over time."); +DEFINE_int64(stats_interval, 0, + "Stats are reported every N operations when this is greater than " + "zero. When 0 the interval grows over time."); -DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This " - "overrides stats_interval when both are > 0."); +DEFINE_int64(stats_interval_seconds, 0, + "Report stats every N seconds. This overrides stats_interval when" + " both are > 0."); -DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when" - " this is greater than 0."); +DEFINE_int32(stats_per_interval, 0, + "Reports additional stats per interval when this is greater than " + "0."); DEFINE_uint64(slow_usecs, 1000000, - "A message is printed for operations that " - "take at least this many microseconds."); + "A message is printed for operations that take at least this " + "many microseconds."); DEFINE_int64(report_interval_seconds, 0, "If greater than zero, it will write simple stats in CSV format " @@ -1450,24 +1464,19 @@ DEFINE_bool(rate_limiter_auto_tuned, false, "Enable dynamic adjustment of rate limit according to demand for " "background I/O"); +DEFINE_bool(sine_write_rate, false, "Use a sine wave write_rate_limit"); -DEFINE_bool(sine_write_rate, false, - "Use a sine wave write_rate_limit"); - -DEFINE_uint64(sine_write_rate_interval_milliseconds, 10000, - "Interval of which the sine wave write_rate_limit is recalculated"); +DEFINE_uint64( + sine_write_rate_interval_milliseconds, 10000, + "Interval of which the sine wave write_rate_limit is recalculated"); -DEFINE_double(sine_a, 1, - "A in f(x) = A sin(bx + c) + d"); +DEFINE_double(sine_a, 1, "A in f(x) = A sin(bx + c) + d"); -DEFINE_double(sine_b, 1, - "B in f(x) = A sin(bx + c) + d"); +DEFINE_double(sine_b, 1, "B in f(x) = A sin(bx + c) + d"); -DEFINE_double(sine_c, 0, - "C in f(x) = A sin(bx + c) + d"); +DEFINE_double(sine_c, 0, "C in f(x) = A sin(bx + c) + d"); -DEFINE_double(sine_d, 1, - "D in f(x) = A sin(bx + c) + d"); +DEFINE_double(sine_d, 1, "D in f(x) = A sin(bx + c) + d"); DEFINE_bool(rate_limit_bg_reads, false, "Use options.rate_limiter on compaction reads"); @@ -1557,8 +1566,8 @@ 
DEFINE_bool(print_malloc_stats, false, DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); -DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files" - " in MB."); +DEFINE_uint64(wal_size_limit_MB, 0, + "Set the size limit for the WAL Files in MB."); DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size"); DEFINE_bool(mmap_read, ROCKSDB_NAMESPACE::Options().allow_mmap_reads, @@ -1625,11 +1634,12 @@ DEFINE_int32(num_deletion_threads, 1, "Number of threads to do deletion (used in TimeSeries and delete " "expire_style only)."); -DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge" - " operations on a key in the memtable"); +DEFINE_int32(max_successive_merges, 0, + "Maximum number of successive merge operations on a key in the " + "memtable"); static bool ValidatePrefixSize(const char* flagname, int32_t value) { - if (value < 0 || value>=2000000000) { + if (value < 0 || value >= 2000000000) { fprintf(stderr, "Invalid value for --%s: %d. 0<= PrefixSize <=2000000000\n", flagname, value); return false; @@ -1637,11 +1647,12 @@ static bool ValidatePrefixSize(const char* flagname, int32_t value) { return true; } -DEFINE_int32(prefix_size, 0, "control the prefix size for HashSkipList and " - "plain table"); -DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated " - "per prefix, 0 means no special handling of the prefix, " - "i.e. use the prefix comes with the generated random number."); +DEFINE_int32(prefix_size, 0, + "control the prefix size for HashSkipList and plain table"); +DEFINE_int64(keys_per_prefix, 0, + "control average number of keys generated per prefix, 0 means no " + "special handling of the prefix, i.e. use the prefix comes with " + "the generated random number."); DEFINE_bool(total_order_seek, false, "Enable total order seek regardless of index format."); DEFINE_bool(prefix_same_as_start, false, @@ -1653,13 +1664,13 @@ DEFINE_bool( DEFINE_int32(memtable_insert_with_hint_prefix_size, 0, "If non-zero, enable " "memtable insert with hint with the given prefix size."); -DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction " - "threads' IO priority"); -DEFINE_bool(enable_cpu_prio, false, "Lower the background flush/compaction " - "threads' CPU priority"); -DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo " - "table becomes an identity function. This is only valid when key " - "is 8 bytes"); +DEFINE_bool(enable_io_prio, false, + "Lower the background flush/compaction threads' IO priority"); +DEFINE_bool(enable_cpu_prio, false, + "Lower the background flush/compaction threads' CPU priority"); +DEFINE_bool(identity_as_first_hash, false, + "the first hash function of cuckoo table becomes an identity " + "function. 
This is only valid when key is 8 bytes"); DEFINE_bool(dump_malloc_stats, true, "Dump malloc stats in LOG "); DEFINE_uint64(stats_dump_period_sec, ROCKSDB_NAMESPACE::Options().stats_dump_period_sec, @@ -1753,22 +1764,23 @@ static enum RepFactory StringToRepFactory(const char* ctype) { static enum RepFactory FLAGS_rep_factory; DEFINE_string(memtablerep, "skip_list", ""); DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count"); -DEFINE_bool(use_plain_table, false, "if use plain table " - "instead of block-based table format"); +DEFINE_bool(use_plain_table, false, + "if use plain table instead of block-based table format"); DEFINE_bool(use_cuckoo_table, false, "if use cuckoo table format"); DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); -DEFINE_bool(use_hash_search, false, "if use kHashSearch " - "instead of kBinarySearch. " +DEFINE_bool(use_hash_search, false, + "if use kHashSearch instead of kBinarySearch. " "This is valid if only we use BlockTable"); -DEFINE_string(merge_operator, "", "The merge operator to use with the database." +DEFINE_string(merge_operator, "", + "The merge operator to use with the database." "If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" " utilities/merge_operators.h"); -DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try " - "linear search first for this many steps from the previous " - "position"); -DEFINE_bool(report_file_operations, false, "if report number of file " - "operations"); +DEFINE_int32(skip_list_lookahead, 0, + "Used with skip_list memtablerep; try linear search first for " + "this many steps from the previous position"); +DEFINE_bool(report_file_operations, false, + "if report number of file operations"); DEFINE_bool(report_open_timing, false, "if report open timing"); DEFINE_int32(readahead_size, 0, "Iterator readahead size"); @@ -1804,9 +1816,9 @@ DEFINE_bool(allow_data_in_errors, static const bool FLAGS_deletepercent_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_deletepercent, &ValidateInt32Percent); -static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((__unused__)) = - RegisterFlagValidator(&FLAGS_table_cache_numshardbits, - &ValidateTableCacheNumshardbits); +static const bool FLAGS_table_cache_numshardbits_dummy + __attribute__((__unused__)) = RegisterFlagValidator( + &FLAGS_table_cache_numshardbits, &ValidateTableCacheNumshardbits); DEFINE_uint32(write_batch_protection_bytes_per_key, 0, "Size of per-key-value checksum in each write batch. 
Currently " @@ -1855,11 +1867,7 @@ static Status CreateMemTableRepFactory( } // namespace -enum DistributionType : unsigned char { - kFixed = 0, - kUniform, - kNormal -}; +enum DistributionType : unsigned char { kFixed = 0, kUniform, kNormal }; static enum DistributionType FLAGS_value_size_distribution_type_e = kFixed; @@ -1891,33 +1899,27 @@ class BaseDistribution { } return val; } + private: virtual unsigned int Get() = 0; - virtual bool NeedTruncate() { - return true; - } + virtual bool NeedTruncate() { return true; } unsigned int min_value_size_; unsigned int max_value_size_; }; -class FixedDistribution : public BaseDistribution -{ +class FixedDistribution : public BaseDistribution { public: - FixedDistribution(unsigned int size) : - BaseDistribution(size, size), - size_(size) {} + FixedDistribution(unsigned int size) + : BaseDistribution(size, size), size_(size) {} + private: - virtual unsigned int Get() override { - return size_; - } - virtual bool NeedTruncate() override { - return false; - } + virtual unsigned int Get() override { return size_; } + virtual bool NeedTruncate() override { return false; } unsigned int size_; }; -class NormalDistribution - : public BaseDistribution, public std::normal_distribution { +class NormalDistribution : public BaseDistribution, + public std::normal_distribution { public: NormalDistribution(unsigned int _min, unsigned int _max) : BaseDistribution(_min, _max), @@ -1935,9 +1937,8 @@ class NormalDistribution std::mt19937 gen_; }; -class UniformDistribution - : public BaseDistribution, - public std::uniform_int_distribution { +class UniformDistribution : public BaseDistribution, + public std::uniform_int_distribution { public: UniformDistribution(unsigned int _min, unsigned int _max) : BaseDistribution(_min, _max), @@ -1945,12 +1946,8 @@ class UniformDistribution gen_(rd_()) {} private: - virtual unsigned int Get() override { - return (*this)(gen_); - } - virtual bool NeedTruncate() override { - return false; - } + virtual unsigned int Get() override { return (*this)(gen_); } + virtual bool NeedTruncate() override { return false; } std::random_device rd_; std::mt19937 gen_; }; @@ -1963,7 +1960,6 @@ class RandomGenerator { std::unique_ptr dist_; public: - RandomGenerator() { auto max_value_size = FLAGS_value_size_max; switch (FLAGS_value_size_distribution_type_e) { @@ -1972,8 +1968,8 @@ class RandomGenerator { FLAGS_value_size_max)); break; case kNormal: - dist_.reset(new NormalDistribution(FLAGS_value_size_min, - FLAGS_value_size_max)); + dist_.reset( + new NormalDistribution(FLAGS_value_size_min, FLAGS_value_size_max)); break; case kFixed: default: @@ -2022,7 +2018,7 @@ struct DBWithColumnFamilies { DB* db; #ifndef ROCKSDB_LITE OptimisticTransactionDB* opt_txn_db; -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE std::atomic num_created; // Need to be updated after all the // new entries in cfh are set. size_t num_hot; // Number of column families to be queried at each moment. 
@@ -2035,7 +2031,8 @@ struct DBWithColumnFamilies { DBWithColumnFamilies() : db(nullptr) #ifndef ROCKSDB_LITE - , opt_txn_db(nullptr) + , + opt_txn_db(nullptr) #endif // ROCKSDB_LITE { cfh.clear(); @@ -2218,19 +2215,12 @@ enum OperationType : unsigned char { }; static std::unordered_map> - OperationTypeString = { - {kRead, "read"}, - {kWrite, "write"}, - {kDelete, "delete"}, - {kSeek, "seek"}, - {kMerge, "merge"}, - {kUpdate, "update"}, - {kCompress, "compress"}, - {kCompress, "uncompress"}, - {kCrc, "crc"}, - {kHash, "hash"}, - {kOthers, "op"} -}; + OperationTypeString = {{kRead, "read"}, {kWrite, "write"}, + {kDelete, "delete"}, {kSeek, "seek"}, + {kMerge, "merge"}, {kUpdate, "update"}, + {kCompress, "compress"}, {kCompress, "uncompress"}, + {kCrc, "crc"}, {kHash, "hash"}, + {kOthers, "op"}}; class CombinedStats; class Stats { @@ -2248,7 +2238,8 @@ class Stats { uint64_t last_op_finish_; uint64_t last_report_finish_; std::unordered_map, - std::hash> hist_; + std::hash> + hist_; std::string message_; bool exclude_from_merge_; ReporterAgent* reporter_agent_; // does not own @@ -2280,15 +2271,14 @@ class Stats { } void Merge(const Stats& other) { - if (other.exclude_from_merge_) - return; + if (other.exclude_from_merge_) return; for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) { auto this_it = hist_.find(it->first); if (this_it != hist_.end()) { this_it->second->Merge(*(other.hist_.at(it->first))); } else { - hist_.insert({ it->first, it->second }); + hist_.insert({it->first, it->second}); } } @@ -2307,9 +2297,7 @@ class Stats { seconds_ = (finish_ - start_) * 1e-6; } - void AddMessage(Slice msg) { - AppendWithSpace(&message_, msg); - } + void AddMessage(Slice msg) { AppendWithSpace(&message_, msg); } void SetId(int id) { id_ = id; } void SetExcludeFromMerge() { exclude_from_merge_ = true; } @@ -2318,27 +2306,27 @@ class Stats { std::vector thread_list; FLAGS_env->GetThreadList(&thread_list); - fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", - "ThreadID", "ThreadType", "cfName", "Operation", - "ElapsedTime", "Stage", "State", "OperationProperties"); + fprintf(stderr, "\n%18s %10s %12s %20s %13s %45s %12s %s\n", "ThreadID", + "ThreadType", "cfName", "Operation", "ElapsedTime", "Stage", + "State", "OperationProperties"); int64_t current_time = 0; clock_->GetCurrentTime(¤t_time).PermitUncheckedError(); for (auto ts : thread_list) { fprintf(stderr, "%18" PRIu64 " %10s %12s %20s %13s %45s %12s", - ts.thread_id, - ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(), - ts.cf_name.c_str(), - ThreadStatus::GetOperationName(ts.operation_type).c_str(), - ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(), - ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(), - ThreadStatus::GetStateName(ts.state_type).c_str()); + ts.thread_id, + ThreadStatus::GetThreadTypeName(ts.thread_type).c_str(), + ts.cf_name.c_str(), + ThreadStatus::GetOperationName(ts.operation_type).c_str(), + ThreadStatus::MicrosToString(ts.op_elapsed_micros).c_str(), + ThreadStatus::GetOperationStageName(ts.operation_stage).c_str(), + ThreadStatus::GetStateName(ts.state_type).c_str()); auto op_properties = ThreadStatus::InterpretOperationProperties( ts.operation_type, ts.op_properties); for (const auto& op_prop : op_properties) { - fprintf(stderr, " %s %" PRIu64" |", - op_prop.first.c_str(), op_prop.second); + fprintf(stderr, " %s %" PRIu64 " |", op_prop.first.c_str(), + op_prop.second); } fprintf(stderr, "\n"); } @@ -2346,13 +2334,9 @@ class Stats { void ResetSineInterval() { 
sine_interval_ = clock_->NowMicros(); } - uint64_t GetSineInterval() { - return sine_interval_; - } + uint64_t GetSineInterval() { return sine_interval_; } - uint64_t GetStart() { - return start_; - } + uint64_t GetStart() { return start_; } void ResetLastOpTime() { // Set to now to avoid latency from calls to SleepForMicroseconds. @@ -2368,8 +2352,7 @@ class Stats { uint64_t now = clock_->NowMicros(); uint64_t micros = now - last_op_finish_; - if (hist_.find(op_type) == hist_.end()) - { + if (hist_.find(op_type) == hist_.end()) { auto hist_temp = std::make_shared(); hist_.insert({op_type, std::move(hist_temp)}); } @@ -2385,13 +2368,20 @@ class Stats { done_ += num_ops; if (done_ >= next_report_ && FLAGS_progress_reports) { if (!FLAGS_stats_interval) { - if (next_report_ < 1000) next_report_ += 100; - else if (next_report_ < 5000) next_report_ += 500; - else if (next_report_ < 10000) next_report_ += 1000; - else if (next_report_ < 50000) next_report_ += 5000; - else if (next_report_ < 100000) next_report_ += 10000; - else if (next_report_ < 500000) next_report_ += 50000; - else next_report_ += 100000; + if (next_report_ < 1000) + next_report_ += 100; + else if (next_report_ < 5000) + next_report_ += 500; + else if (next_report_ < 10000) + next_report_ += 1000; + else if (next_report_ < 50000) + next_report_ += 5000; + else if (next_report_ < 100000) + next_report_ += 10000; + else if (next_report_ < 500000) + next_report_ += 50000; + else + next_report_ += 100000; fprintf(stderr, "... finished %" PRIu64 " ops%30s\r", done_, ""); } else { uint64_t now = clock_->NowMicros(); @@ -2477,9 +2467,7 @@ class Stats { } } - void AddBytes(int64_t n) { - bytes_ += n; - } + void AddBytes(int64_t n) { bytes_ += n; } void Report(const Slice& name) { // Pretend at least one op was done in case we are running a benchmark @@ -2497,7 +2485,7 @@ class Stats { extra = rate; } AppendWithSpace(&extra, message_); - double throughput = (double)done_/elapsed; + double throughput = (double)done_ / elapsed; fprintf(stdout, "%-12s : %11.3f micros/op %ld ops/sec %.3f seconds %" PRIu64 @@ -2722,13 +2710,13 @@ struct SharedState { long num_done; bool start; - SharedState() : cv(&mu), perf_level(FLAGS_perf_level) { } + SharedState() : cv(&mu), perf_level(FLAGS_perf_level) {} }; // Per-thread state for concurrent executions of the same benchmark. struct ThreadState { - int tid; // 0..n-1 when running in n threads - Random64 rand; // Has different seeds for different threads + int tid; // 0..n-1 when running in n threads + Random64 rand; // Has different seeds for different threads Stats stats; SharedState* shared; @@ -2740,7 +2728,7 @@ class Duration { public: Duration(uint64_t max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) { max_seconds_ = max_seconds; - max_ops_= max_ops; + max_ops_ = max_ops; ops_per_stage_ = (ops_per_stage > 0) ? 
ops_per_stage : max_ops; ops_ = 0; start_at_ = FLAGS_env->NowMicros(); @@ -2749,7 +2737,7 @@ class Duration { int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; } bool Done(int64_t increment) { - if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops + if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops ops_ += increment; if (max_seconds_) { @@ -2806,7 +2794,7 @@ class Benchmark { int64_t readwrites_; int64_t merge_keys_; bool report_file_operations_; - bool use_blob_db_; // Stacked BlobDB + bool use_blob_db_; // Stacked BlobDB bool read_operands_; // read via GetMergeOperands() std::vector keys_; @@ -2890,28 +2878,30 @@ class Benchmark { FLAGS_key_size, FLAGS_user_timestamp_size); auto avg_value_size = FLAGS_value_size; if (FLAGS_value_size_distribution_type_e == kFixed) { - fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", + fprintf(stdout, + "Values: %d bytes each (%d bytes after compression)\n", avg_value_size, static_cast(avg_value_size * FLAGS_compression_ratio + 0.5)); } else { avg_value_size = (FLAGS_value_size_min + FLAGS_value_size_max) / 2; - fprintf(stdout, "Values: %d avg bytes each (%d bytes after compression)\n", + fprintf(stdout, + "Values: %d avg bytes each (%d bytes after compression)\n", avg_value_size, static_cast(avg_value_size * FLAGS_compression_ratio + 0.5)); fprintf(stdout, "Values Distribution: %s (min: %d, max: %d)\n", - FLAGS_value_size_distribution_type.c_str(), - FLAGS_value_size_min, FLAGS_value_size_max); + FLAGS_value_size_distribution_type.c_str(), FLAGS_value_size_min, + FLAGS_value_size_max); } fprintf(stdout, "Entries: %" PRIu64 "\n", num_); fprintf(stdout, "Prefix: %d bytes\n", FLAGS_prefix_size); fprintf(stdout, "Keys per prefix: %" PRIu64 "\n", keys_per_prefix_); fprintf(stdout, "RawSize: %.1f MB (estimated)\n", - ((static_cast(FLAGS_key_size + avg_value_size) * num_) - / 1048576.0)); - fprintf(stdout, "FileSize: %.1f MB (estimated)\n", - (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) - * num_) - / 1048576.0)); + ((static_cast(FLAGS_key_size + avg_value_size) * num_) / + 1048576.0)); + fprintf( + stdout, "FileSize: %.1f MB (estimated)\n", + (((FLAGS_key_size + avg_value_size * FLAGS_compression_ratio) * num_) / + 1048576.0)); fprintf(stdout, "Write rate: %" PRIu64 " bytes/second\n", FLAGS_benchmark_write_rate_limit); fprintf(stdout, "Read rate: %" PRIu64 " ops/second\n", @@ -2945,9 +2935,9 @@ class Benchmark { void PrintWarnings(const char* compression) { #if defined(__GNUC__) && !defined(__OPTIMIZE__) - fprintf(stdout, - "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" - ); + fprintf( + stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); #endif #ifndef NDEBUG fprintf(stdout, @@ -2983,7 +2973,7 @@ class Benchmark { start++; } unsigned int limit = static_cast(s.size()); - while (limit > start && isspace(s[limit-1])) { + while (limit > start && isspace(s[limit - 1])) { limit--; } return Slice(s.data() + start, limit - start); @@ -3146,11 +3136,6 @@ class Benchmark { FLAGS_block_size /*estimated_entry_charge*/, FLAGS_cache_numshardbits) .MakeSharedCache(); - } else if (FLAGS_cache_type == "fast_lru_cache") { - return NewFastLRUCache(static_cast(capacity), FLAGS_block_size, - FLAGS_cache_numshardbits, - false /*strict_capacity_limit*/, - kDefaultCacheMetadataChargePolicy); } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts( static_cast(capacity), FLAGS_cache_numshardbits, @@ -3589,6 +3574,11 @@ 
class Benchmark { fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch_); method = &Benchmark::MultiReadRandom; + } else if (name == "multireadwhilewriting") { + fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", + entries_per_batch_); + num_threads++; + method = &Benchmark::MultiReadWhileWriting; } else if (name == "approximatesizerandom") { fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch_); @@ -3740,7 +3730,7 @@ class Benchmark { method = &Benchmark::VerifyChecksum; } else if (name == "verifyfilechecksums") { method = &Benchmark::VerifyFileChecksums; -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE } else if (name == "readrandomoperands") { read_operands_ = true; method = &Benchmark::ReadRandom; @@ -3958,7 +3948,7 @@ class Benchmark { } } - SetPerfLevel(static_cast (shared->perf_level)); + SetPerfLevel(static_cast(shared->perf_level)); perf_context.EnablePerLevelPerfContext(); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); @@ -4060,7 +4050,7 @@ class Benchmark { template static inline void ChecksumBenchmark(FnType fn, ThreadState* thread, Args... args) { - const int size = FLAGS_block_size; // use --block_size option for db_bench + const int size = FLAGS_block_size; // use --block_size option for db_bench std::string labels = "(" + std::to_string(FLAGS_block_size) + " per op)"; const char* label = labels.c_str(); @@ -4099,7 +4089,7 @@ class Benchmark { int dummy; std::atomic ap(&dummy); int count = 0; - void *ptr = nullptr; + void* ptr = nullptr; thread->stats.AddMessage("(each op is 1000 loads)"); while (count < 100000) { for (int i = 0; i < 1000; i++) { @@ -4111,7 +4101,7 @@ class Benchmark { if (ptr == nullptr) exit(1); // Disable unused variable warning. } - void Compress(ThreadState *thread) { + void Compress(ThreadState* thread) { RandomGenerator gen; Slice input = gen.Generate(FLAGS_block_size); int64_t bytes = 0; @@ -4143,7 +4133,7 @@ class Benchmark { } } - void Uncompress(ThreadState *thread) { + void Uncompress(ThreadState* thread) { RandomGenerator gen; Slice input = gen.Generate(FLAGS_block_size); std::string compressed; @@ -4245,7 +4235,7 @@ class Benchmark { options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = - FLAGS_min_write_buffer_number_to_merge; + FLAGS_min_write_buffer_number_to_merge; options.max_write_buffer_number_to_maintain = FLAGS_max_write_buffer_number_to_maintain; options.max_write_buffer_size_to_maintain = @@ -4313,8 +4303,9 @@ class Benchmark { } else if ((FLAGS_prefix_size == 0) && (options.memtable_factory->IsInstanceOf("prefix_hash") || options.memtable_factory->IsInstanceOf("hash_linkedlist"))) { - fprintf(stderr, "prefix_size should be non-zero if PrefixHash or " - "HashLinkedList memtablerep is used\n"); + fprintf(stderr, + "prefix_size should be non-zero if PrefixHash or " + "HashLinkedList memtablerep is used\n"); exit(1); } if (FLAGS_use_plain_table) { @@ -4355,8 +4346,8 @@ class Benchmark { ROCKSDB_NAMESPACE::CuckooTableOptions table_options; table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; - options.table_factory = std::shared_ptr( - NewCuckooTableFactory(table_options)); + options.table_factory = + std::shared_ptr(NewCuckooTableFactory(table_options)); #else fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); exit(1); @@ -4368,7 +4359,7 @@ class Benchmark { if (FLAGS_use_hash_search) { 
if (FLAGS_prefix_size == 0) { fprintf(stderr, - "prefix_size not assigned when enable use_hash_search \n"); + "prefix_size not assigned when enable use_hash_search \n"); exit(1); } block_based_options.index_type = BlockBasedTableOptions::kHashSearch; @@ -4603,13 +4594,13 @@ class Benchmark { exit(1); } options.max_bytes_for_level_multiplier_additional = - FLAGS_max_bytes_for_level_multiplier_additional_v; + FLAGS_max_bytes_for_level_multiplier_additional_v; } options.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger; options.level0_file_num_compaction_trigger = FLAGS_level0_file_num_compaction_trigger; options.level0_slowdown_writes_trigger = - FLAGS_level0_slowdown_writes_trigger; + FLAGS_level0_slowdown_writes_trigger; options.compression = FLAGS_compression_type_e; if (FLAGS_simulate_hybrid_fs_file != "") { options.bottommost_temperature = Temperature::kWarm; @@ -4629,8 +4620,7 @@ class Benchmark { for (int i = 0; i < FLAGS_min_level_to_compress; i++) { options.compression_per_level[i] = kNoCompression; } - for (int i = FLAGS_min_level_to_compress; - i < FLAGS_num_levels; i++) { + for (int i = FLAGS_min_level_to_compress; i < FLAGS_num_levels; i++) { options.compression_per_level[i] = FLAGS_compression_type_e; } } @@ -4684,23 +4674,23 @@ class Benchmark { // set universal style compaction configurations, if applicable if (FLAGS_universal_size_ratio != 0) { options.compaction_options_universal.size_ratio = - FLAGS_universal_size_ratio; + FLAGS_universal_size_ratio; } if (FLAGS_universal_min_merge_width != 0) { options.compaction_options_universal.min_merge_width = - FLAGS_universal_min_merge_width; + FLAGS_universal_min_merge_width; } if (FLAGS_universal_max_merge_width != 0) { options.compaction_options_universal.max_merge_width = - FLAGS_universal_max_merge_width; + FLAGS_universal_max_merge_width; } if (FLAGS_universal_max_size_amplification_percent != 0) { options.compaction_options_universal.max_size_amplification_percent = - FLAGS_universal_max_size_amplification_percent; + FLAGS_universal_max_size_amplification_percent; } if (FLAGS_universal_compression_size_percent != -1) { options.compaction_options_universal.compression_size_percent = - FLAGS_universal_compression_size_percent; + FLAGS_universal_compression_size_percent; } options.compaction_options_universal.allow_trivial_move = FLAGS_universal_allow_trivial_move; @@ -4890,7 +4880,7 @@ class Benchmark { } void OpenDb(Options options, const std::string& db_name, - DBWithColumnFamilies* db) { + DBWithColumnFamilies* db) { uint64_t open_start = FLAGS_report_open_timing ? FLAGS_env->NowNanos() : 0; Status s; // Open with column families if necessary. 
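Several of the re-indented assignments above wire db_bench flags into universal compaction. For readers unfamiliar with that options block, a hedged usage sketch against RocksDB's public API (the numeric values are arbitrary examples, not recommended settings):

    #include "rocksdb/options.h"

    ROCKSDB_NAMESPACE::Options MakeUniversalOptions() {
      ROCKSDB_NAMESPACE::Options options;
      options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleUniversal;
      // Knobs mirrored from the flags handled above.
      options.compaction_options_universal.size_ratio = 1;
      options.compaction_options_universal.min_merge_width = 2;
      options.compaction_options_universal.max_merge_width = 10;
      options.compaction_options_universal.max_size_amplification_percent = 200;
      options.compaction_options_universal.allow_trivial_move = true;
      return options;
    }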
@@ -4905,7 +4895,7 @@ class Benchmark { std::vector column_families; for (size_t i = 0; i < num_hot; i++) { column_families.push_back(ColumnFamilyDescriptor( - ColumnFamilyName(i), ColumnFamilyOptions(options))); + ColumnFamilyName(i), ColumnFamilyOptions(options))); } std::vector cfh_idx_to_prob; if (!FLAGS_column_family_distribution.empty()) { @@ -4931,8 +4921,8 @@ class Benchmark { } #ifndef ROCKSDB_LITE if (FLAGS_readonly) { - s = DB::OpenForReadOnly(options, db_name, column_families, - &db->cfh, &db->db); + s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh, + &db->db); } else if (FLAGS_optimistic_transaction_db) { s = OptimisticTransactionDB::Open(options, db_name, column_families, &db->cfh, &db->opt_txn_db); @@ -5043,9 +5033,7 @@ class Benchmark { } } - enum WriteMode { - RANDOM, SEQUENTIAL, UNIQUE_RANDOM - }; + enum WriteMode { RANDOM, SEQUENTIAL, UNIQUE_RANDOM }; void WriteSeqDeterministic(ThreadState* thread) { DoDeterministicCompact(thread, open_options_.compaction_style, SEQUENTIAL); @@ -5056,13 +5044,9 @@ class Benchmark { UNIQUE_RANDOM); } - void WriteSeq(ThreadState* thread) { - DoWrite(thread, SEQUENTIAL); - } + void WriteSeq(ThreadState* thread) { DoWrite(thread, SEQUENTIAL); } - void WriteRandom(ThreadState* thread) { - DoWrite(thread, RANDOM); - } + void WriteRandom(ThreadState* thread) { DoWrite(thread, RANDOM); } void WriteUniqueRandom(ThreadState* thread) { DoWrite(thread, UNIQUE_RANDOM); @@ -5116,9 +5100,7 @@ class Benchmark { std::vector values_; }; - DB* SelectDB(ThreadState* thread) { - return SelectDBWithCfh(thread)->db; - } + DB* SelectDB(ThreadState* thread) { return SelectDBWithCfh(thread)->db; } DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) { return SelectDBWithCfh(thread->rand.Next()); @@ -5127,13 +5109,13 @@ class Benchmark { DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) { if (db_.db != nullptr) { return &db_; - } else { + } else { return &multi_dbs_[rand_int % multi_dbs_.size()]; } } double SineRate(double x) { - return FLAGS_sine_a*sin((FLAGS_sine_b*x) + FLAGS_sine_c) + FLAGS_sine_d; + return FLAGS_sine_a * sin((FLAGS_sine_b * x) + FLAGS_sine_c) + FLAGS_sine_d; } void DoWrite(ThreadState* thread, WriteMode write_mode) { @@ -5437,8 +5419,7 @@ class Benchmark { // We use same rand_num as seed for key and column family so that we // can deterministically find the cfh corresponding to a particular // key while reading the key. - batch.Put(db_with_cfh->GetCfh(rand_num), key, - val); + batch.Put(db_with_cfh->GetCfh(rand_num), key, val); } batch_bytes += val.size() + key_size_ + user_timestamp_size_; bytes += val.size() + key_size_ + user_timestamp_size_; @@ -5510,8 +5491,8 @@ class Benchmark { } if (thread->shared->write_rate_limiter.get() != nullptr) { thread->shared->write_rate_limiter->Request( - batch_bytes, Env::IO_HIGH, - nullptr /* stats */, RateLimiter::OpType::kWrite); + batch_bytes, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); // Set time at which last op finished to Now() to hide latency and // sleep from rate limiter. Also, do the check once per batch, not // once per write. 
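SineRate, reformatted above, evaluates rate(x) = a*sin(b*x + c) + d with x in seconds since the run started; d is the midline and a the swing, so keeping d > a keeps the rate positive. A tiny self-contained check of the formula (the coefficients are made-up demo values):

    #include <cmath>
    #include <cstdio>

    double SineRate(double x, double a, double b, double c, double d) {
      return a * std::sin(b * x + c) + d;
    }

    int main() {
      const double kPi = 3.141592653589793;
      // One full period per minute, oscillating 5000 +/- 1000 ops/s.
      const double a = 1000.0, b = 2.0 * kPi / 60.0, c = 0.0, d = 5000.0;
      for (int sec = 0; sec <= 60; sec += 15) {
        printf("t=%2ds rate=%.0f ops/s\n", sec, SineRate(sec, a, b, c, d));
      }
      return 0;  // prints 5000, 6000, 5000, 4000, 5000
    }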
@@ -5546,12 +5527,12 @@ class Benchmark { if (usecs_since_last > (FLAGS_sine_write_rate_interval_milliseconds * uint64_t{1000})) { double usecs_since_start = - static_cast(now - thread->stats.GetStart()); + static_cast(now - thread->stats.GetStart()); thread->stats.ResetSineInterval(); uint64_t write_rate = - static_cast(SineRate(usecs_since_start / 1000000.0)); + static_cast(SineRate(usecs_since_start / 1000000.0)); thread->shared->write_rate_limiter.reset( - NewGenericRateLimiter(write_rate)); + NewGenericRateLimiter(write_rate)); } } if (!s.ok()) { @@ -5647,11 +5628,13 @@ class Benchmark { continue; } } - writes_ /= static_cast(open_options_.max_bytes_for_level_multiplier); + writes_ /= + static_cast(open_options_.max_bytes_for_level_multiplier); } for (size_t i = 0; i < num_db; i++) { if (sorted_runs[i].size() < num_levels - 1) { - fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", num_levels); + fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", + num_levels); exit(1); } } @@ -5662,13 +5645,14 @@ class Benchmark { auto options = db->GetOptions(); MutableCFOptions mutable_cf_options(options); for (size_t j = 0; j < sorted_runs[i].size(); j++) { - compactionOptions.output_file_size_limit = - MaxFileSizeForLevel(mutable_cf_options, - static_cast(output_level), compaction_style); + compactionOptions.output_file_size_limit = MaxFileSizeForLevel( + mutable_cf_options, static_cast(output_level), + compaction_style); std::cout << sorted_runs[i][j].size() << std::endl; - db->CompactFiles(compactionOptions, {sorted_runs[i][j].back().name, - sorted_runs[i][j].front().name}, - static_cast(output_level - j) /*level*/); + db->CompactFiles( + compactionOptions, + {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name}, + static_cast(output_level - j) /*level*/); } } } else if (compaction_style == kCompactionStyleUniversal) { @@ -5699,11 +5683,13 @@ class Benchmark { } num_files_at_level0[i] = meta.levels[0].files.size(); } - writes_ = static_cast(writes_* static_cast(100) / (ratio + 200)); + writes_ = static_cast(writes_ * static_cast(100) / + (ratio + 200)); } for (size_t i = 0; i < num_db; i++) { if (sorted_runs[i].size() < num_levels) { - fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", num_levels); + fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n", + num_levels); exit(1); } } @@ -5714,9 +5700,9 @@ class Benchmark { auto options = db->GetOptions(); MutableCFOptions mutable_cf_options(options); for (size_t j = 0; j < sorted_runs[i].size(); j++) { - compactionOptions.output_file_size_limit = - MaxFileSizeForLevel(mutable_cf_options, - static_cast(output_level), compaction_style); + compactionOptions.output_file_size_limit = MaxFileSizeForLevel( + mutable_cf_options, static_cast(output_level), + compaction_style); db->CompactFiles( compactionOptions, {sorted_runs[i][j].back().name, sorted_runs[i][j].front().name}, @@ -5727,7 +5713,7 @@ class Benchmark { } else if (compaction_style == kCompactionStyleFIFO) { if (num_levels != 1) { return Status::InvalidArgument( - "num_levels should be 1 for FIFO compaction"); + "num_levels should be 1 for FIFO compaction"); } if (FLAGS_num_multi_db != 0) { return Status::InvalidArgument("Doesn't support multiDB"); @@ -5744,7 +5730,7 @@ class Benchmark { db->GetColumnFamilyMetaData(&meta); auto total_size = meta.levels[0].size; if (total_size >= - db->GetOptions().compaction_options_fifo.max_table_files_size) { + db->GetOptions().compaction_options_fifo.max_table_files_size) { 
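The hunk above rebuilds the shared limiter with NewGenericRateLimiter each time the sine interval rolls over, so the new target rate takes effect for subsequent batches. The calls involved are public RocksDB API; a minimal sketch of charging a write against a limiter (the 1 MB/s budget is an arbitrary example):

    #include <memory>

    #include "rocksdb/env.h"
    #include "rocksdb/rate_limiter.h"

    using ROCKSDB_NAMESPACE::Env;
    using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
    using ROCKSDB_NAMESPACE::RateLimiter;

    void ChargeBatch(RateLimiter* limiter, int64_t batch_bytes) {
      // Blocks until the token bucket can cover batch_bytes; this is how
      // the write paths above pace themselves.
      limiter->Request(batch_bytes, Env::IO_HIGH, /*stats=*/nullptr,
                       RateLimiter::OpType::kWrite);
    }

    int main() {
      std::unique_ptr<RateLimiter> limiter(
          NewGenericRateLimiter(1 << 20));  // 1 MB/s
      ChargeBatch(limiter.get(), 4096);
      return 0;
    }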
for (auto file_meta : meta.levels[0].files) { file_names.emplace_back(file_meta.name); } @@ -5755,6 +5741,8 @@ class Benchmark { // auto compactionOptions = CompactionOptions(); // db->CompactFiles(compactionOptions, file_names, 0); auto compactionOptions = CompactRangeOptions(); + compactionOptions.max_subcompactions = + static_cast(FLAGS_subcompactions); db->CompactRange(compactionOptions, nullptr, nullptr); } else { fprintf(stdout, @@ -5781,8 +5769,8 @@ class Benchmark { db->GetColumnFamilyMetaData(&meta); auto total_size = meta.levels[0].size; assert(total_size <= - db->GetOptions().compaction_options_fifo.max_table_files_size); - break; + db->GetOptions().compaction_options_fifo.max_table_files_size); + break; } // verify smallest/largest seqno and key range of each sorted run @@ -5848,7 +5836,9 @@ class Benchmark { for (size_t k = 0; k < num_db; k++) { auto db = db_list[k]; fprintf(stdout, - "---------------------- DB %" ROCKSDB_PRIszt " LSM ---------------------\n", k); + "---------------------- DB %" ROCKSDB_PRIszt + " LSM ---------------------\n", + k); db->GetColumnFamilyMetaData(&meta); for (auto& levelMeta : meta.levels) { if (levelMeta.files.empty()) { @@ -6066,7 +6056,9 @@ class Benchmark { } while (!duration.Done(100)); char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, " + snprintf(msg, sizeof(msg), + "(%" PRIu64 " of %" PRIu64 + " found, " "issued %" PRIu64 " non-exist keys)\n", found, read, nonexist); @@ -6202,8 +6194,8 @@ class Benchmark { } char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", - found, read); + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found, + read); thread->stats.AddBytes(bytes); thread->stats.AddMessage(msg); @@ -6218,7 +6210,7 @@ class Benchmark { int64_t found = 0; ReadOptions options = read_options_; std::vector keys; - std::vector > key_guards; + std::vector> key_guards; std::vector values(entries_per_batch_); PinnableSlice* pin_values = new PinnableSlice[entries_per_batch_]; std::unique_ptr pin_values_guard(pin_values); @@ -6302,8 +6294,8 @@ class Benchmark { } char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", - found, read); + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)", found, + read); thread->stats.AddBytes(bytes); thread->stats.AddMessage(msg); } @@ -6705,8 +6697,8 @@ class Benchmark { } else if (query_type == 1) { // the Put query puts++; - int64_t val_size = ParetoCdfInversion( - u, FLAGS_value_theta, FLAGS_value_k, FLAGS_value_sigma); + int64_t val_size = ParetoCdfInversion(u, FLAGS_value_theta, + FLAGS_value_k, FLAGS_value_sigma); if (val_size < 10) { val_size = 10; } else if (val_size > value_max) { @@ -6913,8 +6905,8 @@ class Benchmark { } char msg[100]; - snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", - found, read); + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found)\n", found, + read); thread->stats.AddBytes(bytes); thread->stats.AddMessage(msg); } @@ -6977,13 +6969,9 @@ class Benchmark { } } - void DeleteSeq(ThreadState* thread) { - DoDelete(thread, true); - } + void DeleteSeq(ThreadState* thread) { DoDelete(thread, true); } - void DeleteRandom(ThreadState* thread) { - DoDelete(thread, false); - } + void DeleteRandom(ThreadState* thread) { DoDelete(thread, false); } void ReadWhileWriting(ThreadState* thread) { if (thread->tid > 0) { @@ -6993,6 +6981,14 @@ class Benchmark { } } + void MultiReadWhileWriting(ThreadState* thread) { + if (thread->tid > 0) { + 
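This is one of several places where the diff threads FLAGS_subcompactions into manual compactions via CompactRangeOptions::max_subcompactions (the CompactAll and readwhilescanning-style paths below get the same treatment). A hedged usage sketch of the option (the value 4 is a placeholder):

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    using ROCKSDB_NAMESPACE::CompactRangeOptions;
    using ROCKSDB_NAMESPACE::DB;
    using ROCKSDB_NAMESPACE::Status;

    Status CompactWholeDB(DB* db) {
      CompactRangeOptions cro;
      // When > 0, overrides the DB-wide max_subcompactions for this one
      // manual compaction, letting it fan out across threads.
      cro.max_subcompactions = 4;
      return db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
    }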
MultiReadRandom(thread); + } else { + BGWriter(thread, kWrite); + } + } + void ReadWhileMerging(ThreadState* thread) { if (thread->tid > 0) { ReadRandom(thread); @@ -7089,9 +7085,9 @@ class Benchmark { thread->stats.FinishedOps(&db_, db_.db, 1, kWrite); if (FLAGS_benchmark_write_rate_limit > 0) { - write_rate_limiter->Request( - key.size() + val.size(), Env::IO_HIGH, - nullptr /* stats */, RateLimiter::OpType::kWrite); + write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH, + nullptr /* stats */, + RateLimiter::OpType::kWrite); } if (writes_per_range_tombstone_ > 0 && @@ -7215,7 +7211,6 @@ class Benchmark { return s; } - // Given a key K, this deletes (K+"0", V), (K+"1", V), (K+"2", V) // in DB atomically i.e in a single batch. Also refer GetMany. Status DeleteMany(DB* db, const WriteOptions& writeoptions, @@ -7327,7 +7322,7 @@ class Benchmark { put_weight = 100 - get_weight - delete_weight; } GenerateKeyFromInt(thread->rand.Next() % FLAGS_numdistinct, - FLAGS_numdistinct, &key); + FLAGS_numdistinct, &key); if (get_weight > 0) { // do all the gets first Status s = GetMany(db, key, &value); @@ -7365,8 +7360,8 @@ class Benchmark { } char msg[128]; snprintf(msg, sizeof(msg), - "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" \ - PRIu64 " found:%" PRIu64 ")", + "( get:%" PRIu64 " put:%" PRIu64 " del:%" PRIu64 " total:%" PRIu64 + " found:%" PRIu64 ")", gets_done, puts_done, deletes_done, readwrites_, found); thread->stats.AddMessage(msg); } @@ -7420,7 +7415,7 @@ class Benchmark { get_weight--; reads_done++; thread->stats.FinishedOps(nullptr, db, 1, kRead); - } else if (put_weight > 0) { + } else if (put_weight > 0) { // then do all the corresponding number of puts // for all the gets we have done earlier Status s; @@ -7440,8 +7435,9 @@ class Benchmark { } } char msg[100]; - snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ - " total:%" PRIu64 " found:%" PRIu64 ")", + snprintf(msg, sizeof(msg), + "( reads:%" PRIu64 " writes:%" PRIu64 " total:%" PRIu64 + " found:%" PRIu64 ")", reads_done, writes_done, readwrites_, found); thread->stats.AddMessage(msg); } @@ -7505,8 +7501,8 @@ class Benchmark { thread->stats.FinishedOps(nullptr, db, 1, kUpdate); } char msg[100]; - snprintf(msg, sizeof(msg), - "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found); + snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", + readwrites_, found); thread->stats.AddBytes(bytes); thread->stats.AddMessage(msg); } @@ -7549,7 +7545,8 @@ class Benchmark { exit(1); } - Slice value = gen.Generate(static_cast(existing_value.size())); + Slice value = + gen.Generate(static_cast(existing_value.size())); std::string new_value; if (status.ok()) { @@ -7573,8 +7570,8 @@ class Benchmark { thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; - snprintf(msg, sizeof(msg), - "( updates:%" PRIu64 " found:%" PRIu64 ")", readwrites_, found); + snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", + readwrites_, found); thread->stats.AddMessage(msg); } @@ -7622,7 +7619,7 @@ class Benchmark { Slice operand = gen.Generate(); if (value.size() > 0) { // Use a delimiter to match the semantics for StringAppendOperator - value.append(1,','); + value.append(1, ','); } value.append(operand.data(), operand.size()); @@ -7644,7 +7641,7 @@ class Benchmark { char msg[100]; snprintf(msg, sizeof(msg), "( updates:%" PRIu64 " found:%" PRIu64 ")", - readwrites_, found); + readwrites_, found); thread->stats.AddBytes(bytes); thread->stats.AddMessage(msg); } @@ -7675,12 
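MultiReadWhileWriting, added above, reuses the ReadWhileWriting shape: the extra thread created at dispatch time (the num_threads++ seen earlier) becomes the lone writer at tid 0 while every other thread issues batched reads. A generic sketch of that thread split, independent of db_bench internals:

    #include <atomic>
    #include <thread>
    #include <vector>

    std::atomic<bool> stop{false};

    void Writer() { /* keep writing until stop is set */ }
    void BatchReader() { /* issue MultiGet-style batches until stop is set */ }

    int main() {
      std::vector<std::thread> threads;
      threads.emplace_back(Writer);  // "tid 0": the single background writer
      for (int i = 0; i < 3; ++i) threads.emplace_back(BatchReader);
      stop.store(true);  // a real harness would first run for the duration
      for (auto& t : threads) t.join();
      return 0;
    }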
+7672,10 @@ class Benchmark { Slice val = gen.Generate(); if (FLAGS_num_column_families > 1) { s = db_with_cfh->db->Merge(write_options_, - db_with_cfh->GetCfh(key_rand), key, - val); + db_with_cfh->GetCfh(key_rand), key, val); } else { - s = db_with_cfh->db->Merge(write_options_, - db_with_cfh->db->DefaultColumnFamily(), key, - val); + s = db_with_cfh->db->Merge( + write_options_, db_with_cfh->db->DefaultColumnFamily(), key, val); } if (!s.ok()) { @@ -7733,8 +7728,7 @@ class Benchmark { thread->stats.FinishedOps(nullptr, db, 1, kMerge); } else { Status s = db->Get(read_options_, key, &value); - if (value.length() > max_length) - max_length = value.length(); + if (value.length() > max_length) max_length = value.length(); if (!s.ok() && !s.IsNotFound()) { fprintf(stderr, "get error: %s\n", s.ToString().c_str()); @@ -8003,9 +7997,8 @@ class Benchmark { return; } - Status s = - RandomTransactionInserter::Verify(db_.db, - static_cast(FLAGS_transaction_sets)); + Status s = RandomTransactionInserter::Verify( + db_.db, static_cast(FLAGS_transaction_sets)); if (s.ok()) { fprintf(stdout, "RandomTransactionVerify Success.\n"); @@ -8225,9 +8218,9 @@ class Benchmark { thread->stats.AddBytes(bytes); if (FLAGS_benchmark_write_rate_limit > 0) { - write_rate_limiter->Request( - key.size() + val.size(), Env::IO_HIGH, - nullptr /* stats */, RateLimiter::OpType::kWrite); + write_rate_limiter->Request(key.size() + val.size(), Env::IO_HIGH, + nullptr /* stats */, + RateLimiter::OpType::kWrite); } } } @@ -8249,15 +8242,18 @@ class Benchmark { CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + cro.max_subcompactions = static_cast(FLAGS_subcompactions); db->CompactRange(cro, nullptr, nullptr); } void CompactAll() { + CompactRangeOptions cro; + cro.max_subcompactions = static_cast(FLAGS_subcompactions); if (db_.db != nullptr) { - db_.db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + db_.db->CompactRange(cro, nullptr, nullptr); } for (const auto& db_with_cfh : multi_dbs_) { - db_with_cfh.db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + db_with_cfh.db->CompactRange(cro, nullptr, nullptr); } } @@ -8666,7 +8662,7 @@ int db_bench_tool(int argc, char** argv) { } FLAGS_compression_type_e = - StringToCompressionType(FLAGS_compression_type.c_str()); + StringToCompressionType(FLAGS_compression_type.c_str()); FLAGS_wal_compression_e = StringToCompressionType(FLAGS_wal_compression.c_str()); @@ -8677,7 +8673,7 @@ int db_bench_tool(int argc, char** argv) { #ifndef ROCKSDB_LITE // Stacked BlobDB FLAGS_blob_db_compression_type_e = - StringToCompressionType(FLAGS_blob_db_compression_type.c_str()); + StringToCompressionType(FLAGS_blob_db_compression_type.c_str()); int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty(); if (env_opts > 1) { @@ -8746,7 +8742,7 @@ int db_bench_tool(int argc, char** argv) { } FLAGS_value_size_distribution_type_e = - StringToDistributionType(FLAGS_value_size_distribution_type.c_str()); + StringToDistributionType(FLAGS_value_size_distribution_type.c_str()); // Note options sanitization may increase thread pool sizes according to // max_background_flushes/max_background_compactions/max_background_jobs diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index e52afbfd524..83678289463 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -122,7 +122,6 @@ "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1), "mock_direct_io": False, "cache_type": lambda: random.choice(["lru_cache", 
"hyper_clock_cache"]), - # fast_lru_cache is incompatible with stress tests, because it doesn't support strict_capacity_limit == false. "use_full_merge_v1": lambda: random.randint(0, 1), "use_merge": lambda: random.randint(0, 1), # use_put_entity_one_in has to be the same across invocations for verification to work, hence no lambda @@ -140,6 +139,7 @@ # 0 = never (used by some), 10 = often (for threading bugs), 600 = default "stats_dump_period_sec": lambda: random.choice([0, 10, 600]), "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]), + "fifo_allow_compaction": lambda: random.randint(0, 1), # Test small max_manifest_file_size in a smaller chance, as most of the # time we wnat manifest history to be preserved to help debug "max_manifest_file_size": lambda: random.choice( @@ -209,6 +209,7 @@ _DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL" stress_cmd = "./db_stress" +cleanup_cmd = None def is_release_mode(): @@ -223,6 +224,10 @@ def get_dbname(test_name): else: dbname = test_tmpdir + "/" + test_dir_name shutil.rmtree(dbname, True) + if cleanup_cmd is not None: + print("Running DB cleanup command - %s\n" % cleanup_cmd) + # Ignore failure + os.system(cleanup_cmd) os.mkdir(dbname) return dbname @@ -690,6 +695,7 @@ def gen_cmd(params, unknown_params): "write_policy", "stress_cmd", "test_tiered_storage", + "cleanup_cmd", } and v is not None ] @@ -925,6 +931,12 @@ def whitebox_crash_main(args, unknown_args): # we need to clean up after ourselves -- only do this on test # success shutil.rmtree(dbname, True) + if cleanup_cmd is not None: + print("Running DB cleanup command - %s\n" % cleanup_cmd) + ret = os.system(cleanup_cmd) + if ret != 0: + print("TEST FAILED. DB cleanup returned error %d\n" % ret) + sys.exit(1) os.mkdir(dbname) if (expected_values_dir is not None): shutil.rmtree(expected_values_dir, True) @@ -937,6 +949,7 @@ def whitebox_crash_main(args, unknown_args): def main(): global stress_cmd + global cleanup_cmd parser = argparse.ArgumentParser( description="This script runs and kills \ @@ -952,6 +965,7 @@ def main(): parser.add_argument("--write_policy", choices=["write_committed", "write_prepared"]) parser.add_argument("--stress_cmd") parser.add_argument("--test_tiered_storage", action="store_true") + parser.add_argument("--cleanup_cmd") all_params = dict( list(default_params.items()) @@ -986,6 +1000,8 @@ def main(): if args.stress_cmd: stress_cmd = args.stress_cmd + if args.cleanup_cmd: + cleanup_cmd = args.cleanup_cmd if args.test_type == "blackbox": blackbox_crash_main(args, unknown_args) if args.test_type == "whitebox": diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 1c16bf392db..8cc67f5d5a6 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -5,19 +5,19 @@ #include #include -#include #include +#include +#include "port/port.h" +#include "rocksdb/comparator.h" #include "rocksdb/db.h" -#include "rocksdb/options.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/status.h" -#include "rocksdb/comparator.h" #include "rocksdb/table.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/filter_policy.h" -#include "port/port.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc index be3ff796282..427a54d99eb 100644 --- a/tools/dump/db_dump_tool.cc +++ b/tools/dump/db_dump_tool.cc @@ -5,11 +5,12 @@ #ifndef ROCKSDB_LITE +#include 
"rocksdb/db_dump_tool.h" + #include #include #include "rocksdb/db.h" -#include "rocksdb/db_dump_tool.h" #include "rocksdb/env.h" #include "util/coding.h" diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index c4f311c7913..323d5f8b40a 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -123,7 +123,7 @@ void DumpSstFile(Options options, std::string filename, bool output_hex, void DumpBlobFile(const std::string& filename, bool is_key_hex, bool is_value_hex, bool dump_uncompressed_blobs); -}; +}; // namespace LDBCommand* LDBCommand::InitFromCmdLineArgs( int argc, char const* const* argv, const Options& options, @@ -166,7 +166,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( const std::string OPTION_PREFIX = "--"; for (const auto& arg : args) { - if (arg[0] == '-' && arg[1] == '-'){ + if (arg[0] == '-' && arg[1] == '-') { std::vector splits = StringSplit(arg, '='); // --option_name=option_value if (splits.size() == 2) { @@ -296,8 +296,7 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { parsed_params.flags); } else if (parsed_params.cmd == CheckPointCommand::Name()) { return new CheckPointCommand(parsed_params.cmd_params, - parsed_params.option_map, - parsed_params.flags); + parsed_params.option_map, parsed_params.flags); } else if (parsed_params.cmd == RepairCommand::Name()) { return new RepairCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); @@ -889,7 +888,7 @@ void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) { int write_buffer_size; if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, - exec_state_)) { + exec_state_)) { if (write_buffer_size > 0) { cf_opts->write_buffer_size = write_buffer_size; } else { @@ -1289,7 +1288,7 @@ void DBLoaderCommand::DoCommand() { } else if (0 == line.find("Created bg thread 0x")) { // ignore this line } else { - bad_lines ++; + bad_lines++; } } @@ -1378,7 +1377,6 @@ ManifestDumpCommand::ManifestDumpCommand( } void ManifestDumpCommand::DoCommand() { - std::string manifestfile; if (!path_.empty()) { @@ -1735,7 +1733,7 @@ void IncBucketCounts(std::vector& bucket_counts, int ttl_start, (void)num_buckets; #endif assert(time_range > 0 && timekv >= ttl_start && bucket_size > 0 && - timekv < (ttl_start + time_range) && num_buckets > 1); + timekv < (ttl_start + time_range) && num_buckets > 1); int bucket = (timekv - ttl_start) / bucket_size; bucket_counts[bucket]++; } @@ -1744,7 +1742,7 @@ void PrintBucketCounts(const std::vector& bucket_counts, int ttl_start, int ttl_end, int bucket_size, int num_buckets) { int time_point = ttl_start; - for(int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { + for (int i = 0; i < num_buckets - 1; i++, time_point += bucket_size) { fprintf(stdout, "Keys in range %s to %s : %lu\n", TimeToHumanString(time_point).c_str(), TimeToHumanString(time_point + bucket_size).c_str(), @@ -1789,10 +1787,10 @@ InternalDumpCommand::InternalDumpCommand( if (itr != options.end()) { delim_ = itr->second; count_delim_ = true; - // fprintf(stdout,"delim = %c\n",delim_[0]); + // fprintf(stdout,"delim = %c\n",delim_[0]); } else { count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); - delim_="."; + delim_ = "."; } print_stats_ = IsFlagPresent(flags, ARG_STATS); @@ -1846,8 +1844,8 @@ void InternalDumpCommand::DoCommand() { } std::string rtype1, rtype2, row, val; rtype2 = ""; - uint64_t c=0; - uint64_t s1=0,s2=0; + uint64_t c = 0; + uint64_t s1 = 0, s2 = 0; long long count = 0; for (auto& key_version : key_versions) { @@ -1862,25 +1860,24 @@ 
void InternalDumpCommand::DoCommand() { int k; if (count_delim_) { rtype1 = ""; - s1=0; + s1 = 0; row = ikey.Encode().ToString(); val = key_version.value; - for(k=0;row[k]!='\x01' && row[k]!='\0';k++) - s1++; - for(k=0;val[k]!='\x01' && val[k]!='\0';k++) - s1++; - for(int j=0;row[j]!=delim_[0] && row[j]!='\0' && row[j]!='\x01';j++) - rtype1+=row[j]; - if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + for (k = 0; row[k] != '\x01' && row[k] != '\0'; k++) s1++; + for (k = 0; val[k] != '\x01' && val[k] != '\0'; k++) s1++; + for (int j = 0; row[j] != delim_[0] && row[j] != '\0' && row[j] != '\x01'; + j++) + rtype1 += row[j]; + if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", rtype2.c_str(), c, s2); - c=1; - s2=s1; + c = 1; + s2 = s1; rtype2 = rtype1; } else { c++; - s2+=s1; - rtype2=rtype1; + s2 += s1; + rtype2 = rtype1; } } @@ -1906,7 +1903,7 @@ void InternalDumpCommand::DoCommand() { // Terminate if maximum number of keys have been dumped if (max_keys_ > 0 && count >= max_keys_) break; } - if(count_delim_) { + if (count_delim_) { fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", rtype2.c_str(), c, s2); } else { @@ -1971,7 +1968,7 @@ DBDumperCommand::DBDumperCommand( count_delim_ = true; } else { count_delim_ = IsFlagPresent(flags, ARG_COUNT_DELIM); - delim_="."; + delim_ = "."; } print_stats_ = IsFlagPresent(flags, ARG_STATS); @@ -2119,13 +2116,13 @@ void DBDumperCommand::DoDumpCommand() { int bucket_size; if (!ParseIntOption(option_map_, ARG_TTL_BUCKET, bucket_size, exec_state_) || bucket_size <= 0) { - bucket_size = time_range; // Will have just 1 bucket by default + bucket_size = time_range; // Will have just 1 bucket by default } - //cretaing variables for row count of each type + // cretaing variables for row count of each type std::string rtype1, rtype2, row, val; rtype2 = ""; - uint64_t c=0; - uint64_t s1=0,s2=0; + uint64_t c = 0; + uint64_t s1 = 0, s2 = 0; // At this point, bucket_size=0 => time_range=0 int num_buckets = (bucket_size >= time_range) @@ -2143,11 +2140,9 @@ void DBDumperCommand::DoDumpCommand() { for (; iter->Valid(); iter->Next()) { int rawtime = 0; // If end marker was specified, we stop before it - if (!null_to_ && (iter->key().ToString() >= to_)) - break; + if (!null_to_ && (iter->key().ToString() >= to_)) break; // Terminate if maximum number of keys have been dumped - if (max_keys == 0) - break; + if (max_keys == 0) break; if (is_db_ttl_) { TtlIterator* it_ttl = static_cast_with_check(iter); rawtime = it_ttl->ttl_timestamp(); @@ -2167,21 +2162,20 @@ void DBDumperCommand::DoDumpCommand() { rtype1 = ""; row = iter->key().ToString(); val = iter->value().ToString(); - s1 = row.size()+val.size(); - for(int j=0;row[j]!=delim_[0] && row[j]!='\0';j++) - rtype1+=row[j]; - if(rtype2.compare("") && rtype2.compare(rtype1)!=0) { + s1 = row.size() + val.size(); + for (int j = 0; row[j] != delim_[0] && row[j] != '\0'; j++) + rtype1 += row[j]; + if (rtype2.compare("") && rtype2.compare(rtype1) != 0) { fprintf(stdout, "%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", rtype2.c_str(), c, s2); - c=1; - s2=s1; + c = 1; + s2 = s1; rtype2 = rtype1; } else { - c++; - s2+=s1; - rtype2=rtype1; + c++; + s2 += s1; + rtype2 = rtype1; } - } if (count_only_) { @@ -2202,7 +2196,7 @@ void DBDumperCommand::DoDumpCommand() { if (num_buckets > 1 && is_db_ttl_) { PrintBucketCounts(bucket_counts, ttl_start, ttl_end, bucket_size, num_buckets); - } else if(count_delim_) { + } else if (count_delim_) { fprintf(stdout, 
"%s => count:%" PRIu64 "\tsize:%" PRIu64 "\n", rtype2.c_str(), c, s2); } else { @@ -2233,7 +2227,7 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand( ParseIntOption(option_map_, ARG_NEW_LEVELS, new_levels_, exec_state_); print_old_levels_ = IsFlagPresent(flags, ARG_PRINT_OLD_LEVELS); - if(new_levels_ <= 0) { + if (new_levels_ <= 0) { exec_state_ = LDBCommandExecuteResult::Failed( " Use --" + ARG_NEW_LEVELS + " to specify a new level number\n"); } @@ -2245,7 +2239,7 @@ std::vector ReduceDBLevelsCommand::PrepareArgs( ret.push_back("reduce_levels"); ret.push_back("--" + ARG_DB + "=" + db_path); ret.push_back("--" + ARG_NEW_LEVELS + "=" + std::to_string(new_levels)); - if(print_old_level) { + if (print_old_level) { ret.push_back("--" + ARG_PRINT_OLD_LEVELS); } return ret; @@ -2270,8 +2264,7 @@ void ReduceDBLevelsCommand::OverrideBaseCFOptions( cf_opts->max_bytes_for_level_multiplier = 1; } -Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, - int* levels) { +Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) { ImmutableDBOptions db_options(opt); EnvOptions soptions; std::shared_ptr tc( @@ -2369,9 +2362,9 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( old_compaction_style_(-1), new_compaction_style_(-1) { ParseIntOption(option_map_, ARG_OLD_COMPACTION_STYLE, old_compaction_style_, - exec_state_); + exec_state_); if (old_compaction_style_ != kCompactionStyleLevel && - old_compaction_style_ != kCompactionStyleUniversal) { + old_compaction_style_ != kCompactionStyleUniversal) { exec_state_ = LDBCommandExecuteResult::Failed( "Use --" + ARG_OLD_COMPACTION_STYLE + " to specify old compaction " + "style. Check ldb help for proper compaction style value.\n"); @@ -2379,9 +2372,9 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand( } ParseIntOption(option_map_, ARG_NEW_COMPACTION_STYLE, new_compaction_style_, - exec_state_); + exec_state_); if (new_compaction_style_ != kCompactionStyleLevel && - new_compaction_style_ != kCompactionStyleUniversal) { + new_compaction_style_ != kCompactionStyleUniversal) { exec_state_ = LDBCommandExecuteResult::Failed( "Use --" + ARG_NEW_COMPACTION_STYLE + " to specify new compaction " + "style. Check ldb help for proper compaction style value.\n"); @@ -2721,7 +2714,6 @@ WALDumperCommand::WALDumperCommand( wal_file_ = itr->second; } - print_header_ = IsFlagPresent(flags, ARG_PRINT_HEADER); print_values_ = IsFlagPresent(flags, ARG_PRINT_VALUE); is_write_committed_ = ParseBooleanOption(options, ARG_WRITE_COMMITTED, true); @@ -2784,7 +2776,7 @@ void GetCommand::DoCommand() { Status st = db_->Get(ReadOptions(), GetCfHandle(), key_, &value); if (st.ok()) { fprintf(stdout, "%s\n", - (is_value_hex_ ? StringToHex(value) : value).c_str()); + (is_value_hex_ ? StringToHex(value) : value).c_str()); } else { std::stringstream oss; oss << "Get failed: " << st.ToString(); @@ -3022,9 +3014,9 @@ void ScanCommand::DoCommand() { TimeToHumanString(ttl_start).c_str(), TimeToHumanString(ttl_end).c_str()); } - for ( ; - it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); - it->Next()) { + for (; + it->Valid() && (!end_key_specified_ || it->key().ToString() < end_key_); + it->Next()) { if (is_db_ttl_) { TtlIterator* it_ttl = static_cast_with_check(it); int rawtime = it_ttl->ttl_timestamp(); @@ -3258,8 +3250,9 @@ void DBQuerierCommand::Help(std::string& ret) { ret.append(DBQuerierCommand::Name()); ret.append(" [--" + ARG_TTL + "]"); ret.append("\n"); - ret.append(" Starts a REPL shell. 
Type help for list of available " - "commands."); + ret.append( + " Starts a REPL shell. Type help for list of available " + "commands."); ret.append("\n"); } @@ -3286,7 +3279,7 @@ void DBQuerierCommand::DoCommand() { if (pos2 == std::string::npos) { break; } - tokens.push_back(line.substr(pos, pos2-pos)); + tokens.push_back(line.substr(pos, pos2 - pos)); pos = pos2 + 1; } tokens.push_back(line.substr(pos)); @@ -3320,8 +3313,8 @@ void DBQuerierCommand::DoCommand() { key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); s = db_->Get(read_options, GetCfHandle(), Slice(key), &value); if (s.ok()) { - fprintf(stdout, "%s\n", PrintKeyValue(key, value, - is_key_hex_, is_value_hex_).c_str()); + fprintf(stdout, "%s\n", + PrintKeyValue(key, value, is_key_hex_, is_value_hex_).c_str()); } else { if (s.IsNotFound()) { fprintf(stdout, "Not found %s\n", tokens[1].c_str()); diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index dfe9cdb36a0..ce8d07f2801 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -5,13 +5,13 @@ #pragma once -#include "rocksdb/utilities/ldb_cmd.h" - #include #include #include #include +#include "rocksdb/utilities/ldb_cmd.h" + namespace ROCKSDB_NAMESPACE { class CompactorCommand : public LDBCommand { @@ -85,7 +85,7 @@ class DBDumperCommand : public LDBCommand { /** * Extract file name from the full path. We handle both the forward slash (/) * and backslash (\) to make sure that different OS-s are supported. - */ + */ static std::string GetFileNameFromPath(const std::string& s) { std::size_t n = s.find_last_of("/\\"); @@ -573,14 +573,15 @@ class CheckPointCommand : public LDBCommand { static std::string Name() { return "checkpoint"; } CheckPointCommand(const std::vector& params, - const std::map& options, - const std::vector& flags); + const std::map& options, + const std::vector& flags); void DoCommand() override; static void Help(std::string& ret); std::string checkpoint_dir_; + private: static const std::string ARG_CHECKPOINT_DIR; }; diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index 5f9e05bb6dc..5d83a6cd97c 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -26,9 +26,9 @@ #include "util/file_checksum_helper.h" #include "util/random.h" +using std::map; using std::string; using std::vector; -using std::map; namespace ROCKSDB_NAMESPACE { @@ -70,7 +70,7 @@ TEST_F(LdbCmdTest, HexToString) { auto actual = ROCKSDB_NAMESPACE::LDBCommand::HexToString(inPair.first); auto expected = inPair.second; for (unsigned int i = 0; i < actual.length(); i++) { - EXPECT_EQ(expected[i], static_cast((signed char) actual[i])); + EXPECT_EQ(expected[i], static_cast((signed char)actual[i])); } auto reverse = ROCKSDB_NAMESPACE::LDBCommand::StringToHex(actual); EXPECT_STRCASEEQ(inPair.first.c_str(), reverse.c_str()); diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index caa4f5ce80f..ab517383883 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -5,6 +5,7 @@ // #ifndef ROCKSDB_LITE #include "rocksdb/ldb_tool.h" + #include "rocksdb/utilities/ldb_cmd.h" #include "tools/ldb_cmd_impl.h" diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index c538554a0a2..c8604bf439b 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -19,7 +19,7 @@ namespace ROCKSDB_NAMESPACE { class ReduceLevelTest : public testing::Test { -public: + public: ReduceLevelTest() { dbname_ = test::PerThreadDBPath("db_reduce_levels_test"); EXPECT_OK(DestroyDB(dbname_, Options())); @@ -75,7 +75,7 @@ class ReduceLevelTest : public testing::Test { 
return atoi(property.c_str()); } -private: + private: std::string dbname_; DB* db_; }; diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc index 675d2593fb3..a474417c78a 100644 --- a/tools/simulated_hybrid_file_system.cc +++ b/tools/simulated_hybrid_file_system.cc @@ -6,13 +6,12 @@ #include "util/stop_watch.h" #ifndef ROCKSDB_LITE -#include "tools/simulated_hybrid_file_system.h" - #include #include #include #include "rocksdb/rate_limiter.h" +#include "tools/simulated_hybrid_file_system.h" namespace ROCKSDB_NAMESPACE { diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 7053366e7e6..0a2c282808a 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -259,9 +259,9 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { try { in_key = ROCKSDB_NAMESPACE::LDBCommand::HexToString(in_key); } catch (...) { - std::cerr << "ERROR: Invalid key input '" - << in_key - << "' Use 0x{hex representation of internal rocksdb key}" << std::endl; + std::cerr << "ERROR: Invalid key input '" << in_key + << "' Use 0x{hex representation of internal rocksdb key}" + << std::endl; return -1; } Slice sl_key = ROCKSDB_NAMESPACE::Slice(in_key); @@ -331,14 +331,15 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { } } - if(has_compression_level_from && has_compression_level_to) { - if(!has_specified_compression_types || compression_types.size() != 1) { + if (has_compression_level_from && has_compression_level_to) { + if (!has_specified_compression_types || compression_types.size() != 1) { fprintf(stderr, "Specify one compression type.\n\n"); exit(1); } - } else if(has_compression_level_from || has_compression_level_to) { - fprintf(stderr, "Specify both --compression_level_from and " - "--compression_level_to.\n\n"); + } else if (has_compression_level_from || has_compression_level_to) { + fprintf(stderr, + "Specify both --compression_level_from and " + "--compression_level_to.\n\n"); exit(1); } @@ -476,8 +477,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { has_from || use_from_as_prefix, from_key, has_to, to_key, use_from_as_prefix); if (!st.ok()) { - fprintf(stderr, "%s: %s\n", filename.c_str(), - st.ToString().c_str()); + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); } total_read += dumper.GetReadNumber(); if (read_num > 0 && total_read > read_num) { diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index 146c0c6f49d..d7f9e4da81f 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -111,7 +111,7 @@ class TraceAnalyzerTest : public testing::Test { single_iter->SeekForPrev("b"); ASSERT_OK(single_iter->status()); delete single_iter; - std::this_thread::sleep_for (std::chrono::seconds(1)); + std::this_thread::sleep_for(std::chrono::seconds(1)); db_->Get(ro, "g", &value).PermitUncheckedError(); diff --git a/tools/write_stress.cc b/tools/write_stress.cc index 31161ce1c23..ba5bd3f4f00 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -208,13 +208,16 @@ class WriteStress { SystemClock::Default()->SleepForMicroseconds( static_cast(FLAGS_prefix_mutate_period_sec * 1000 * 1000LL)); if (dist(rng) < FLAGS_first_char_mutate_probability) { - key_prefix_[0].store(static_cast(char_dist(rng)), std::memory_order_relaxed); + key_prefix_[0].store(static_cast(char_dist(rng)), + std::memory_order_relaxed); } if (dist(rng) < FLAGS_second_char_mutate_probability) { - 
key_prefix_[1].store(static_cast(char_dist(rng)), std::memory_order_relaxed); + key_prefix_[1].store(static_cast(char_dist(rng)), + std::memory_order_relaxed); } if (dist(rng) < FLAGS_third_char_mutate_probability) { - key_prefix_[2].store(static_cast(char_dist(rng)), std::memory_order_relaxed); + key_prefix_[2].store(static_cast(char_dist(rng)), + std::memory_order_relaxed); } } } diff --git a/trace_replay/block_cache_tracer.cc b/trace_replay/block_cache_tracer.cc index b6a4fbceec4..61be2e93a3c 100644 --- a/trace_replay/block_cache_tracer.cc +++ b/trace_replay/block_cache_tracer.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -81,6 +82,10 @@ uint64_t BlockCacheTraceHelper::GetSequenceNumber( if (!IsGetOrMultiGet(access.caller)) { return 0; } + if (access.caller == TableReaderCaller::kUserMultiGet && + access.referenced_key.size() < 4) { + return 0; + } return access.get_from_user_specified_snapshot ? 1 + GetInternalKeySeqno(access.referenced_key) : 0; diff --git a/trace_replay/block_cache_tracer.h b/trace_replay/block_cache_tracer.h index 4a749608f4a..301c7d95ee9 100644 --- a/trace_replay/block_cache_tracer.h +++ b/trace_replay/block_cache_tracer.h @@ -129,11 +129,11 @@ class BlockCacheTraceWriterImpl : public BlockCacheTraceWriter { // Pass Slice references to avoid copy. Status WriteBlockAccess(const BlockCacheTraceRecord& record, const Slice& block_key, const Slice& cf_name, - const Slice& referenced_key); + const Slice& referenced_key) override; // Write a trace header at the beginning, typically on initiating a trace, // with some metadata like a magic number and RocksDB version. - Status WriteHeader(); + Status WriteHeader() override; private: SystemClock* clock_; diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc index 37b95852b74..c681e374c43 100644 --- a/trace_replay/trace_replay.cc +++ b/trace_replay/trace_replay.cc @@ -317,7 +317,7 @@ Status TracerHelper::DecodeTraceRecord(Trace* trace, int trace_file_version, cf_ids.reserve(multiget_size); multiget_keys.reserve(multiget_size); for (uint32_t i = 0; i < multiget_size; i++) { - uint32_t tmp_cfid; + uint32_t tmp_cfid = 0; Slice tmp_key; GetFixed32(&cfids_payload, &tmp_cfid); GetLengthPrefixedSlice(&keys_payload, &tmp_key); diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h index eb32c21b4ea..95ee5dfe82e 100644 --- a/util/aligned_buffer.h +++ b/util/aligned_buffer.h @@ -9,6 +9,7 @@ #pragma once #include + #include "port/port.h" namespace ROCKSDB_NAMESPACE { @@ -30,9 +31,7 @@ inline size_t TruncateToPageBoundary(size_t page_size, size_t s) { // Example: // Roundup(13, 5) => 15 // Roundup(201, 16) => 208 -inline size_t Roundup(size_t x, size_t y) { - return ((x + y - 1) / y) * y; -} +inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } // Round down x to a multiple of y. 
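Roundup above (and its round-down sibling) carry all of AlignedBuffer's alignment arithmetic. A quick standalone check matching the examples in the header comment:

    #include <cassert>
    #include <cstddef>

    inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
    inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; }

    int main() {
      assert(Roundup(13, 5) == 15);       // the documented examples
      assert(Roundup(201, 16) == 208);
      assert(Rounddown(205, 16) == 192);  // TruncateToPageBoundary flavor
      return 0;
    }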
// Example: @@ -61,13 +60,9 @@ class AlignedBuffer { size_t cursize_; char* bufstart_; -public: + public: AlignedBuffer() - : alignment_(), - capacity_(0), - cursize_(0), - bufstart_(nullptr) { - } + : alignment_(), capacity_(0), cursize_(0), bufstart_(nullptr) {} AlignedBuffer(AlignedBuffer&& o) noexcept { *this = std::move(o); } @@ -92,27 +87,17 @@ class AlignedBuffer { return n % alignment == 0; } - size_t Alignment() const { - return alignment_; - } + size_t Alignment() const { return alignment_; } - size_t Capacity() const { - return capacity_; - } + size_t Capacity() const { return capacity_; } - size_t CurrentSize() const { - return cursize_; - } + size_t CurrentSize() const { return cursize_; } - const char* BufferStart() const { - return bufstart_; - } + const char* BufferStart() const { return bufstart_; } char* BufferStart() { return bufstart_; } - void Clear() { - cursize_ = 0; - } + void Clear() { cursize_ = 0; } char* Release() { cursize_ = 0; @@ -202,7 +187,7 @@ class AlignedBuffer { assert(offset < cursize_); size_t to_read = 0; - if(offset < cursize_) { + if (offset < cursize_) { to_read = std::min(cursize_ - offset, read_size); } if (to_read > 0) { @@ -242,12 +227,8 @@ class AlignedBuffer { // the buffer is modified without using the write APIs or encapsulation // offered by AlignedBuffer. It is up to the user to guard against such // errors. - char* Destination() { - return bufstart_ + cursize_; - } + char* Destination() { return bufstart_ + cursize_; } - void Size(size_t cursize) { - cursize_ = cursize; - } + void Size(size_t cursize) { cursize_ = cursize; } }; } // namespace ROCKSDB_NAMESPACE diff --git a/util/async_file_reader.cc b/util/async_file_reader.cc index 8401a6b44ce..080c1ae9668 100644 --- a/util/async_file_reader.cc +++ b/util/async_file_reader.cc @@ -20,17 +20,20 @@ bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) { awaiter->io_handle_.resize(awaiter->num_reqs_); awaiter->del_fn_.resize(awaiter->num_reqs_); for (size_t i = 0; i < awaiter->num_reqs_; ++i) { - awaiter->file_ - ->ReadAsync( - awaiter->read_reqs_[i], awaiter->opts_, - [](const FSReadRequest& req, void* cb_arg) { - FSReadRequest* read_req = static_cast(cb_arg); - read_req->status = req.status; - read_req->result = req.result; - }, - &awaiter->read_reqs_[i], &awaiter->io_handle_[i], - &awaiter->del_fn_[i], /*aligned_buf=*/nullptr) - .PermitUncheckedError(); + IOStatus s = awaiter->file_->ReadAsync( + awaiter->read_reqs_[i], awaiter->opts_, + [](const FSReadRequest& req, void* cb_arg) { + FSReadRequest* read_req = static_cast(cb_arg); + read_req->status = req.status; + read_req->result = req.result; + }, + &awaiter->read_reqs_[i], &awaiter->io_handle_[i], &awaiter->del_fn_[i], + /*aligned_buf=*/nullptr); + if (!s.ok()) { + // For any non-ok status, the FileSystem will not call the callback + // So let's update the status ourselves + awaiter->read_reqs_[i].status = s; + } } return true; } @@ -41,6 +44,7 @@ void AsyncFileReader::Wait() { } ReadAwaiter* waiter; std::vector io_handles; + IOStatus s; io_handles.reserve(num_reqs_); waiter = head_; do { @@ -52,7 +56,7 @@ void AsyncFileReader::Wait() { } while (waiter != tail_ && (waiter = waiter->next_)); if (io_handles.size() > 0) { StopWatch sw(SystemClock::Default().get(), stats_, POLL_WAIT_MICROS); - fs_->Poll(io_handles, io_handles.size()).PermitUncheckedError(); + s = fs_->Poll(io_handles, io_handles.size()); } do { waiter = head_; @@ -62,6 +66,10 @@ void AsyncFileReader::Wait() { if (waiter->io_handle_[i] && waiter->del_fn_[i]) { 
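The async_file_reader change in this hunk makes submit-time failures visible: when ReadAsync returns non-OK, the FileSystem will never invoke the completion callback, so the request's status must be stamped at the submit site (and, just below, a Poll error overrides requests that still look OK). A generic sketch of that convention, using toy types rather than the FileSystem API:

    #include <functional>
    #include <string>
    #include <vector>

    struct Req {
      bool ok = true;
      std::string err;
    };

    // Toy async submit; returns false on immediate failure, in which case
    // the completion callback will never run.
    bool SubmitAsync(Req& req, const std::function<void(Req&)>& on_done) {
      on_done(req);  // pretend it completed synchronously and successfully
      return true;
    }

    void SubmitAll(std::vector<Req>& reqs) {
      for (auto& req : reqs) {
        if (!SubmitAsync(req, [](Req& r) { r.ok = true; })) {
          // No callback is coming for this request: record the failure here,
          // the same pattern the diff adds around ReadAsync.
          req.ok = false;
          req.err = "submit failed";
        }
      }
    }

    int main() {
      std::vector<Req> reqs(2);
      SubmitAll(reqs);
      return 0;
    }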
waiter->del_fn_[i](waiter->io_handle_[i]); } + if (waiter->read_reqs_[i].status.ok() && !s.ok()) { + // Override the request status with the Poll error + waiter->read_reqs_[i].status = s; + } } waiter->awaiting_coro_.resume(); } while (waiter != tail_); diff --git a/util/autovector.h b/util/autovector.h index 3242cb4bddc..f758473b79b 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -73,7 +73,7 @@ class autovector { using iterator_category = std::random_access_iterator_tag; iterator_impl(TAutoVector* vect, size_t index) - : vect_(vect), index_(index) {}; + : vect_(vect), index_(index){}; iterator_impl(const iterator_impl&) = default; ~iterator_impl() {} iterator_impl& operator=(const iterator_impl&) = default; @@ -139,9 +139,7 @@ class autovector { return &(*vect_)[index_]; } - reference operator[](difference_type len) const { - return *(*this + len); - } + reference operator[](difference_type len) const { return *(*this + len); } // -- Logical Operators bool operator==(const self_type& other) const { @@ -303,7 +301,7 @@ class autovector { reference emplace_back(Args&&... args) { if (num_stack_items_ < kSize) { return *(new ((void*)(&values_[num_stack_items_++])) - value_type(std::forward(args)...)); + value_type(std::forward(args)...)); } else { return vect_.emplace_back(std::forward(args)...); } @@ -319,7 +317,6 @@ class autovector { } #endif - void pop_back() { assert(!empty()); if (!vect_.empty()) { diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 9dad69e735f..8c7c39ce644 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "util/autovector.h" + #include #include #include @@ -11,7 +13,6 @@ #include "rocksdb/env.h" #include "test_util/testharness.h" #include "test_util/testutil.h" -#include "util/autovector.h" #include "util/string_util.h" using std::cout; @@ -28,8 +29,8 @@ void AssertAutoVectorOnlyInStack(autovector* vec, bool result) { #ifndef ROCKSDB_LITE ASSERT_EQ(vec->only_in_stack(), result); #else - (void) vec; - (void) result; + (void)vec; + (void)result; #endif // !ROCKSDB_LITE } } // namespace @@ -109,8 +110,8 @@ TEST_F(AutoVectorTest, Resize) { } namespace { -void AssertEqual( - const autovector& a, const autovector& b) { +void AssertEqual(const autovector& a, + const autovector& b) { ASSERT_EQ(a.size(), b.size()); ASSERT_EQ(a.empty(), b.empty()); #ifndef ROCKSDB_LITE @@ -124,7 +125,7 @@ void AssertEqual( TEST_F(AutoVectorTest, CopyAndAssignment) { // Test both heap-allocated and stack-allocated cases. 
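autovector, reformatted above, is a small-buffer vector: the first kSize elements are constructed by placement new inside an in-object array (that is what the emplace_back hunk shows), and anything beyond spills into a heap std::vector. A stripped-down sketch of that layout (illustrative only; the real class also provides iterators, copy/move, and more):

    #include <cstddef>
    #include <new>
    #include <utility>
    #include <vector>

    template <typename T, size_t kSize = 8>
    class SmallVec {
     public:
      ~SmallVec() {
        for (size_t i = 0; i < num_stack_; ++i) StackAt(i)->~T();
      }

      template <typename... Args>
      T& emplace_back(Args&&... args) {
        if (num_stack_ < kSize) {
          // Construct in place inside the in-object buffer.
          return *(new (StackAt(num_stack_++)) T(std::forward<Args>(args)...));
        }
        return heap_.emplace_back(std::forward<Args>(args)...);  // spill
      }

      size_t size() const { return num_stack_ + heap_.size(); }
      bool only_in_stack() const { return heap_.empty(); }

     private:
      T* StackAt(size_t i) { return reinterpret_cast<T*>(&storage_[i]); }

      alignas(T) unsigned char storage_[kSize][sizeof(T)];
      size_t num_stack_ = 0;
      std::vector<T> heap_;
    };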
- for (auto size : { kSize / 2, kSize * 1000 }) { + for (auto size : {kSize / 2, kSize * 1000}) { autovector vec; for (size_t i = 0; i < size; ++i) { vec.push_back(i); @@ -223,7 +224,7 @@ void BenchmarkVectorCreationAndInsertion( int index = 0; auto start_time = env->NowNanos(); auto ops_remaining = ops; - while(ops_remaining--) { + while (ops_remaining--) { TVector v; for (size_t i = 0; i < item_size; ++i) { v.push_back(items[index++]); @@ -283,7 +284,7 @@ TEST_F(AutoVectorTest, PerfBench) { // pre-generated unique keys auto string_keys = GetTestKeys(kOps * 2 * kSize); - for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + for (auto insertions : {0ul, 1ul, kSize / 2, kSize, 2 * kSize}) { BenchmarkVectorCreationAndInsertion>( "std::vector", kOps, insertions, string_keys); BenchmarkVectorCreationAndInsertion>( @@ -300,12 +301,11 @@ TEST_F(AutoVectorTest, PerfBench) { for (size_t i = 0; i < kOps * 2 * kSize; ++i) { int_keys[i] = i; } - for (auto insertions : { 0ul, 1ul, kSize / 2, kSize, 2 * kSize }) { + for (auto insertions : {0ul, 1ul, kSize / 2, kSize, 2 * kSize}) { BenchmarkVectorCreationAndInsertion>( "std::vector", kOps, insertions, int_keys); BenchmarkVectorCreationAndInsertion>( - "autovector", kOps, insertions, int_keys - ); + "autovector", kOps, insertions, int_keys); cout << "-----------------------------------" << endl; } @@ -313,7 +313,7 @@ TEST_F(AutoVectorTest, PerfBench) { cout << "=====================================================" << endl; cout << "Sequence Access Test" << endl; cout << "=====================================================" << endl; - for (auto elem_size : { kSize / 2, kSize, 2 * kSize }) { + for (auto elem_size : {kSize / 2, kSize, 2 * kSize}) { BenchmarkSequenceAccess>("std::vector", kOps, elem_size); BenchmarkSequenceAccess>("autovector", kOps, diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 4adf522e461..06dd1de06c6 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -110,9 +110,7 @@ class FullBloomTest : public testing::TestWithParam { void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); } - void Add(const Slice& s) { - bits_builder_->AddKey(s); - } + void Add(const Slice& s) { bits_builder_->AddKey(s); } void OpenRaw(const Slice& s) { bits_reader_.reset(policy_->GetFilterBitsReader(s)); @@ -124,9 +122,7 @@ class FullBloomTest : public testing::TestWithParam { filter_size_ = filter.size(); } - size_t FilterSize() const { - return filter_size_; - } + size_t FilterSize() const { return filter_size_; } Slice FilterData() { return Slice(buf_.get(), filter_size_); } @@ -319,7 +315,7 @@ TEST_P(FullBloomTest, FullVaryingLengths) { double rate = FalsePositiveRate(); if (kVerbose >= 1) { fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", - rate*100.0, length, static_cast(FilterSize())); + rate * 100.0, length, static_cast(FilterSize())); } if (FLAGS_bits_per_key == 10) { EXPECT_LE(rate, 0.02); // Must not be over 2% @@ -331,8 +327,8 @@ TEST_P(FullBloomTest, FullVaryingLengths) { } } if (kVerbose >= 1) { - fprintf(stderr, "Filters: %d good, %d mediocre\n", - good_filters, mediocre_filters); + fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters, + mediocre_filters); } EXPECT_LE(mediocre_filters, good_filters / 5); } @@ -810,14 +806,14 @@ TEST_P(FullBloomTest, Schema) { struct RawFilterTester { // Buffer, from which we always return a tail Slice, so the // last five bytes are always the metadata bytes. 
-  std::array data_;
+  std::array data_{};
   // Points five bytes from the end
   char* metadata_ptr_;
 
   RawFilterTester() : metadata_ptr_(&*(data_.end() - 5)) {}
 
   Slice ResetNoFill(uint32_t len_without_metadata, uint32_t num_lines,
-                      uint32_t num_probes) {
+                    uint32_t num_probes) {
     metadata_ptr_[0] = static_cast<char>(num_probes);
     EncodeFixed32(metadata_ptr_ + 1, num_lines);
     uint32_t len = len_without_metadata + /*metadata*/ 5;
@@ -826,13 +822,13 @@ struct RawFilterTester {
   }
 
   Slice Reset(uint32_t len_without_metadata, uint32_t num_lines,
-                uint32_t num_probes, bool fill_ones) {
+              uint32_t num_probes, bool fill_ones) {
     data_.fill(fill_ones ? 0xff : 0);
     return ResetNoFill(len_without_metadata, num_lines, num_probes);
   }
 
   Slice ResetWeirdFill(uint32_t len_without_metadata, uint32_t num_lines,
-                         uint32_t num_probes) {
+                       uint32_t num_probes) {
     for (uint32_t i = 0; i < data_.size(); ++i) {
       data_[i] = static_cast<char>(0x7b7b >> (i % 7));
     }
diff --git a/util/coding.cc b/util/coding.cc
index a54324d2881..3da8afaa271 100644
--- a/util/coding.cc
+++ b/util/coding.cc
@@ -10,6 +10,7 @@
 #include "util/coding.h"
 
 #include
+
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 
diff --git a/util/coding.h b/util/coding.h
index 72f63bc6b29..3168fd2fd1f 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -25,7 +25,7 @@
 
 // Some processors do not allow unaligned access to memory
 #if defined(__sparc)
-  #define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED
+#define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED
 #endif
 
 namespace ROCKSDB_NAMESPACE {
@@ -82,8 +82,10 @@ inline int64_t zigzagToI64(uint64_t n) {
 // in *v and return a pointer just past the parsed value, or return
 // nullptr on error. These routines only look at bytes in the range
 // [p..limit-1]
-extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
-extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+extern const char* GetVarint32Ptr(const char* p, const char* limit,
+                                  uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p, const char* limit,
+                                  uint64_t* v);
 inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
                                         int64_t* value) {
   uint64_t u = 0;
@@ -102,11 +104,9 @@ extern char* EncodeVarint32(char* dst, uint32_t value);
 extern char* EncodeVarint64(char* dst, uint64_t value);
 
 // Internal routine for use by fallback path of GetVarint32Ptr
-extern const char* GetVarint32PtrFallback(const char* p,
-                                          const char* limit,
+extern const char* GetVarint32PtrFallback(const char* p, const char* limit,
                                           uint32_t* value);
-inline const char* GetVarint32Ptr(const char* p,
-                                  const char* limit,
+inline const char* GetVarint32Ptr(const char* p, const char* limit,
                                   uint32_t* value) {
   if (p < limit) {
     uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
@@ -133,7 +133,7 @@ inline void PutFixed16(std::string* dst, uint16_t value) {
 inline void PutFixed32(std::string* dst, uint32_t value) {
   if (port::kLittleEndian) {
     dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
-        sizeof(value));
+                sizeof(value));
   } else {
     char buf[sizeof(value)];
     EncodeFixed32(buf, value);
@@ -144,7 +144,7 @@ inline void PutFixed32(std::string* dst, uint32_t value) {
 inline void PutFixed64(std::string* dst, uint64_t value) {
   if (port::kLittleEndian) {
     dst->append(const_cast<const char*>(reinterpret_cast<char*>(&value)),
-        sizeof(value));
+                sizeof(value));
   } else {
     char buf[sizeof(value)];
     EncodeFixed64(buf, value);
@@ -350,7 +350,7 @@ inline Slice GetSliceUntil(Slice* slice, char delimiter) {
   return ret;
 }
 
-template<typename T>
+template <typename T>
 #ifdef ROCKSDB_UBSAN_RUN
 #if defined(__clang__)
__attribute__((__no_sanitize__("alignment")))
@@ -358,16 +358,17 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-inline void PutUnaligned(T *memory, const T &value) {
+inline void
+PutUnaligned(T* memory, const T& value) {
 #if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
-  char *nonAlignedMemory = reinterpret_cast<char*>(memory);
+  char* nonAlignedMemory = reinterpret_cast<char*>(memory);
   memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T));
 #else
   *memory = value;
 #endif
 }
 
-template<typename T>
+template <typename T>
 #ifdef ROCKSDB_UBSAN_RUN
 #if defined(__clang__)
 __attribute__((__no_sanitize__("alignment")))
@@ -375,9 +376,10 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-inline void GetUnaligned(const T *memory, T *value) {
+inline void
+GetUnaligned(const T* memory, T* value) {
 #if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
-  char *nonAlignedMemory = reinterpret_cast<char*>(value);
+  char* nonAlignedMemory = reinterpret_cast<char*>(value);
   memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T));
 #else
   *value = *memory;
diff --git a/util/coding_test.cc b/util/coding_test.cc
index 0f974277d51..79dd7b82e92 100644
--- a/util/coding_test.cc
+++ b/util/coding_test.cc
@@ -13,7 +13,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-class Coding { };
+class Coding {};
 TEST(Coding, Fixed16) {
   std::string s;
   for (uint16_t v = 0; v < 0xFFFF; v++) {
@@ -56,15 +56,15 @@ TEST(Coding, Fixed64) {
     uint64_t v = static_cast<uint64_t>(1) << power;
 
     uint64_t actual = 0;
     actual = DecodeFixed64(p);
-    ASSERT_EQ(v-1, actual);
+    ASSERT_EQ(v - 1, actual);
     p += sizeof(uint64_t);
 
     actual = DecodeFixed64(p);
-    ASSERT_EQ(v+0, actual);
+    ASSERT_EQ(v + 0, actual);
     p += sizeof(uint64_t);
 
     actual = DecodeFixed64(p);
-    ASSERT_EQ(v+1, actual);
+    ASSERT_EQ(v + 1, actual);
     p += sizeof(uint64_t);
   }
 }
@@ -125,8 +125,8 @@ TEST(Coding, Varint64) {
     // Test values near powers of two
     const uint64_t power = 1ull << k;
     values.push_back(power);
-    values.push_back(power-1);
-    values.push_back(power+1);
+    values.push_back(power - 1);
+    values.push_back(power + 1);
   };
 
   std::string s;
@@ -146,14 +146,13 @@ TEST(Coding, Varint64) {
     ASSERT_EQ(VarintLength(actual), p - start);
   }
   ASSERT_EQ(p, limit);
-
 }
 
 TEST(Coding, Varint32Overflow) {
   uint32_t result;
   std::string input("\x81\x82\x83\x84\x85\x11");
-  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
-              == nullptr);
+  ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(),
+                             &result) == nullptr);
 }
 
 TEST(Coding, Varint32Truncation) {
@@ -164,16 +163,16 @@ TEST(Coding, Varint32Truncation) {
   for (unsigned int len = 0; len + 1 < s.size(); len++) {
     ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr);
   }
-  ASSERT_TRUE(
-      GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != nullptr);
+  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) !=
+              nullptr);
   ASSERT_EQ(large_value, result);
 }
 
 TEST(Coding, Varint64Overflow) {
   uint64_t result;
   std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
-  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
-              == nullptr);
+  ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(),
+                             &result) == nullptr);
 }
 
 TEST(Coding, Varint64Truncation) {
@@ -184,8 +183,8 @@ TEST(Coding, Varint64Truncation) {
   for (unsigned int len = 0; len + 1 < s.size(); len++) {
     ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr);
   }
-  ASSERT_TRUE(
-      GetVarint64Ptr(s.data(), s.data() +
s.size(), &result) != nullptr); + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != + nullptr); ASSERT_EQ(large_value, result); } diff --git a/util/comparator.cc b/util/comparator.cc index 72584de4348..f85ed69ee6c 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -29,7 +29,7 @@ namespace ROCKSDB_NAMESPACE { namespace { class BytewiseComparatorImpl : public Comparator { public: - BytewiseComparatorImpl() { } + BytewiseComparatorImpl() {} static const char* kClassName() { return "leveldb.BytewiseComparator"; } const char* Name() const override { return kClassName(); } @@ -97,7 +97,7 @@ class BytewiseComparatorImpl : public Comparator { const uint8_t byte = (*key)[i]; if (byte != static_cast(0xff)) { (*key)[i] = byte + 1; - key->resize(i+1); + key->resize(i + 1); return; } } @@ -147,7 +147,7 @@ class BytewiseComparatorImpl : public Comparator { class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { public: - ReverseBytewiseComparatorImpl() { } + ReverseBytewiseComparatorImpl() {} static const char* kClassName() { return "rocksdb.ReverseBytewiseComparator"; @@ -298,7 +298,7 @@ class ComparatorWithU64TsImpl : public Comparator { TComparator cmp_without_ts_; }; -}// namespace +} // namespace const Comparator* BytewiseComparator() { STATIC_AVOID_DESTRUCTION(BytewiseComparatorImpl, bytewise); diff --git a/util/compression.cc b/util/compression.cc index 8e2f01b1250..712d333ee63 100644 --- a/util/compression.cc +++ b/util/compression.cc @@ -85,14 +85,14 @@ void ZSTDStreamingCompress::Reset() { int ZSTDStreamingUncompress::Uncompress(const char* input, size_t input_size, char* output, size_t* output_pos) { - assert(input != nullptr && output != nullptr && output_pos != nullptr); + assert(output != nullptr && output_pos != nullptr); *output_pos = 0; // Don't need to uncompress an empty input if (input_size == 0) { return 0; } #ifdef ZSTD_STREAMING - if (input_buffer_.src != input) { + if (input) { // New input input_buffer_ = {input, input_size, /*pos=*/0}; } diff --git a/util/compression.h b/util/compression.h index 4fccbdb007e..31ff5a7554a 100644 --- a/util/compression.h +++ b/util/compression.h @@ -23,6 +23,7 @@ #include "memory/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/table.h" +#include "table/block_based/block_type.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -47,10 +48,12 @@ #if defined(ZSTD) #include -#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ +// v1.1.3+ +#if ZSTD_VERSION_NUMBER >= 10103 #include #endif // ZSTD_VERSION_NUMBER >= 10103 -#if ZSTD_VERSION_NUMBER >= 10400 // v1.4.0+ +// v1.4.0+ +#if ZSTD_VERSION_NUMBER >= 10400 #define ZSTD_STREAMING #endif // ZSTD_VERSION_NUMBER >= 10400 namespace ROCKSDB_NAMESPACE { @@ -143,6 +146,7 @@ class ZSTDUncompressCachedData { int64_t GetCacheIndex() const { return -1; } void CreateIfNeeded() {} void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {} + private: void ignore_padding__() { padding = nullptr; } }; @@ -318,6 +322,11 @@ struct UncompressionDict { const Slice& GetRawDict() const { return slice_; } + // For TypedCacheInterface + const Slice& ContentSlice() const { return slice_; } + static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock; + static constexpr BlockType kBlockType = BlockType::kCompressionDictionary; + #ifdef ROCKSDB_ZSTD_DDICT const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; } #endif // ROCKSDB_ZSTD_DDICT @@ -1256,7 +1265,7 @@ inline bool 
LZ4HC_Compress(const CompressionInfo& info,
   size_t compression_dict_size = compression_dict.size();
   if (compression_dict_data != nullptr) {
     LZ4_loadDictHC(stream, compression_dict_data,
-        static_cast<int>(compression_dict_size));
+                   static_cast<int>(compression_dict_size));
 
 #if LZ4_VERSION_NUMBER >= 10700  // r129+
@@ -1702,8 +1711,11 @@ class StreamingUncompress {
         compress_format_version_(compress_format_version),
         max_output_len_(max_output_len) {}
   virtual ~StreamingUncompress() = default;
-  // uncompress should be called again with the same input if output_size is
-  // equal to max_output_len or with the next input fragment.
+  // Uncompress can be called repeatedly to progressively process the same
+  // input buffer, or it can be called with a new input buffer. The input
+  // buffer has not been fully consumed as long as the return value is > 0
+  // or output_size == max_output_len. To continue processing the same
+  // input buffer, pass nullptr as the input argument.
   // Parameters:
   // input - buffer to uncompress
   // input_size - size of input buffer
diff --git a/util/compression_context_cache.cc b/util/compression_context_cache.cc
index f62ac0c9b5a..52c3fac72ac 100644
--- a/util/compression_context_cache.cc
+++ b/util/compression_context_cache.cc
@@ -9,11 +9,11 @@
 
 #include "util/compression_context_cache.h"
 
+#include <atomic>
+
 #include "util/compression.h"
 #include "util/core_local.h"
 
-#include <atomic>
-
 namespace ROCKSDB_NAMESPACE {
 namespace compression_cache {
diff --git a/util/concurrent_task_limiter_impl.cc b/util/concurrent_task_limiter_impl.cc
index 2342677d895..a0fc7331f53 100644
--- a/util/concurrent_task_limiter_impl.cc
+++ b/util/concurrent_task_limiter_impl.cc
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
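The revised Uncompress contract above implies a drain loop, which is easier to see in code. The following is a minimal sketch only, written against the ZSTDStreamingUncompress subclass from this file and assuming its constructor mirrors the (compress_format_version, max_output_len) parameters shown for StreamingUncompress; the fragment/fragment_size names and the Consume() sink are illustrative and not part of the patch:

    // Sketch (not part of the patch): drain a single compressed fragment.
    // Assumes <memory> and util/compression.h; Consume() is a hypothetical
    // sink for the uncompressed bytes.
    void DrainFragment(const char* fragment, size_t fragment_size,
                       uint32_t compress_format_version, size_t max_output_len) {
      ZSTDStreamingUncompress uncompress(compress_format_version, max_output_len);
      std::unique_ptr<char[]> output(new char[max_output_len]);
      const char* input = fragment;  // real pointer only on the first call
      size_t input_size = fragment_size;
      size_t output_pos = 0;
      int ret = 0;
      do {
        ret = uncompress.Uncompress(input, input_size, output.get(), &output_pos);
        if (ret < 0) {
          break;  // uncompression error
        }
        Consume(output.get(), output_pos);  // hand off the produced bytes
        input = nullptr;  // continue draining the same input buffer
      } while (ret > 0 || output_pos == max_output_len);
    }

Passing nullptr on continuation calls is the key point of the new wording: it distinguishes "keep working on the previous buffer" from "here is a fresh buffer".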
#include "util/concurrent_task_limiter_impl.h" + #include "rocksdb/concurrent_task_limiter.h" namespace ROCKSDB_NAMESPACE { @@ -16,17 +17,13 @@ ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl( const std::string& name, int32_t max_outstanding_task) : name_(name), max_outstanding_tasks_{max_outstanding_task}, - outstanding_tasks_{0} { - -} + outstanding_tasks_{0} {} ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() { assert(outstanding_tasks_ == 0); } -const std::string& ConcurrentTaskLimiterImpl::GetName() const { - return name_; -} +const std::string& ConcurrentTaskLimiterImpl::GetName() const { return name_; } void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) { max_outstanding_tasks_.store(limit, std::memory_order_relaxed); @@ -54,8 +51,8 @@ std::unique_ptr ConcurrentTaskLimiterImpl::GetToken( return nullptr; } -ConcurrentTaskLimiter* NewConcurrentTaskLimiter( - const std::string& name, int32_t limit) { +ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit) { return new ConcurrentTaskLimiterImpl(name, limit); } diff --git a/util/concurrent_task_limiter_impl.h b/util/concurrent_task_limiter_impl.h index d8c1e03cb07..4952ae23aa9 100644 --- a/util/concurrent_task_limiter_impl.h +++ b/util/concurrent_task_limiter_impl.h @@ -11,8 +11,8 @@ #include #include -#include "rocksdb/env.h" #include "rocksdb/concurrent_task_limiter.h" +#include "rocksdb/env.h" namespace ROCKSDB_NAMESPACE { diff --git a/util/crc32c.cc b/util/crc32c.cc index 59acb8c72b0..d71c71c2e42 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -64,301 +64,209 @@ static int arch_ppc_crc32 = 0; #endif static const uint32_t table0_[256] = { - 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, - 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, - 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, - 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, - 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, - 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, - 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, - 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, - 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, - 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, - 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, - 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, - 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, - 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, - 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, - 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, - 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, - 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, - 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, - 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, - 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, - 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, - 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, - 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, - 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, - 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, - 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, - 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, - 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, - 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, - 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, - 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, - 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, - 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, - 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, - 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, - 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, - 0x55326b08, 
0xa759e80b, 0xb4091bff, 0x466298fc, - 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, - 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, - 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, - 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, - 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, - 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, - 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, - 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, - 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, - 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, - 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, - 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, - 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, - 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, - 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, - 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, - 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, - 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, - 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, - 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, - 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, - 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, - 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, - 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, - 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, - 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 -}; + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 
0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; static const uint32_t table1_[256] = { - 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, - 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, - 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, - 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, - 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, - 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, - 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, - 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, - 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, - 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, - 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, - 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, - 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, - 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, - 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, - 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, - 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, - 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, - 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, - 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, - 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, - 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, - 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, - 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, - 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, - 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, - 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, - 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, - 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, - 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, - 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, - 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, - 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, - 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, - 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, - 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, - 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, - 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, - 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, - 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, - 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, - 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, - 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, - 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, - 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, - 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, - 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, - 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, - 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, - 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, - 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, - 0xd74a9e99, 
0xc4e806ee, 0xf00fae77, 0xe3ad3600, - 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, - 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, - 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, - 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, - 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, - 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, - 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, - 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, - 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, - 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, - 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, - 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 -}; + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, + 0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, 0x3fc5f181, 0x2c6769f6, + 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, + 0xcb1e630b, 0xd8bcfb7c, 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, 0xe29f20ba, 0xf13db8cd, + 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, + 0x298143b1, 0x3a23dbc6, 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, 0xff17c604, 0xecb55e73, + 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, + 0x0bcc548e, 0x186eccf9, 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, 0x5dc6f43d, 0x4e646c4a, + 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, + 0xe9537434, 0xfaf1ec43, 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, 0xbf59d487, 0xacfb4cf0, + 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, + 0x4b82460d, 0x5820de7a, 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, 0x66d73941, 0x7575a136, + 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, + 0xadc95a4a, 0xbe6bc23d, 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, 0x844819fb, 0x97ea818c, + 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, + 0x70938b71, 0x63311306, 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, 0x26992bc2, 0x353bb3b5, + 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, + 0x6d1b6dcf, 0x7eb9f5b8, 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, 0x3b11cd7c, 0x28b3550b, + 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, + 0xcfca5ff6, 0xdc68c781, 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, 0xe64b1c47, 0xf5e98430, + 0xc10e2ca9, 0xd2acb4de, 
0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, + 0x2d557f4c, 0x3ef7e73b, 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483}; static const uint32_t table2_[256] = { - 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, - 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, - 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, - 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, - 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, - 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, - 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, - 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, - 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, - 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, - 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, - 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, - 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, - 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, - 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, - 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, - 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, - 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, - 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, - 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, - 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, - 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, - 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, - 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, - 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, - 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, - 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, - 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, - 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, - 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, - 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, - 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, - 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, - 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, - 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5, - 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, - 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, - 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, - 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, - 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, - 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, - 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, - 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, - 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, - 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, - 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, - 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, - 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, - 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, - 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, - 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, - 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, - 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, - 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, - 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, - 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, - 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, - 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, - 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, - 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, - 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, - 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, - 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, - 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 -}; + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 
0x3b9f3664, + 0xd1b1f617, 0x74f06469, 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, 0x70a27d8a, 0xd5e3eff4, + 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, + 0x9942b558, 0x3c032726, 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, 0xd915c5d1, 0x7c5457af, + 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, + 0x40577089, 0xe516e2f7, 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, 0xc76580d9, 0x622412a7, + 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, + 0x2e85480b, 0x8bc4da75, 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, 0x8f96c396, 0x2ad751e8, + 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, + 0xf7908dda, 0x52d11fa4, 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, 0x56830647, 0xf3c29439, + 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, + 0xbf63ce95, 0x1a225ceb, 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, 0xb3764986, 0x1637dbf8, + 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, + 0x2a34fcde, 0x8f756ea0, 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, 0x6a638c57, 0xcf221e29, + 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, + 0x83834485, 0x26c2d6fb, 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, 0x2290cf18, 0x87d15d66, + 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, + 0x9df3018d, 0x38b293f3, 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, 0x3ce08a10, 0x99a1186e, + 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, + 0xd50042c2, 0x7041d0bc, 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, 0x9557324b, 0x3016a035, + 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, + 0x0c158713, 0xa954156d, 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8}; static const uint32_t table3_[256] = { - 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, - 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, - 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, - 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, - 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, - 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, - 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, - 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, - 0xd725148b, 
0x0a60be33, 0x6842370a, 0xb5079db2, - 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, - 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, - 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, - 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, - 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, - 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, - 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, - 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, - 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, - 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, - 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, - 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, - 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, - 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, - 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, - 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, - 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, - 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, - 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, - 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, - 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, - 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, - 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, - 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, - 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, - 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, - 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, - 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, - 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, - 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, - 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, - 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, - 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, - 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, - 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, - 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, - 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, - 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, - 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, - 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, - 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, - 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, - 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, - 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc, - 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, - 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, - 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, - 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, - 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, - 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, - 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, - 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, - 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, - 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, - 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 -}; + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, + 0xc4451272, 0x1900b8ca, 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, 0xe964b13d, 0x34211b85, + 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, + 0xdb65c0a9, 0x06206a11, 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, 0x2161776d, 0xfc24ddd5, + 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, + 0xfa04b7c4, 0x27411d7c, 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 
0x0c40d422, 0xd1057e9a, 0xaba65fe7, 0x76e3f55f, + 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, + 0x99a72e73, 0x44e284cb, 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, 0xb4868d3c, 0x69c32784, + 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, + 0xb8c6591e, 0x6583f3a6, 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, 0x95e7fa51, 0x48a250e9, + 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, + 0xa7e68bc5, 0x7aa3217d, 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, 0xa4e4aad9, 0x79a10061, + 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, + 0x7f816a70, 0xa2c4c0c8, 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, 0x8585ddb4, 0x58c0770c, + 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, + 0xb784ac20, 0x6ac10698, 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, 0x9aa50f6f, 0x47e0a5d7, + 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, + 0x3d4384aa, 0xe0062e12, 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, 0x106227e5, 0xcd278d5d, + 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, + 0x22635671, 0xff26fcc9, 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, 0xd867e1b5, 0x05224b0d, + 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, + 0x0302211c, 0xde478ba4, 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842}; // Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { +static inline uint32_t LE_LOAD32(const uint8_t* p) { return DecodeFixed32(reinterpret_cast(p)); } #if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64)) -static inline uint64_t LE_LOAD64(const uint8_t *p) { +static inline uint64_t LE_LOAD64(const uint8_t* p) { return DecodeFixed64(reinterpret_cast(p)); } #endif -static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { +static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) { uint32_t c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; - *l = table3_[c & 0xff] ^ - table2_[(c >> 8) & 0xff] ^ - table1_[(c >> 16) & 0xff] ^ - table0_[c >> 24]; + *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ + table1_[(c >> 16) & 0xff] ^ table0_[c >> 24]; // DO it twice. 
 c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
   *p += 4;
-  *l = table3_[c & 0xff] ^
-  table2_[(c >> 8) & 0xff] ^
-  table1_[(c >> 16) & 0xff] ^
-  table0_[c >> 24];
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
 }
 
 #if (!(defined(HAVE_POWER8) && defined(HAS_ALTIVEC))) && \
     (!defined(HAVE_ARM64_CRC)) || \
     defined(NO_THREEWAY_CRC32C)
-static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
+static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
 #ifndef HAVE_SSE42
   Slow_CRC32(l, p);
 #elif defined(__LP64__) || defined(_WIN64)
@@ -373,21 +281,20 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) {
 }
 #endif
 
-template<void (*CRC32)(uint64_t *, uint8_t const **)>
+template <void (*CRC32)(uint64_t*, uint8_t const**)>
 uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
-
-  const uint8_t *p = reinterpret_cast<const uint8_t*>(buf);
-  const uint8_t *e = p + size;
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  const uint8_t* e = p + size;
   uint64_t l = crc ^ 0xffffffffu;
 
 // Align n to (1 << m) byte boundary
-#define ALIGN(n, m)     ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
-
-#define STEP1 do {                              \
-    int c = (l & 0xff) ^ *p++;                  \
-    l = table0_[c] ^ (l >> 8);                  \
-} while (0)
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+#define STEP1                  \
+  do {                         \
+    int c = (l & 0xff) ^ *p++; \
+    l = table0_[c] ^ (l >> 8); \
+  } while (0)
 
   // Point x at first 16-byte aligned byte in string. This might be
   // just past the end of the string.
@@ -400,12 +307,12 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
     }
   }
   // Process bytes 16 at a time
-  while ((e-p) >= 16) {
+  while ((e - p) >= 16) {
     CRC32(&l, &p);
     CRC32(&l, &p);
   }
   // Process bytes 8 at a time
-  while ((e-p) >= 8) {
+  while ((e - p) >= 8) {
     CRC32(&l, &p);
   }
   // Process the last few bytes
@@ -440,8 +347,8 @@ static bool isSSE42() {
 
 static bool isPCLMULQDQ() {
 #ifndef HAVE_SSE42
-// in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ are
-// supported by compiler
+  // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ
+  // are supported by compiler
   return false;
 #elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
   uint32_t c_;
@@ -462,8 +369,8 @@ static bool isPCLMULQDQ() {
 
 using Function = uint32_t (*)(uint32_t, const char*, size_t);
 
 #if defined(HAVE_POWER8) && defined(HAS_ALTIVEC)
-uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) {
-  return crc32c_ppc(crc, (const unsigned char *)buf, size);
+uint32_t ExtendPPCImpl(uint32_t crc, const char* buf, size_t size) {
+  return crc32c_ppc(crc, (const unsigned char*)buf, size);
 }
 
 #if __linux__
@@ -500,8 +407,8 @@ static bool isAltiVec() {
 #endif
 
 #if defined(HAVE_ARM64_CRC)
-uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) {
-  return crc32c_arm64(crc, (const unsigned char *)buf, size);
+uint32_t ExtendARMImpl(uint32_t crc, const char* buf, size_t size) {
+  return crc32c_arm64(crc, (const unsigned char*)buf, size);
 }
 #endif
 
@@ -534,14 +441,12 @@ std::string IsFastCrc32Supported() {
 #endif
   if (has_fast_crc) {
     fast_zero_msg.append("Supported on " + arch);
-  }
-  else {
+  } else {
     fast_zero_msg.append("Not supported on " + arch);
   }
   return fast_zero_msg;
 }
 
-
 /*
  * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands
  *  This software is provided 'as-is', without any express or implied
@@ -568,9 +473,10 @@ std::string IsFastCrc32Supported() {
  *     "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
 *
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf * - * This version is from the folly library, created by Dave Watson + * This version is from the folly library, created by Dave Watson + * * -*/ + */ #if defined HAVE_SSE42 && defined HAVE_PCLMUL #define CRCtriplet(crc, buf, offset) \ @@ -582,10 +488,9 @@ std::string IsFastCrc32Supported() { crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); -#define CRCsinglet(crc, buf, offset) \ +#define CRCsinglet(crc, buf, offset) \ crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); - // Numbers taken directly from intel whitepaper. // clang-format off const uint64_t clmul_constants[] = { @@ -1262,7 +1167,7 @@ static inline Function Choose_Extend() { #else if (isSSE42()) { if (isPCLMULQDQ()) { -#if defined HAVE_SSE42 && defined HAVE_PCLMUL && !defined NO_THREEWAY_CRC32C +#if (defined HAVE_SSE42 && defined HAVE_PCLMUL) && !defined NO_THREEWAY_CRC32C return crc32c_3way; #else return ExtendImpl; // Fast_CRC32 will check HAVE_SSE42 itself diff --git a/util/crc32c.h b/util/crc32c.h index d4f397c06b7..a08ad60af3f 100644 --- a/util/crc32c.h +++ b/util/crc32c.h @@ -10,6 +10,7 @@ #pragma once #include #include + #include #include "rocksdb/rocksdb_namespace.h" @@ -31,9 +32,7 @@ extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); extern uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len); // Return the crc32c of data[0,n-1] -inline uint32_t Value(const char* data, size_t n) { - return Extend(0, data, n); -} +inline uint32_t Value(const char* data, size_t n) { return Extend(0, data, n); } static const uint32_t kMaskDelta = 0xa282ead8ul; diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc index a799fb6058c..4885f4fe101 100644 --- a/util/crc32c_arm64.cc +++ b/util/crc32c_arm64.cc @@ -22,6 +22,12 @@ #if defined(__APPLE__) #include #endif +#if defined(__OpenBSD__) +#include +#include +#include +#include +#endif #ifdef HAVE_ARM64_CRYPTO /* unfolding to compute 8 * 3 = 24 bytes parallelly */ @@ -46,7 +52,7 @@ extern bool pmull_runtime_flag; uint32_t crc32c_runtime_check(void) { -#if !defined(__APPLE__) +#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) uint64_t auxv = 0; #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) auxv = getauxval(AT_HWCAP); @@ -54,16 +60,29 @@ uint32_t crc32c_runtime_check(void) { elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv)); #endif return (auxv & HWCAP_CRC32) != 0; -#else +#elif defined(__APPLE__) int r; size_t l = sizeof(r); if (sysctlbyname("hw.optional.armv8_crc32", &r, &l, NULL, 0) == -1) return 0; return r == 1; +#elif defined(__OpenBSD__) + int r = 0; + const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0; + size_t len = sizeof(isar0); + + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) + r = 1; + } + return r; +#else + return 0; #endif } bool crc32c_pmull_runtime_check(void) { -#if !defined(__APPLE__) +#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) uint64_t auxv = 0; #if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) auxv = getauxval(AT_HWCAP); @@ -71,8 +90,21 @@ bool crc32c_pmull_runtime_check(void) { elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv)); #endif return (auxv & HWCAP_PMULL) != 0; -#else +#elif defined(__APPLE__) return true; +#elif defined(__OpenBSD__) + bool r = false; + const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0; 
+ size_t len = sizeof(isar0); + + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL) + r = true; + } + return r; +#else + return false; #endif } diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h index b16b1f3e0be..4b27fe87108 100644 --- a/util/crc32c_arm64.h +++ b/util/crc32c_arm64.h @@ -18,6 +18,7 @@ #define crc32c_u16(crc, v) __crc32ch(crc, v) #define crc32c_u32(crc, v) __crc32cw(crc, v) #define crc32c_u64(crc, v) __crc32cd(crc, v) +// clang-format off #define PREF4X64L1(buffer, PREF_OFFSET, ITR) \ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \ [c] "I"((PREF_OFFSET) + ((ITR) + 0) * 64)); \ @@ -27,6 +28,7 @@ [c] "I"((PREF_OFFSET) + ((ITR) + 2) * 64)); \ __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \ [c] "I"((PREF_OFFSET) + ((ITR) + 3) * 64)); +// clang-format on #define PREF1KL1(buffer, PREF_OFFSET) \ PREF4X64L1(buffer, (PREF_OFFSET), 0) \ diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc index e8929bfe4b6..715d63e2de0 100644 --- a/util/crc32c_test.cc +++ b/util/crc32c_test.cc @@ -15,8 +15,7 @@ namespace ROCKSDB_NAMESPACE { namespace crc32c { -class CRC { }; - +class CRC {}; // Tests for 3-way crc32c algorithm. We need these tests because it uses // different lookup tables than the original Fast_CRC32 @@ -31,42 +30,41 @@ struct ExpectedResult { ExpectedResult expectedResults[] = { // Zero-byte input - { 0, 0, ~0U }, + {0, 0, ~0U}, // Small aligned inputs to test special cases in SIMD implementations - { 8, 1, 1543413366 }, - { 8, 2, 523493126 }, - { 8, 3, 1560427360 }, - { 8, 4, 3422504776 }, - { 8, 5, 447841138 }, - { 8, 6, 3910050499 }, - { 8, 7, 3346241981 }, + {8, 1, 1543413366}, + {8, 2, 523493126}, + {8, 3, 1560427360}, + {8, 4, 3422504776}, + {8, 5, 447841138}, + {8, 6, 3910050499}, + {8, 7, 3346241981}, // Small unaligned inputs - { 9, 1, 3855826643 }, - { 10, 2, 560880875 }, - { 11, 3, 1479707779 }, - { 12, 4, 2237687071 }, - { 13, 5, 4063855784 }, - { 14, 6, 2553454047 }, - { 15, 7, 1349220140 }, + {9, 1, 3855826643}, + {10, 2, 560880875}, + {11, 3, 1479707779}, + {12, 4, 2237687071}, + {13, 5, 4063855784}, + {14, 6, 2553454047}, + {15, 7, 1349220140}, // Larger inputs to test leftover chunks at the end of aligned blocks - { 8, 8, 627613930 }, - { 8, 9, 2105929409 }, - { 8, 10, 2447068514 }, - { 8, 11, 863807079 }, - { 8, 12, 292050879 }, - { 8, 13, 1411837737 }, - { 8, 14, 2614515001 }, - { 8, 15, 3579076296 }, - { 8, 16, 2897079161 }, - { 8, 17, 675168386 }, + {8, 8, 627613930}, + {8, 9, 2105929409}, + {8, 10, 2447068514}, + {8, 11, 863807079}, + {8, 12, 292050879}, + {8, 13, 1411837737}, + {8, 14, 2614515001}, + {8, 15, 3579076296}, + {8, 16, 2897079161}, + {8, 17, 675168386}, // // Much larger inputs - { 0, BUFFER_SIZE, 2096790750 }, - { 1, BUFFER_SIZE / 2, 3854797577 }, + {0, BUFFER_SIZE, 2096790750}, + {1, BUFFER_SIZE / 2, 3854797577}, }; TEST(CRC, StandardResults) { - // Original Fast_CRC32 tests. // From rfc3720 section B.4. 
 char buf[32];
@@ -88,18 +86,10 @@ TEST(CRC, StandardResults) {
   ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));
 
   unsigned char data[48] = {
-      0x01, 0xc0, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00,
-      0x14, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x04, 0x00,
-      0x00, 0x00, 0x00, 0x14,
-      0x00, 0x00, 0x00, 0x18,
-      0x28, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00,
-      0x00, 0x00, 0x00, 0x00,
+      0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
+      0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
   };
   ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<const char*>(data), sizeof(data)));
@@ -114,21 +104,17 @@ TEST(CRC, StandardResults) {
 
   for (auto expected : expectedResults) {
     size_t partialLength = expected.length / 2;
     uint32_t partialChecksum = Value(buffer + expected.offset, partialLength);
-    uint32_t result = Extend(partialChecksum,
-                             buffer + expected.offset + partialLength,
-                             expected.length - partialLength);
+    uint32_t result =
+        Extend(partialChecksum, buffer + expected.offset + partialLength,
+               expected.length - partialLength);
     EXPECT_EQ(~expected.crc32c, result);
   }
-
 }
 
-TEST(CRC, Values) {
-  ASSERT_NE(Value("a", 1), Value("foo", 3));
-}
+TEST(CRC, Values) { ASSERT_NE(Value("a", 1), Value("foo", 3)); }
 
 TEST(CRC, Extend) {
-  ASSERT_EQ(Value("hello world", 11),
-            Extend(Value("hello ", 6), "world", 5));
+  ASSERT_EQ(Value("hello world", 11), Extend(Value("hello ", 6), "world", 5));
 }
 
 TEST(CRC, Mask) {
@@ -189,15 +175,14 @@ TEST(CRC, Crc32cCombineBigSizeTest) {
 
 // copied from folly
 const uint64_t FNV_64_HASH_START = 14695981039346656037ULL;
-inline uint64_t fnv64_buf(const void* buf,
-                          size_t n,
+inline uint64_t fnv64_buf(const void* buf, size_t n,
                           uint64_t hash = FNV_64_HASH_START) {
   // forcing signed char, since other platforms can use unsigned
   const signed char* char_buf = reinterpret_cast<const signed char*>(buf);
 
   for (size_t i = 0; i < n; ++i) {
     hash += (hash << 1) + (hash << 4) + (hash << 5) + (hash << 7) +
-        (hash << 8) + (hash << 40);
+            (hash << 8) + (hash << 40);
     hash ^= char_buf[i];
   }
   return hash;
diff --git a/util/defer.h b/util/defer.h
index 33c0243e650..f71e67ba9ca 100644
--- a/util/defer.h
+++ b/util/defer.h
@@ -37,7 +37,8 @@ namespace ROCKSDB_NAMESPACE {
 // but sometimes, this might lead to nested blocks of "if (s.ok()) {...}".
 //
 // With the help of Defer, you can centralize the cleanup logic inside the
-// lambda passed to Defer, and you can return immediately on failure when necessary.
+// lambda passed to Defer, and you can return immediately on failure when
+// necessary.
 class Defer final {
  public:
   explicit Defer(std::function<void()>&& fn) : fn_(std::move(fn)) {}
diff --git a/util/defer_test.cc b/util/defer_test.cc
index 1334e68b283..0e98f68b6d1 100644
--- a/util/defer_test.cc
+++ b/util/defer_test.cc
@@ -3,10 +3,11 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
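Since the Defer comment is touched above, a compact usage sketch of the pattern it describes may help; the file-copy helper below is illustrative (it assumes <cstdio> and <string>), and only Defer and Status come from RocksDB:

    // Cleanup is registered once, up front, and runs on every return path,
    // so failure paths can return immediately without nested if-blocks.
    Status CopyToFile(const std::string& src_name, const std::string& dst_name) {
      FILE* src = fopen(src_name.c_str(), "rb");
      FILE* dst = fopen(dst_name.c_str(), "wb");
      Defer cleanup([&]() {
        if (src != nullptr) fclose(src);
        if (dst != nullptr) fclose(dst);
      });
      if (src == nullptr || dst == nullptr) {
        return Status::IOError("open failed");  // Defer still closes handles
      }
      // ... copy bytes from src to dst ...
      return Status::OK();
    }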
+#include "util/defer.h"
+
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "test_util/testharness.h"
-#include "util/defer.h"
 
 namespace ROCKSDB_NAMESPACE {
 
diff --git a/util/duplicate_detector.h b/util/duplicate_detector.h
index 02f6d09fa05..d778622db81 100644
--- a/util/duplicate_detector.h
+++ b/util/duplicate_detector.h
@@ -54,7 +54,8 @@ class DuplicateDetector {
           db_->immutable_db_options().info_log,
           "Recovering an entry from the dropped column family %" PRIu32
           ". WAL must have been emptied before dropping the column "
-          "family", cf);
+          "family",
+          cf);
 #ifndef ROCKSDB_LITE
       throw std::runtime_error(
           "Recovering an entry from a dropped column family. "
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index 60d4b2cf29c..0ff3b4a758e 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -23,7 +23,7 @@ uint32_t roundUpToPow2(uint32_t x) {
   }
   return rv;
 }
-}
+}  // namespace
 
 DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
                            uint32_t num_probes, size_t huge_page_tlb_size,
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index 542183280a1..40cd2940445 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -6,15 +6,15 @@
 #pragma once
 
 #include
+#include
+#include
 #include
+
 #include "port/port.h"
 #include "rocksdb/slice.h"
 #include "table/multiget_context.h"
 #include "util/hash.h"
 
-#include
-#include
-
 namespace ROCKSDB_NAMESPACE {
 
 class Slice;
diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc
index 9f43ce0fd0a..925c5479ab0 100644
--- a/util/dynamic_bloom_test.cc
+++ b/util/dynamic_bloom_test.cc
@@ -314,7 +314,7 @@ TEST_F(DynamicBloomTest, concurrent_with_perf) {
 
 }  // namespace ROCKSDB_NAMESPACE
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   ParseCommandLineFlags(&argc, &argv, true);
diff --git a/util/filelock_test.cc b/util/filelock_test.cc
index e7f4d8ae4f4..69947a732e1 100644
--- a/util/filelock_test.cc
+++ b/util/filelock_test.cc
@@ -3,15 +3,16 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 //
-#include "rocksdb/status.h"
-#include "rocksdb/env.h"
-
 #include
+
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
 #ifdef __FreeBSD__
 #include
 #include
 #endif
 #include
+
 #include "test_util/testharness.h"
 #include "util/coding.h"
 #include "util/string_util.h"
@@ -32,23 +33,19 @@ class LockTest : public testing::Test {
 
   ~LockTest() override {}
 
-  Status LockFile(FileLock** db_lock) {
-    return env_->LockFile(file_, db_lock);
-  }
+  Status LockFile(FileLock** db_lock) { return env_->LockFile(file_, db_lock); }
 
-  Status UnlockFile(FileLock* db_lock) {
-    return env_->UnlockFile(db_lock);
-  }
+  Status UnlockFile(FileLock* db_lock) { return env_->UnlockFile(db_lock); }
 
-  bool AssertFileIsLocked(){
-    return CheckFileLock( /* lock_expected = */ true);
+  bool AssertFileIsLocked() {
+    return CheckFileLock(/* lock_expected = */ true);
   }
 
-  bool AssertFileIsNotLocked(){
-    return CheckFileLock( /* lock_expected = */ false);
+  bool AssertFileIsNotLocked() {
+    return CheckFileLock(/* lock_expected = */ false);
   }
 
-  bool CheckFileLock(bool lock_expected){
+  bool CheckFileLock(bool lock_expected) {
     // We need to fork to check the fcntl lock as we need
     // to open and close the file from a different process
     // to avoid either releasing the lock on close, or not
@@ -63,13 +60,13 @@ class LockTest : public testing::Test {
 #else
     pid_t pid = fork();
-    if ( 0 == pid ) {
+    if (0 == pid) {
       // child process
       int exit_val = EXIT_FAILURE;
       int fd = open(file_.c_str(), O_RDWR | O_CREAT, 0644);
       if (fd < 0) {
         // could not open file, could not check if it was locked
-        fprintf( stderr, "Open on file %s failed.\n",file_.c_str());
+        fprintf(stderr, "Open on file %s failed.\n", file_.c_str());
         exit(exit_val);
       }
 
@@ -78,23 +75,24 @@ class LockTest : public testing::Test {
       f.l_type = (F_WRLCK);
       f.l_whence = SEEK_SET;
       f.l_start = 0;
-      f.l_len = 0; // Lock/unlock entire file
+      f.l_len = 0;  // Lock/unlock entire file
       int value = fcntl(fd, F_SETLK, &f);
-      if( value == -1 ){
-        if( lock_expected ){
+      if (value == -1) {
+        if (lock_expected) {
           exit_val = EXIT_SUCCESS;
         }
       } else {
-        if( ! lock_expected ){
+        if (!lock_expected) {
           exit_val = EXIT_SUCCESS;
         }
       }
-      close(fd); // lock is released for child process
+      close(fd);  // lock is released for child process
       exit(exit_val);
     } else if (pid > 0) {
       // parent process
       int status;
-      while (-1 == waitpid(pid, &status, 0));
+      while (-1 == waitpid(pid, &status, 0))
+        ;
       if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
         // child process exited with non success status
         return false;
@@ -102,15 +100,13 @@ class LockTest : public testing::Test {
         return true;
       }
     } else {
-      fprintf( stderr, "Fork failed\n" );
+      fprintf(stderr, "Fork failed\n");
       return false;
     }
     return false;
 #endif
-
   }
-
 };
 LockTest* LockTest::current_;
@@ -122,7 +118,7 @@ TEST_F(LockTest, LockBySameThread) {
   ASSERT_OK(LockFile(&lock1));
 
   // check the file is locked
-  ASSERT_TRUE( AssertFileIsLocked() );
+  ASSERT_TRUE(AssertFileIsLocked());
 
   // re-acquire the lock on the same file. This should fail.
Status s = LockFile(&lock2); @@ -134,14 +130,13 @@ TEST_F(LockTest, LockBySameThread) { #endif // check the file is locked - ASSERT_TRUE( AssertFileIsLocked() ); + ASSERT_TRUE(AssertFileIsLocked()); // release the lock ASSERT_OK(UnlockFile(lock1)); // check the file is not locked - ASSERT_TRUE( AssertFileIsNotLocked() ); - + ASSERT_TRUE(AssertFileIsNotLocked()); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/filter_bench.cc b/util/filter_bench.cc index f1cf2711575..93186cd0841 100644 --- a/util/filter_bench.cc +++ b/util/filter_bench.cc @@ -195,7 +195,7 @@ struct KeyMaker { len += FastRange32( (val_num >> FLAGS_vary_key_size_log2_interval) * 1234567891, 5); } - char * data = buf_.get() + start; + char *data = buf_.get() + start; // Populate key data such that all data makes it into a key of at // least 8 bytes. We also don't want all the within-filter key // variance confined to a contiguous 32 bits, because then a 32 bit @@ -378,9 +378,9 @@ void FilterBench::Go() { FLAGS_average_keys_per_filter); const uint32_t variance_offset = variance_range / 2; - const std::vector &testModes = - FLAGS_best_case ? bestCaseTestModes - : FLAGS_quick ? quickTestModes : allTestModes; + const std::vector &testModes = FLAGS_best_case ? bestCaseTestModes + : FLAGS_quick ? quickTestModes + : allTestModes; m_queries_ = FLAGS_m_queries; double working_mem_size_mb = FLAGS_working_mem_size_mb; diff --git a/util/hash_test.cc b/util/hash_test.cc index 4b1d5223873..72112b04481 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -547,7 +547,7 @@ TEST(FastRangeGenericTest, Values) { uint16_t{6234}); // Not recommended for typical use because for example this could fail on // some platforms and pass on others: - //EXPECT_EQ(FastRangeGeneric(static_cast(0x80000000), + // EXPECT_EQ(FastRangeGeneric(static_cast(0x80000000), // uint16_t{12468}), // uint16_t{6234}); } @@ -843,7 +843,7 @@ TEST(MathTest, CodingGeneric) { EXPECT_EQ(std::string("_12"), std::string(out)); } -int main(int argc, char** argv) { +int main(int argc, char **argv) { fprintf(stderr, "NPHash64 id: %x\n", static_cast(ROCKSDB_NAMESPACE::GetSliceNPHash64("RocksDB"))); ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/util/heap.h b/util/heap.h index 3f4cddeb90c..f221fc7327e 100644 --- a/util/heap.h +++ b/util/heap.h @@ -8,6 +8,7 @@ #include #include #include + #include "port/port.h" #include "util/autovector.h" @@ -37,11 +38,11 @@ namespace ROCKSDB_NAMESPACE { // std::priority_queue: the comparison operator is expected to provide the // less-than relation, but top() will return the maximum. 
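The comment ending here states BinaryHeap's contract: as with std::priority_queue, the comparator supplies the less-than relation and top() yields the maximum, so a greater-than comparator turns it into a min-heap. The same convention shown with the standard container, as a quick illustration (not part of this diff):

    #include <cstdio>
    #include <functional>
    #include <queue>
    #include <vector>

    int main() {
      std::priority_queue<int> max_heap;  // std::less: maximum on top
      std::priority_queue<int, std::vector<int>, std::greater<int>>
          min_heap;                       // std::greater: minimum on top
      for (int v : {3, 1, 2}) {
        max_heap.push(v);
        min_heap.push(v);
      }
      std::printf("%d %d\n", max_heap.top(), min_heap.top());  // prints: 3 1
    }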
-template<typename T, class Compare = std::less<T>>
+template <typename T, class Compare = std::less<T>>
 class BinaryHeap {
  public:
-  BinaryHeap() { }
-  explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) { }
+  BinaryHeap() {}
+  explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) {}
 
   void push(const T& value) {
     data_.push_back(value);
@@ -86,7 +87,7 @@ class BinaryHeap {
     }
   }
 
-  void swap(BinaryHeap &other) {
+  void swap(BinaryHeap& other) {
     std::swap(cmp_, other.cmp_);
     data_.swap(other.data_);
     std::swap(root_cmp_cache_, other.root_cmp_cache_);
diff --git a/util/heap_test.cc b/util/heap_test.cc
index fd54751e3f0..bbb93324f5a 100644
--- a/util/heap_test.cc
+++ b/util/heap_test.cc
@@ -31,8 +31,7 @@ namespace ROCKSDB_NAMESPACE {
 using HeapTestValue = uint64_t;
 using Params = std::tuple<size_t, HeapTestValue, uint64_t>;
 
-class HeapTest : public ::testing::TestWithParam<Params> {
-};
+class HeapTest : public ::testing::TestWithParam<Params> {};
 
 TEST_P(HeapTest, Test) {
   // This test performs the same pseudorandom sequence of operations on a
@@ -54,15 +53,14 @@ TEST_P(HeapTest, Test) {
   std::mt19937 rng(static_cast<unsigned int>(RNG_SEED));
   std::uniform_int_distribution<HeapTestValue> value_dist(0, MAX_VALUE);
   int ndrains = 0;
-  bool draining = false;     // hit max size, draining until we empty the heap
+  bool draining = false;  // hit max size, draining until we empty the heap
   size_t size = 0;
   for (int64_t i = 0; i < FLAGS_iters; ++i) {
     if (size == 0) {
       draining = false;
     }
-    if (!draining &&
-        (size == 0 || std::bernoulli_distribution(0.4)(rng))) {
+    if (!draining && (size == 0 || std::bernoulli_distribution(0.4)(rng))) {
       // insert
       HeapTestValue val = value_dist(rng);
       heap.push(val);
@@ -104,30 +102,22 @@ TEST_P(HeapTest, Test) {
 }
 
 // Basic test, MAX_VALUE = 3*MAX_HEAP_SIZE (occasional duplicates)
-INSTANTIATE_TEST_CASE_P(
-  Basic, HeapTest,
-  ::testing::Values(Params(1000, 3000, 0x1b575cf05b708945))
-);
+INSTANTIATE_TEST_CASE_P(Basic, HeapTest,
+                        ::testing::Values(Params(1000, 3000,
+                                                 0x1b575cf05b708945)));
 
 // Mid-size heap with small values (many duplicates)
-INSTANTIATE_TEST_CASE_P(
-  SmallValues, HeapTest,
-  ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0))
-);
+INSTANTIATE_TEST_CASE_P(SmallValues, HeapTest,
+                        ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0)));
 
 // Small heap, large value range (no duplicates)
-INSTANTIATE_TEST_CASE_P(
-  SmallHeap, HeapTest,
-  ::testing::Values(Params(10, ULLONG_MAX, 0x3e1fa8f4d01707cf))
-);
+INSTANTIATE_TEST_CASE_P(SmallHeap, HeapTest,
+                        ::testing::Values(Params(10, ULLONG_MAX,
+                                                 0x3e1fa8f4d01707cf)));
 
 // Two-element heap
-INSTANTIATE_TEST_CASE_P(
-  TwoElementHeap, HeapTest,
-  ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc))
-);
+INSTANTIATE_TEST_CASE_P(TwoElementHeap, HeapTest,
+                        ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc)));
 
 // One-element heap
-INSTANTIATE_TEST_CASE_P(
-  OneElementHeap, HeapTest,
-  ::testing::Values(Params(1, 3, 0x176a1019ab0b612e))
-);
+INSTANTIATE_TEST_CASE_P(OneElementHeap, HeapTest,
+                        ::testing::Values(Params(1, 3, 0x176a1019ab0b612e)));
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/kv_map.h b/util/kv_map.h
index 89300d7ac82..62be6d18e36 100644
--- a/util/kv_map.h
+++ b/util/kv_map.h
@@ -29,5 +29,5 @@ struct LessOfComparator {
 };
 
 using KVMap = std::map<std::string, std::string, LessOfComparator>;
-}
+}  // namespace stl_wrappers
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/murmurhash.cc b/util/murmurhash.cc
index 9ec4aa63357..a69f3918abe 100644
--- a/util/murmurhash.cc
+++ b/util/murmurhash.cc
@@ -10,6 +10,7 @@
   is under the MIT license.
*/ #include "murmurhash.h" + #include "port/lang.h" #if defined(__x86_64__) @@ -28,6 +29,7 @@ __attribute__((__no_sanitize__("alignment"))) __attribute__((__no_sanitize_undefined__)) #endif #endif +// clang-format off uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) { const uint64_t m = 0xc6a4a7935bd1e995; @@ -70,6 +72,7 @@ uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) return h; } +// clang-format on #elif defined(__i386__) @@ -85,7 +88,7 @@ uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) // 1. It will not work incrementally. // 2. It will not produce the same results on little-endian and big-endian // machines. - +// clang-format off unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) { // 'm' and 'r' are mixing constants generated offline. @@ -136,6 +139,7 @@ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) return h; } +// clang-format on #else @@ -143,7 +147,7 @@ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) // // Same as MurmurHash2, but endian- and alignment-neutral. // Half the speed though, alas. - +// clang-format off unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) { const unsigned int m = 0x5bd1e995; @@ -187,5 +191,6 @@ unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) return h; } +// clang-format on #endif diff --git a/util/murmurhash.h b/util/murmurhash.h index 5f66c4ebf04..7ef4cbbec88 100644 --- a/util/murmurhash.h +++ b/util/murmurhash.h @@ -11,23 +11,24 @@ */ #pragma once #include + #include "rocksdb/slice.h" #if defined(__x86_64__) #define MURMUR_HASH MurmurHash64A -uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ); +uint64_t MurmurHash64A(const void* key, int len, unsigned int seed); #define MurmurHash MurmurHash64A using murmur_t = uint64_t; #elif defined(__i386__) #define MURMUR_HASH MurmurHash2 -unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ); +unsigned int MurmurHash2(const void* key, int len, unsigned int seed); #define MurmurHash MurmurHash2 using murmur_t = unsigned int; #else #define MURMUR_HASH MurmurHashNeutral2 -unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ); +unsigned int MurmurHashNeutral2(const void* key, int len, unsigned int seed); #define MurmurHash MurmurHashNeutral2 using murmur_t = unsigned int; #endif diff --git a/util/mutexlock.h b/util/mutexlock.h index 8af29dd784a..94066b29ea4 100644 --- a/util/mutexlock.h +++ b/util/mutexlock.h @@ -9,9 +9,11 @@ #pragma once #include + #include #include #include + #include "port/port.h" namespace ROCKSDB_NAMESPACE { @@ -28,9 +30,7 @@ namespace ROCKSDB_NAMESPACE { class MutexLock { public: - explicit MutexLock(port::Mutex *mu) : mu_(mu) { - this->mu_->Lock(); - } + explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); } // No copying allowed MutexLock(const MutexLock &) = delete; void operator=(const MutexLock &) = delete; @@ -48,9 +48,7 @@ class MutexLock { // class ReadLock { public: - explicit ReadLock(port::RWMutex *mu) : mu_(mu) { - this->mu_->ReadLock(); - } + explicit ReadLock(port::RWMutex *mu) : mu_(mu) { this->mu_->ReadLock(); } // No copying allowed ReadLock(const ReadLock &) = delete; void operator=(const ReadLock &) = delete; @@ -84,9 +82,7 @@ class ReadUnlock { // class WriteLock { public: - explicit WriteLock(port::RWMutex *mu) : mu_(mu) { - this->mu_->WriteLock(); - } + explicit 
WriteLock(port::RWMutex *mu) : mu_(mu) { this->mu_->WriteLock(); } // No copying allowed WriteLock(const WriteLock &) = delete; void operator=(const WriteLock &) = delete; @@ -152,13 +148,11 @@ class Striped { public: Striped(size_t stripes, std::function hash) : stripes_(stripes), hash_(hash) { - locks_ = reinterpret_cast *>( port::cacheline_aligned_alloc(sizeof(LockData) * stripes)); for (size_t i = 0; i < stripes; i++) { new (&locks_[i]) LockData(); } - } virtual ~Striped() { diff --git a/util/random.cc b/util/random.cc index 5d9f4bc67c2..c94c28dfb2b 100644 --- a/util/random.cc +++ b/util/random.cc @@ -8,6 +8,7 @@ #include #include + #include #include diff --git a/util/random.h b/util/random.h index 16162f67bb9..8923bdc4f03 100644 --- a/util/random.h +++ b/util/random.h @@ -85,9 +85,7 @@ class Random { // Skewed: pick "base" uniformly from range [0,max_log] and then // return "base" random bits. The effect is to pick a number in the // range [0,2^max_log-1] with exponential bias towards smaller numbers. - uint32_t Skewed(int max_log) { - return Uniform(1 << Uniform(max_log + 1)); - } + uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); } // Returns a random string of length "len" std::string RandomString(int len); @@ -153,7 +151,7 @@ class Random64 { std::mt19937_64 generator_; public: - explicit Random64(uint64_t s) : generator_(s) { } + explicit Random64(uint64_t s) : generator_(s) {} // Generates the next random number uint64_t Next() { return generator_(); } diff --git a/util/random_test.cc b/util/random_test.cc index b8c9357e8a6..1aa62c5da64 100644 --- a/util/random_test.cc +++ b/util/random_test.cc @@ -7,11 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "util/random.h" + #include #include #include "test_util/testharness.h" -#include "util/random.h" using ROCKSDB_NAMESPACE::Random; diff --git a/util/ribbon_test.cc b/util/ribbon_test.cc index ae6d1db498c..6519df3d5fb 100644 --- a/util/ribbon_test.cc +++ b/util/ribbon_test.cc @@ -836,9 +836,10 @@ TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) { double single_failure_rate = 1.0 * total_single_failures / total_singles; fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate); // A rough bound (one sided) based on nothing in particular - double expected_single_failures = - 1.0 * total_singles / - (sizeof(CoeffRow) == 16 ? 128 : TypeParam::kUseSmash ? 64 : 32); + double expected_single_failures = 1.0 * total_singles / + (sizeof(CoeffRow) == 16 ? 128 + : TypeParam::kUseSmash ? 
64 + : 32); EXPECT_LE(total_single_failures, InfrequentPoissonUpperBound(expected_single_failures)); } diff --git a/util/slice.cc b/util/slice.cc index f9f4ddd596e..1fa21afcb2d 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -128,7 +128,7 @@ class CappedPrefixTransform : public SliceTransform { class NoopTransform : public SliceTransform { public: - explicit NoopTransform() { } + explicit NoopTransform() {} static const char* kClassName() { return "rocksdb.Noop"; } const char* Name() const override { return kClassName(); } diff --git a/util/slice_test.cc b/util/slice_test.cc index 486590e7b54..e1c35d567f3 100644 --- a/util/slice_test.cc +++ b/util/slice_test.cc @@ -32,8 +32,7 @@ void Multiplier(void* arg1, void* arg2) { class PinnableSliceTest : public testing::Test { public: - void AssertSameData(const std::string& expected, - const PinnableSlice& slice) { + void AssertSameData(const std::string& expected, const PinnableSlice& slice) { std::string got; got.assign(slice.data(), slice.size()); ASSERT_EQ(expected, got); diff --git a/util/status.cc b/util/status.cc index 2c9aa501525..1156b10ef49 100644 --- a/util/status.cc +++ b/util/status.cc @@ -8,11 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/status.h" + #include #ifdef OS_WIN #include #endif #include + #include "port/port.h" namespace ROCKSDB_NAMESPACE { @@ -67,6 +69,13 @@ Status::Status(Code _code, SubCode _subcode, const Slice& msg, state_.reset(result); } +Status Status::CopyAppendMessage(const Status& s, const Slice& delim, + const Slice& msg) { + // (No attempt at efficiency) + return Status(s.code(), s.subcode(), s.severity(), + std::string(s.getState()) + delim.ToString() + msg.ToString()); +} + std::string Status::ToString() const { #ifdef ROCKSDB_ASSERT_STATUS_CHECKED checked_ = true; diff --git a/util/string_util.cc b/util/string_util.cc index 94459dac46a..324482a4cd7 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -8,6 +8,7 @@ #include #include #include + #include #include #include @@ -15,6 +16,7 @@ #include #include #include + #include "port/port.h" #include "port/sys_time.h" #include "rocksdb/slice.h" diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index 51edee020e2..af4e6235595 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -3,8 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
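Aside: the new Status::CopyAppendMessage() introduced in util/status.cc above builds an annotated copy of a status without mutating the original. A usage sketch (ours; it assumes the input status already carries a message, since the helper reads getState() unconditionally):

    #include <cassert>
    #include <string>

    #include "rocksdb/status.h"

    void CopyAppendExample() {
      using ROCKSDB_NAMESPACE::Status;
      Status s = Status::IOError("base message");
      Status annotated =
          Status::CopyAppendMessage(s, ": ", "while reading MANIFEST");
      assert(annotated.code() == s.code());  // code/subcode/severity carry over
      assert(annotated.ToString().find("while reading") != std::string::npos);
    }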
-#include #include +#include #include "monitoring/thread_status_updater.h" #include "rocksdb/db.h" @@ -17,16 +17,19 @@ namespace ROCKSDB_NAMESPACE { class SimulatedBackgroundTask { public: SimulatedBackgroundTask( - const void* db_key, const std::string& db_name, - const void* cf_key, const std::string& cf_name, + const void* db_key, const std::string& db_name, const void* cf_key, + const std::string& cf_name, const ThreadStatus::OperationType operation_type = ThreadStatus::OP_UNKNOWN, - const ThreadStatus::StateType state_type = - ThreadStatus::STATE_UNKNOWN) - : db_key_(db_key), db_name_(db_name), - cf_key_(cf_key), cf_name_(cf_name), - operation_type_(operation_type), state_type_(state_type), - should_run_(true), running_count_(0) { + const ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN) + : db_key_(db_key), + db_name_(db_name), + cf_key_(cf_key), + cf_name_(cf_name), + operation_type_(operation_type), + state_type_(state_type), + should_run_(true), + running_count_(0) { Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( db_key_, db_name_, cf_key_, cf_name_); } @@ -92,24 +95,22 @@ class SimulatedBackgroundTask { class ThreadListTest : public testing::Test { public: - ThreadListTest() { - } + ThreadListTest() {} }; TEST_F(ThreadListTest, GlobalTables) { // verify the global tables for operations and states are properly indexed. for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { ASSERT_EQ(global_operation_table[type].type, type); - ASSERT_EQ(global_operation_table[type].name, - ThreadStatus::GetOperationName( - ThreadStatus::OperationType(type))); + ASSERT_EQ( + global_operation_table[type].name, + ThreadStatus::GetOperationName(ThreadStatus::OperationType(type))); } for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { ASSERT_EQ(global_state_table[type].type, type); ASSERT_EQ(global_state_table[type].name, - ThreadStatus::GetStateName( - ThreadStatus::StateType(type))); + ThreadStatus::GetStateName(ThreadStatus::StateType(type))); } for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) { @@ -131,18 +132,18 @@ TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); // Wait 1 second so that threads start Env::Default()->SleepForMicroseconds(kDelayMicros); - SimulatedBackgroundTask running_task( - reinterpret_cast(1234), "running", - reinterpret_cast(5678), "pikachu"); + SimulatedBackgroundTask running_task(reinterpret_cast(1234), "running", + reinterpret_cast(5678), + "pikachu"); for (int test = 0; test < kSimulatedHighPriThreads; ++test) { - env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, - &running_task, Env::Priority::HIGH); + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task, + Env::Priority::HIGH); } for (int test = 0; test < kSimulatedLowPriThreads; ++test) { - env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, - &running_task, Env::Priority::LOW); + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task, + Env::Priority::LOW); } running_task.WaitUntilScheduled(kSimulatedHighPriThreads + kSimulatedLowPriThreads); @@ -168,14 +169,10 @@ TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { ASSERT_EQ(0, env->ReserveThreads(kHighPriorityThreads, Env::Priority::HIGH)); ASSERT_EQ(0, env->ReserveThreads(kLowPriorityThreads, Env::Priority::LOW)); - ASSERT_EQ( - running_count[ThreadStatus::HIGH_PRIORITY], - kSimulatedHighPriThreads); - ASSERT_EQ( - running_count[ThreadStatus::LOW_PRIORITY], - kSimulatedLowPriThreads); 
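The scheduling calls above are the public Env thread-pool API; outside the test harness the pattern reduces to this sketch (function names ours):

    #include "rocksdb/env.h"

    void ScheduleSketch(void (*task)(void*), void* arg) {
      using ROCKSDB_NAMESPACE::Env;
      Env* env = Env::Default();
      env->SetBackgroundThreads(2, Env::HIGH);  // e.g. the flush pool
      env->SetBackgroundThreads(4, Env::LOW);   // e.g. the compaction pool
      env->Schedule(task, arg, Env::Priority::HIGH);
      env->Schedule(task, arg, Env::Priority::LOW);
    }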
- ASSERT_EQ( - running_count[ThreadStatus::USER], 0); + ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY], + kSimulatedHighPriThreads); + ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], kSimulatedLowPriThreads); + ASSERT_EQ(running_count[ThreadStatus::USER], 0); running_task.FinishAllTasks(); running_task.WaitUntilDone(); @@ -197,37 +194,33 @@ TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { } } - ASSERT_EQ( - running_count[ThreadStatus::HIGH_PRIORITY], 0); - ASSERT_EQ( - running_count[ThreadStatus::LOW_PRIORITY], 0); - ASSERT_EQ( - running_count[ThreadStatus::USER], 0); + ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY], 0); + ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], 0); + ASSERT_EQ(running_count[ThreadStatus::USER], 0); } namespace { - void UpdateStatusCounts( - const std::vector& thread_list, - int operation_counts[], int state_counts[]) { - for (auto thread_status : thread_list) { - operation_counts[thread_status.operation_type]++; - state_counts[thread_status.state_type]++; - } +void UpdateStatusCounts(const std::vector& thread_list, + int operation_counts[], int state_counts[]) { + for (auto thread_status : thread_list) { + operation_counts[thread_status.operation_type]++; + state_counts[thread_status.state_type]++; } +} - void VerifyAndResetCounts( - const int correct_counts[], int collected_counts[], int size) { - for (int i = 0; i < size; ++i) { - ASSERT_EQ(collected_counts[i], correct_counts[i]); - collected_counts[i] = 0; - } +void VerifyAndResetCounts(const int correct_counts[], int collected_counts[], + int size) { + for (int i = 0; i < size; ++i) { + ASSERT_EQ(collected_counts[i], correct_counts[i]); + collected_counts[i] = 0; } +} - void UpdateCount( - int operation_counts[], int from_event, int to_event, int amount) { - operation_counts[from_event] -= amount; - operation_counts[to_event] += amount; - } +void UpdateCount(int operation_counts[], int from_event, int to_event, + int amount) { + operation_counts[from_event] -= amount; + operation_counts[to_event] += amount; +} } // namespace TEST_F(ThreadListTest, SimpleEventTest) { @@ -236,62 +229,57 @@ TEST_F(ThreadListTest, SimpleEventTest) { // simulated tasks const int kFlushWriteTasks = 3; SimulatedBackgroundTask flush_write_task( - reinterpret_cast(1234), "running", - reinterpret_cast(5678), "pikachu", - ThreadStatus::OP_FLUSH); + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_FLUSH); const int kCompactionWriteTasks = 4; SimulatedBackgroundTask compaction_write_task( - reinterpret_cast(1234), "running", - reinterpret_cast(5678), "pikachu", - ThreadStatus::OP_COMPACTION); + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_COMPACTION); const int kCompactionReadTasks = 5; SimulatedBackgroundTask compaction_read_task( - reinterpret_cast(1234), "running", - reinterpret_cast(5678), "pikachu", - ThreadStatus::OP_COMPACTION); + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_COMPACTION); const int kCompactionWaitTasks = 6; SimulatedBackgroundTask compaction_wait_task( - reinterpret_cast(1234), "running", - reinterpret_cast(5678), "pikachu", - ThreadStatus::OP_COMPACTION); + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_COMPACTION); // setup right answers int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; - correct_operation_counts[ThreadStatus::OP_FLUSH] = - kFlushWriteTasks; + correct_operation_counts[ThreadStatus::OP_FLUSH] 
= kFlushWriteTasks; correct_operation_counts[ThreadStatus::OP_COMPACTION] = kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks; - env->SetBackgroundThreads( - correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH); + env->SetBackgroundThreads(correct_operation_counts[ThreadStatus::OP_FLUSH], + Env::HIGH); env->SetBackgroundThreads( correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW); // schedule the simulated tasks for (int t = 0; t < kFlushWriteTasks; ++t) { - env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, - &flush_write_task, Env::Priority::HIGH); + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &flush_write_task, + Env::Priority::HIGH); } flush_write_task.WaitUntilScheduled(kFlushWriteTasks); for (int t = 0; t < kCompactionWriteTasks; ++t) { env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, - &compaction_write_task, Env::Priority::LOW); + &compaction_write_task, Env::Priority::LOW); } compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks); for (int t = 0; t < kCompactionReadTasks; ++t) { env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, - &compaction_read_task, Env::Priority::LOW); + &compaction_read_task, Env::Priority::LOW); } compaction_read_task.WaitUntilScheduled(kCompactionReadTasks); for (int t = 0; t < kCompactionWaitTasks; ++t) { env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, - &compaction_wait_task, Env::Priority::LOW); + &compaction_wait_task, Env::Priority::LOW); } compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks); diff --git a/util/thread_local.cc b/util/thread_local.cc index 61c5f59dcfe..969639d9bc9 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "util/thread_local.h" -#include "util/mutexlock.h" -#include "port/likely.h" + #include +#include "port/likely.h" +#include "util/mutexlock.h" + namespace ROCKSDB_NAMESPACE { struct Entry { @@ -39,10 +41,7 @@ class StaticMeta; // --------------------------------------------------- struct ThreadData { explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst) - : entries(), - next(nullptr), - prev(nullptr), - inst(_inst) {} + : entries(), next(nullptr), prev(nullptr), inst(_inst) {} std::vector entries; ThreadData* next; ThreadData* prev; @@ -50,7 +49,7 @@ struct ThreadData { }; class ThreadLocalPtr::StaticMeta { -public: + public: StaticMeta(); // Return the next available Id @@ -107,7 +106,7 @@ class ThreadLocalPtr::StaticMeta { // should be used. One example is OnThreadExit() function. port::Mutex* MemberMutex() { return &mutex_; } -private: + private: // Get UnrefHandler for id with acquiring mutex // REQUIRES: mutex locked UnrefHandler GetHandler(uint32_t id); @@ -173,7 +172,7 @@ namespace wintlscleanup { // This is set to OnThreadExit in StaticMeta singleton constructor UnrefHandler thread_local_inclass_routine = nullptr; -pthread_key_t thread_local_key = pthread_key_t (-1); +pthread_key_t thread_local_key = pthread_key_t(-1); // Static callback function to call with each thread termination. void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) { @@ -189,7 +188,7 @@ void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) { } } -} // wintlscleanup +} // namespace wintlscleanup // extern "C" suppresses C++ name mangling so we know the symbol name for the // linker /INCLUDE:symbol pragma above. 
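For reference, the POSIX half of the cleanup machinery in this file rests on pthread TLS destructors: pthread_key_create() registers a callback that each exiting thread runs on its non-null slot value. A distilled sketch with a toy payload (not RocksDB code):

    #include <pthread.h>

    static pthread_key_t tls_key;

    // Runs once per exiting thread whose slot value is non-null.
    static void OnExitCb(void* ptr) { delete static_cast<int*>(ptr); }

    static void InitKey() {  // call exactly once, e.g. via pthread_once
      pthread_key_create(&tls_key, &OnExitCb);
    }

    static int* Slot() {
      void* p = pthread_getspecific(tls_key);
      if (p == nullptr) {
        p = new int(0);
        pthread_setspecific(tls_key, p);  // reclaimed by OnExitCb at exit
      }
      return static_cast<int*>(p);
    }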
@@ -298,9 +297,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { } ThreadLocalPtr::StaticMeta::StaticMeta() - : next_instance_id_(0), - head_(this), - pthread_key_(0) { + : next_instance_id_(0), head_(this), pthread_key_(0) { if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { abort(); } @@ -345,8 +342,7 @@ void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) { head_.prev = d; } -void ThreadLocalPtr::StaticMeta::RemoveThreadData( - ThreadData* d) { +void ThreadLocalPtr::StaticMeta::RemoveThreadData(ThreadData* d) { Mutex()->AssertHeld(); d->next->prev = d->prev; d->prev->next = d->next; @@ -406,7 +402,7 @@ void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { } bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, - void*& expected) { + void*& expected) { auto* tls = GetThreadLocal(); if (UNLIKELY(id >= tls->entries.size())) { // Need mutex to protect entries access within ReclaimId @@ -418,7 +414,7 @@ bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, } void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, - void* const replacement) { + void* const replacement) { MutexLock l(Mutex()); for (ThreadData* t = head_.next; t != &head_; t = t->next) { if (id < t->entries.size()) { @@ -443,9 +439,7 @@ void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) { } } -uint32_t ThreadLocalPtr::TEST_PeekId() { - return Instance()->PeekId(); -} +uint32_t ThreadLocalPtr::TEST_PeekId() { return Instance()->PeekId(); } void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) { MutexLock l(Mutex()); @@ -504,21 +498,13 @@ ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) } } -ThreadLocalPtr::~ThreadLocalPtr() { - Instance()->ReclaimId(id_); -} +ThreadLocalPtr::~ThreadLocalPtr() { Instance()->ReclaimId(id_); } -void* ThreadLocalPtr::Get() const { - return Instance()->Get(id_); -} +void* ThreadLocalPtr::Get() const { return Instance()->Get(id_); } -void ThreadLocalPtr::Reset(void* ptr) { - Instance()->Reset(id_, ptr); -} +void ThreadLocalPtr::Reset(void* ptr) { Instance()->Reset(id_, ptr); } -void* ThreadLocalPtr::Swap(void* ptr) { - return Instance()->Swap(id_, ptr); -} +void* ThreadLocalPtr::Swap(void* ptr) { return Instance()->Swap(id_, ptr); } bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) { return Instance()->CompareAndSwap(id_, ptr, expected); diff --git a/util/thread_local.h b/util/thread_local.h index 01790ccc087..fde68f86fb1 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -15,8 +15,8 @@ #include #include -#include "util/autovector.h" #include "port/port.h" +#include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -91,8 +91,7 @@ class ThreadLocalPtr { class StaticMeta; -private: - + private: static StaticMeta* Instance(); const uint32_t id_; diff --git a/util/thread_operation.h b/util/thread_operation.h index b5d4b6906df..c24fccd5c41 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -13,10 +13,10 @@ #pragma once -#include "rocksdb/thread_status.h" - #include +#include "rocksdb/thread_status.h" + namespace ROCKSDB_NAMESPACE { #ifdef ROCKSDB_USING_THREAD_STATUS @@ -36,10 +36,9 @@ struct OperationInfo { // Note that it's not designed to be constant as in the future we // might consider adding global count to the OperationInfo. 
static OperationInfo global_operation_table[] = { - {ThreadStatus::OP_UNKNOWN, ""}, - {ThreadStatus::OP_COMPACTION, "Compaction"}, - {ThreadStatus::OP_FLUSH, "Flush"} -}; + {ThreadStatus::OP_UNKNOWN, ""}, + {ThreadStatus::OP_COMPACTION, "Compaction"}, + {ThreadStatus::OP_FLUSH, "Flush"}}; struct OperationStageInfo { const ThreadStatus::OperationStage stage; @@ -50,27 +49,22 @@ struct OperationStageInfo { // Note that the string must be changed accordingly when the // associated function name changed. static OperationStageInfo global_op_stage_table[] = { - {ThreadStatus::STAGE_UNKNOWN, ""}, - {ThreadStatus::STAGE_FLUSH_RUN, - "FlushJob::Run"}, - {ThreadStatus::STAGE_FLUSH_WRITE_L0, - "FlushJob::WriteLevel0Table"}, - {ThreadStatus::STAGE_COMPACTION_PREPARE, - "CompactionJob::Prepare"}, - {ThreadStatus::STAGE_COMPACTION_RUN, - "CompactionJob::Run"}, - {ThreadStatus::STAGE_COMPACTION_PROCESS_KV, - "CompactionJob::ProcessKeyValueCompaction"}, - {ThreadStatus::STAGE_COMPACTION_INSTALL, - "CompactionJob::Install"}, - {ThreadStatus::STAGE_COMPACTION_SYNC_FILE, - "CompactionJob::FinishCompactionOutputFile"}, - {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH, - "MemTableList::PickMemtablesToFlush"}, - {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, - "MemTableList::RollbackMemtableFlush"}, - {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, - "MemTableList::TryInstallMemtableFlushResults"}, + {ThreadStatus::STAGE_UNKNOWN, ""}, + {ThreadStatus::STAGE_FLUSH_RUN, "FlushJob::Run"}, + {ThreadStatus::STAGE_FLUSH_WRITE_L0, "FlushJob::WriteLevel0Table"}, + {ThreadStatus::STAGE_COMPACTION_PREPARE, "CompactionJob::Prepare"}, + {ThreadStatus::STAGE_COMPACTION_RUN, "CompactionJob::Run"}, + {ThreadStatus::STAGE_COMPACTION_PROCESS_KV, + "CompactionJob::ProcessKeyValueCompaction"}, + {ThreadStatus::STAGE_COMPACTION_INSTALL, "CompactionJob::Install"}, + {ThreadStatus::STAGE_COMPACTION_SYNC_FILE, + "CompactionJob::FinishCompactionOutputFile"}, + {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH, + "MemTableList::PickMemtablesToFlush"}, + {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, + "MemTableList::RollbackMemtableFlush"}, + {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + "MemTableList::TryInstallMemtableFlushResults"}, }; // The structure that describes a state. @@ -85,8 +79,8 @@ struct StateInfo { // of the current ThreadStatusData will be pointing to one of the // rows in this global table. 
static StateInfo global_state_table[] = { - {ThreadStatus::STATE_UNKNOWN, ""}, - {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, + {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, }; struct OperationProperty { @@ -95,27 +89,24 @@ struct OperationProperty { }; static OperationProperty compaction_operation_properties[] = { - {ThreadStatus::COMPACTION_JOB_ID, "JobID"}, - {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"}, - {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"}, - {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"}, - {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"}, - {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"}, + {ThreadStatus::COMPACTION_JOB_ID, "JobID"}, + {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"}, + {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"}, + {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"}, + {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"}, + {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"}, }; static OperationProperty flush_operation_properties[] = { - {ThreadStatus::FLUSH_JOB_ID, "JobID"}, - {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"}, - {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"} -}; + {ThreadStatus::FLUSH_JOB_ID, "JobID"}, + {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"}, + {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"}}; #else -struct OperationInfo { -}; +struct OperationInfo {}; -struct StateInfo { -}; +struct StateInfo {}; #endif // ROCKSDB_USING_THREAD_STATUS } // namespace ROCKSDB_NAMESPACE diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc index e6d88213f24..09706cac57d 100644 --- a/util/threadpool_imp.cc +++ b/util/threadpool_imp.cc @@ -10,12 +10,12 @@ #include "util/threadpool_imp.h" #ifndef OS_WIN -# include +#include #endif #ifdef OS_LINUX -# include -# include +#include +#include #endif #include @@ -44,7 +44,6 @@ void ThreadPoolImpl::PthreadCall(const char* label, int result) { } struct ThreadPoolImpl::Impl { - Impl(); ~Impl(); @@ -61,16 +60,14 @@ struct ThreadPoolImpl::Impl { void LowerCPUPriority(CpuPriority pri); - void WakeUpAllThreads() { - bgsignal_.notify_all(); - } + void WakeUpAllThreads() { bgsignal_.notify_all(); } void BGThread(size_t thread_id); void StartBGThreads(); void Submit(std::function&& schedule, - std::function&& unschedule, void* tag); + std::function&& unschedule, void* tag); int UnSchedule(void* arg); @@ -124,41 +121,41 @@ struct ThreadPoolImpl::Impl { return released_threads_in_success; } -private: - static void BGThreadWrapper(void* arg); - - bool low_io_priority_; - CpuPriority cpu_priority_; - Env::Priority priority_; - Env* env_; - - int total_threads_limit_; - std::atomic_uint queue_len_; // Queue length. Used for stats reporting - // Number of reserved threads, managed by ReserveThreads(..) and - // ReleaseThreads(..), if num_waiting_threads_ is no larger than - // reserved_threads_, its thread will be blocked to ensure the reservation - // mechanism - int reserved_threads_; - // Number of waiting threads (Maximum number of threads that can be - // reserved), in rare cases, num_waiting_threads_ could be less than - // reserved_threads due to SetBackgroundThreadInternal or last - // excessive threads. 
- int num_waiting_threads_; - bool exit_all_threads_; - bool wait_for_jobs_to_complete_; - - // Entry per Schedule()/Submit() call - struct BGItem { - void* tag = nullptr; - std::function function; - std::function unschedFunction; + private: + static void BGThreadWrapper(void* arg); + + bool low_io_priority_; + CpuPriority cpu_priority_; + Env::Priority priority_; + Env* env_; + + int total_threads_limit_; + std::atomic_uint queue_len_; // Queue length. Used for stats reporting + // Number of reserved threads, managed by ReserveThreads(..) and + // ReleaseThreads(..), if num_waiting_threads_ is no larger than + // reserved_threads_, its thread will be blocked to ensure the reservation + // mechanism + int reserved_threads_; + // Number of waiting threads (Maximum number of threads that can be + // reserved), in rare cases, num_waiting_threads_ could be less than + // reserved_threads due to SetBackgroundThreadInternal or last + // excessive threads. + int num_waiting_threads_; + bool exit_all_threads_; + bool wait_for_jobs_to_complete_; + + // Entry per Schedule()/Submit() call + struct BGItem { + void* tag = nullptr; + std::function function; + std::function unschedFunction; }; using BGQueue = std::deque; - BGQueue queue_; + BGQueue queue_; - std::mutex mu_; - std::condition_variable bgsignal_; + std::mutex mu_; + std::condition_variable bgsignal_; std::vector bgthreads_; }; @@ -178,11 +175,9 @@ inline ThreadPoolImpl::Impl::Impl() bgsignal_(), bgthreads_() {} -inline -ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); } +inline ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); } void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) { - std::unique_lock lock(mu_); assert(!exit_all_threads_); @@ -208,8 +203,7 @@ void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) { wait_for_jobs_to_complete_ = false; } -inline -void ThreadPoolImpl::Impl::LowerIOPriority() { +inline void ThreadPoolImpl::Impl::LowerIOPriority() { std::lock_guard lock(mu_); low_io_priority_ = true; } @@ -247,10 +241,9 @@ void ThreadPoolImpl::Impl::BGThread(size_t thread_id) { if (exit_all_threads_) { // mechanism to let BG threads exit safely - if (!wait_for_jobs_to_complete_ || - queue_.empty()) { + if (!wait_for_jobs_to_complete_ || queue_.empty()) { break; - } + } } else if (IsLastExcessiveThread(thread_id)) { // Current thread is the last generated one and is excessive. 
// We always terminate excessive thread in the reverse order of @@ -365,7 +358,7 @@ void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) { } void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num, - bool allow_reduce) { + bool allow_reduce) { std::lock_guard lock(mu_); if (exit_all_threads_) { return; @@ -387,7 +380,7 @@ void ThreadPoolImpl::Impl::StartBGThreads() { // Start background thread if necessary while ((int)bgthreads_.size() < total_threads_limit_) { port::Thread p_t(&BGThreadWrapper, - new BGThreadMetadata(this, bgthreads_.size())); + new BGThreadMetadata(this, bgthreads_.size())); // Set the thread name to aid debugging #if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) @@ -407,8 +400,8 @@ void ThreadPoolImpl::Impl::StartBGThreads() { } void ThreadPoolImpl::Impl::Submit(std::function&& schedule, - std::function&& unschedule, void* tag) { - + std::function&& unschedule, + void* tag) { std::lock_guard lock(mu_); if (exit_all_threads_) { @@ -426,7 +419,7 @@ void ThreadPoolImpl::Impl::Submit(std::function&& schedule, item.unschedFunction = std::move(unschedule); queue_len_.store(static_cast(queue_.size()), - std::memory_order_relaxed); + std::memory_order_relaxed); if (!HasExcessiveThread()) { // Wake up at least one waiting thread. @@ -459,11 +452,10 @@ int ThreadPoolImpl::Impl::UnSchedule(void* arg) { } } queue_len_.store(static_cast(queue_.size()), - std::memory_order_relaxed); + std::memory_order_relaxed); } - - // Run unschedule functions outside the mutex + // Run unschedule functions outside the mutex for (auto& f : candidates) { f(); } @@ -471,17 +463,11 @@ int ThreadPoolImpl::Impl::UnSchedule(void* arg) { return count; } -ThreadPoolImpl::ThreadPoolImpl() : - impl_(new Impl()) { -} - +ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) {} -ThreadPoolImpl::~ThreadPoolImpl() { -} +ThreadPoolImpl::~ThreadPoolImpl() {} -void ThreadPoolImpl::JoinAllThreads() { - impl_->JoinThreads(false); -} +void ThreadPoolImpl::JoinAllThreads() { impl_->JoinThreads(false); } void ThreadPoolImpl::SetBackgroundThreads(int num) { impl_->SetBackgroundThreadsInternal(num, true); @@ -499,9 +485,7 @@ void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() { impl_->JoinThreads(true); } -void ThreadPoolImpl::LowerIOPriority() { - impl_->LowerIOPriority(); -} +void ThreadPoolImpl::LowerIOPriority() { impl_->LowerIOPriority(); } void ThreadPoolImpl::LowerCPUPriority(CpuPriority pri) { impl_->LowerCPUPriority(pri); @@ -516,13 +500,12 @@ void ThreadPoolImpl::SubmitJob(const std::function& job) { impl_->Submit(std::move(copy), std::function(), nullptr); } - void ThreadPoolImpl::SubmitJob(std::function&& job) { impl_->Submit(std::move(job), std::function(), nullptr); } -void ThreadPoolImpl::Schedule(void(*function)(void* arg1), void* arg, - void* tag, void(*unschedFunction)(void* arg)) { +void ThreadPoolImpl::Schedule(void (*function)(void* arg1), void* arg, + void* tag, void (*unschedFunction)(void* arg)) { if (unschedFunction == nullptr) { impl_->Submit(std::bind(function, arg), std::function(), tag); } else { @@ -531,9 +514,7 @@ void ThreadPoolImpl::Schedule(void(*function)(void* arg1), void* arg, } } -int ThreadPoolImpl::UnSchedule(void* arg) { - return impl_->UnSchedule(arg); -} +int ThreadPoolImpl::UnSchedule(void* arg) { return impl_->UnSchedule(arg); } void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); } diff --git a/util/threadpool_imp.h b/util/threadpool_imp.h index e072ae8d30a..a5109e38f51 100644 --- a/util/threadpool_imp.h +++ b/util/threadpool_imp.h @@ -8,11 +8,11 @@ // 
found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "rocksdb/threadpool.h" -#include "rocksdb/env.h" - -#include #include +#include + +#include "rocksdb/env.h" +#include "rocksdb/threadpool.h" namespace ROCKSDB_NAMESPACE { @@ -101,20 +101,20 @@ class ThreadPoolImpl : public ThreadPool { struct Impl; private: - - // Current public virtual interface does not provide usable - // functionality and thus can not be used internally to - // facade different implementations. - // - // We propose a pimpl idiom in order to easily replace the thread pool impl - // w/o touching the header file but providing a different .cc potentially - // CMake option driven. - // - // Another option is to introduce a Env::MakeThreadPool() virtual interface - // and override the environment. This would require refactoring ThreadPool usage. - // - // We can also combine these two approaches - std::unique_ptr impl_; + // Current public virtual interface does not provide usable + // functionality and thus can not be used internally to + // facade different implementations. + // + // We propose a pimpl idiom in order to easily replace the thread pool impl + // w/o touching the header file but providing a different .cc potentially + // CMake option driven. + // + // Another option is to introduce a Env::MakeThreadPool() virtual interface + // and override the environment. This would require refactoring ThreadPool + // usage. + // + // We can also combine these two approaches + std::unique_ptr impl_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/util/timer.h b/util/timer.h index c2fbf869b66..db71cefaf8d 100644 --- a/util/timer.h +++ b/util/timer.h @@ -194,7 +194,6 @@ class Timer { #endif // NDEBUG private: - void Run() { InstrumentedMutexLock l(&mutex_); @@ -302,9 +301,7 @@ class Timer { repeat_every_us(_repeat_every_us), valid(true) {} - void Cancel() { - valid = false; - } + void Cancel() { valid = false; } bool IsValid() const { return valid; } }; @@ -318,8 +315,7 @@ class Timer { } struct RunTimeOrder { - bool operator()(const FunctionInfo* f1, - const FunctionInfo* f2) { + bool operator()(const FunctionInfo* f1, const FunctionInfo* f2) { return f1->next_run_time_us > f2->next_run_time_us; } }; @@ -333,9 +329,8 @@ class Timer { bool running_; bool executing_task_; - std::priority_queue, - RunTimeOrder> heap_; + std::priority_queue, RunTimeOrder> + heap_; // In addition to providing a mapping from a function name to a function, // it is also responsible for memory management. 
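One detail worth calling out before the next file: the priority_queue reformatted in util/timer.h above is a min-heap in disguise. std::priority_queue keeps its maximum on top, so RunTimeOrder deliberately compares with '>' to surface the smallest next_run_time_us. Reduced sketch (ours):

    #include <cstdint>
    #include <queue>
    #include <vector>

    struct FnInfo {  // stand-in for Timer's FunctionInfo
      uint64_t next_run_time_us;
    };

    struct RunTimeOrder {
      bool operator()(const FnInfo* f1, const FnInfo* f2) const {
        return f1->next_run_time_us > f2->next_run_time_us;  // note: '>'
      }
    };

    // top() is now the function due to run soonest.
    using RunHeap =
        std::priority_queue<FnInfo*, std::vector<FnInfo*>, RunTimeOrder>;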
diff --git a/util/timer_queue.h b/util/timer_queue.h index 3bd517531a6..36a1744aca3 100644 --- a/util/timer_queue.h +++ b/util/timer_queue.h @@ -23,6 +23,7 @@ #pragma once #include + #include #include #include diff --git a/util/timer_queue_test.cc b/util/timer_queue_test.cc index 5f5f08f21bb..b3c3768ec79 100644 --- a/util/timer_queue_test.cc +++ b/util/timer_queue_test.cc @@ -25,6 +25,7 @@ // #include "util/timer_queue.h" + #include namespace Timing { diff --git a/util/xxhash.cc b/util/xxhash.cc index 07a607b8f4a..88852c3308a 100644 --- a/util/xxhash.cc +++ b/util/xxhash.cc @@ -36,11 +36,10 @@ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ - /* * xxhash.c instantiates functions defined in xxhash.h */ - +// clang-format off #ifndef XXH_STATIC_LINKING_ONLY #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ #endif // !defined(XXH_STATIC_LINKING_ONLY) diff --git a/util/xxhash.h b/util/xxhash.h index 706b97bd9ad..ad49bab816d 100644 --- a/util/xxhash.h +++ b/util/xxhash.h @@ -5,19 +5,22 @@ /* BEGIN RocksDB customizations */ #ifndef XXH_STATIC_LINKING_ONLY -#define XXH_STATIC_LINKING_ONLY 1 /* using xxhash.cc */ -#endif // !defined(XXH_STATIC_LINKING_ONLY) +// Using compiled xxhash.cc +#define XXH_STATIC_LINKING_ONLY 1 +#endif // !defined(XXH_STATIC_LINKING_ONLY) #ifndef XXH_NAMESPACE #define XXH_NAMESPACE ROCKSDB_ -#endif // !defined(XXH_NAMESPACE) -#include "port/lang.h" // for FALLTHROUGH_INTENDED, inserted as appropriate +#endif // !defined(XXH_NAMESPACE) + +// for FALLTHROUGH_INTENDED, inserted as appropriate +#include "port/lang.h" /* END RocksDB customizations */ // clang-format off /* * xxHash - Extremely Fast Hash algorithm * Header File - * Copyright (C) 2012-2020 Yann Collet + * Copyright (C) 2012-2021 Yann Collet * * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) * @@ -48,49 +51,142 @@ * - xxHash homepage: https://www.xxhash.com * - xxHash source repository: https://github.com/Cyan4973/xxHash */ + /*! * @mainpage xxHash * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include + * #include + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). 
+ * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * * @file xxhash.h * xxHash prototypes and implementation */ -/* TODO: update */ -/* Notice extracted from xxHash homepage: - -xxHash is an extremely fast hash algorithm, running at RAM speed limits. -It also successfully passes all tests from the SMHasher suite. - -Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) - -Name Speed Q.Score Author -xxHash 5.4 GB/s 10 -CrapWow 3.2 GB/s 2 Andrew -MurmurHash 3a 2.7 GB/s 10 Austin Appleby -SpookyHash 2.0 GB/s 10 Bob Jenkins -SBox 1.4 GB/s 9 Bret Mulvey -Lookup3 1.2 GB/s 9 Bob Jenkins -SuperFastHash 1.2 GB/s 1 Paul Hsieh -CityHash64 1.05 GB/s 10 Pike & Alakuijala -FNV 0.55 GB/s 5 Fowler, Noll, Vo -CRC32 0.43 GB/s 9 -MD5-32 0.33 GB/s 10 Ronald L. Rivest -SHA1-32 0.28 GB/s 10 - -Q.Score is a measure of quality of the hash function. -It depends on successfully passing SMHasher test set. -10 is a perfect score. - -Note: SMHasher's CRC32 implementation is not the fastest one. -Other speed-oriented implementations can be faster, -especially in combination with PCLMUL instruction: -https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 - -A 64-bit version, named XXH64, is available since r35. -It offers much better speed, but for 64-bit applications only. -Name Speed on 64 bits Speed on 32 bits -XXH64 13.8 GB/s 1.9 GB/s -XXH32 6.8 GB/s 6.0 GB/s -*/ #if defined (__cplusplus) extern "C" { @@ -100,21 +196,53 @@ extern "C" { * INLINE mode ******************************/ /*! - * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * * Use these build macros to inline xxhash into the target unit. * Inlining improves performance on small inputs, especially when the length is * expressed as a compile-time constant: * - * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html * * It also keeps xxHash symbols private to the unit, so they are not exported. * * Usage: + * @code{.c} * #define XXH_INLINE_ALL * #include "xxhash.h" - * + * @endcode * Do not compile and link xxhash.o as a separate object, as it is not useful. */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. 
+ * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ && !defined(XXH_INLINE_ALL_31684351384) /* this section should be traversed only once */ @@ -137,29 +265,80 @@ extern "C" { /* * This part deals with the special case where a unit wants to inline xxHash, - * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such - * as part of some previously included *.h header file. + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. * Without further action, the new include would just be ignored, * and functions would effectively _not_ be inlined (silent failure). * The following macros solve this situation by prefixing all inlined names, * avoiding naming collision with previous inclusions. */ -# ifdef XXH_NAMESPACE -# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" - /* - * Note: Alternative: #undef all symbols (it's a pretty large list). - * Without #error: it compiles, but functions are actually not inlined. - */ -# endif + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. + * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ # define XXH_NAMESPACE XXH_INLINE_ /* - * Some identifiers (enums, type names) are not symbols, but they must - * still be renamed to avoid redeclaration. 
+ * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. * Alternative solution: do not redeclare them. - * However, this requires some #ifdefs, and is a more dispersed action. - * Meanwhile, renaming can be achieved in a single block + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. */ -# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id # define XXH_OK XXH_IPREF(XXH_OK) # define XXH_ERROR XXH_IPREF(XXH_ERROR) # define XXH_errorcode XXH_IPREF(XXH_errorcode) @@ -178,21 +357,13 @@ extern "C" { # undef XXHASH_H_STATIC_13879238742 #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ - - /* **************************************************************** * Stable API *****************************************************************/ #ifndef XXHASH_H_5627135585666179 #define XXHASH_H_5627135585666179 1 - -/*! - * @defgroup public Public API - * Contains details on the public xxHash functions. - * @{ - */ -/* specific declaration modes for Windows */ +/*! @brief Marks a global symbol. */ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) # if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) # ifdef XXH_EXPORT @@ -205,24 +376,6 @@ extern "C" { # endif #endif -#ifdef XXH_DOXYGEN -/*! - * @brief Emulate a namespace by transparently prefixing all symbols. - * - * If you want to include _and expose_ xxHash functions from within your own - * library, but also want to avoid symbol collisions with other libraries which - * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix - * any public symbol from xxhash library with the value of XXH_NAMESPACE - * (therefore, avoid empty or numeric values). - * - * Note that no change is required within the calling program as long as it - * includes `xxhash.h`: Regular symbol names will be automatically translated - * by this header. 
- */ -# define XXH_NAMESPACE /* YOUR NAME HERE */ -# undef XXH_NAMESPACE -#endif - #ifdef XXH_NAMESPACE # define XXH_CAT(A,B) A##B # define XXH_NAME2(A,B) XXH_CAT(A,B) @@ -251,23 +404,28 @@ extern "C" { # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) # define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) # define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) # define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) # define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) # define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) # define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) # define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) # define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) # define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) # define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) # define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) /* XXH3_128bits */ # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) # define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) # define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) # define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) # define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) # define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) # define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) # define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) # define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) @@ -277,30 +435,64 @@ extern "C" { #endif +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + /* ************************************* * Version ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 8 #define XXH_VERSION_RELEASE 1 +/*! @brief Version number, encoded as two digits each */ #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) /*! * @brief Obtains the xxHash version. 
 *
- * This is only useful when xxHash is compiled as a shared library, as it is
- * independent of the version defined in the header.
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to the header file.
 *
- * @return `XXH_VERSION_NUMBER` as of when the libray was compiled.
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
 */
-XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);


/* ****************************
-* Definitions
+* Common basic types
******************************/
#include <stddef.h> /* size_t */
-typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+/*!
+ * @brief Exit code for the streaming API.
+ */
+typedef enum {
+ XXH_OK = 0, /*!< OK */
+ XXH_ERROR /*!< Error */
+} XXH_errorcode;


/*-**********************************************************************
@@ -313,39 +505,38 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 * Not necessarily defined to `uint32_t` but functionally equivalent.
 */
typedef uint32_t XXH32_hash_t;
+
#elif !defined (__VMS) \
 && (defined (__cplusplus) \
 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
# include <stdint.h>
 typedef uint32_t XXH32_hash_t;
+
#else
# include <limits.h>
# if UINT_MAX == 0xFFFFFFFFUL
 typedef unsigned int XXH32_hash_t;
+# elif ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
# else
-# if ULONG_MAX == 0xFFFFFFFFUL
- typedef unsigned long XXH32_hash_t;
-# else
-# error "unsupported platform: need a 32-bit type"
-# endif
+# error "unsupported platform: need a 32-bit type"
# endif
#endif

/*!
 * @}
 *
- * @defgroup xxh32_family XXH32 family
+ * @defgroup XXH32_family XXH32 family
 * @ingroup public
 * Contains functions used in the classic 32-bit xxHash algorithm.
 *
 * @note
- * XXH32 is considered rather weak by today's standards.
- * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
- * systems, and offers true 64/128 bit hash results. It provides a superior
- * level of dispersion, and greatly reduces the risks of collisions.
+ * XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ * Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ * and 64-bit systems, and offers true 64/128 bit hash results.
 *
- * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
- * @see @ref xxh32_impl for implementation details
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
 * @{
 */

@@ -354,6 +545,8 @@ typedef uint32_t XXH32_hash_t;
 *
 * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
 *
+ * See @ref single_shot_example "Single Shot Example" for an example.
+ *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 32-bit seed to alter the hash's output predictably.
@@ -371,8 +564,9 @@ typedef uint32_t XXH32_hash_t;
 * @see
 * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
 */
-XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);

+#ifndef XXH_NO_STREAM
/*!
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
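+ *
+ * For illustration, a minimal sketch of the call sequence (here `buf`/`bufSize`
+ * are placeholders for the input, and error handling is mostly omitted):
+ * @code{.c}
+ * XXH32_state_t* const state = XXH32_createState(); // allocate a state
+ * XXH32_reset(state, 0);                            // begin a session, seed of 0
+ * XXH32_update(state, buf, bufSize);                // feed data, possibly in several calls
+ * XXH32_hash_t const hash = XXH32_digest(state);    // read the hash of all data so far
+ * XXH32_freeState(state);                           // release the state
+ * @endcode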
@@ -395,32 +589,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_
 *
 * When done, release the state using `XXH*_freeState()`.
 *
- * Example code for incrementally hashing a file:
- * @code{.c}
- * #include <stdio.h>
- * #include <assert.h>
- * #define BUFFER_SIZE 256
- *
- * // Note: XXH64 and XXH3 use the same interface.
- * XXH32_hash_t
- * hashFile(FILE* stream)
- * {
- * XXH32_state_t* state;
- * unsigned char buf[BUFFER_SIZE];
- * size_t amt;
- * XXH32_hash_t hash;
- *
- * state = XXH32_createState(); // Create a state
- * assert(state != NULL); // Error check here
- * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed
- * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
- * XXH32_update(state, buf, amt); // Hash the file in chunks
- * }
- * hash = XXH32_digest(state); // Finalize the hash
- * XXH32_freeState(state); // Clean up
- * return hash;
- * }
- * @endcode
+ * @see streaming_example at the top of @ref xxhash.h for an example.
 */

/*!
@@ -437,7 +606,7 @@ typedef struct XXH32_state_s XXH32_state_t;
 * Must be freed with XXH32_freeState().
 * @return An allocated XXH32_state_t on success, `NULL` on failure.
 */
-XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
/*!
 * @brief Frees an @ref XXH32_state_t.
 *
@@ -505,7 +674,8 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void*
 *
 * @return The calculated xxHash32 value from that state.
 */
-XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */

/******* Canonical representation *******/
@@ -556,7 +726,52 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
 *
 * @return The converted hash.
 */
-XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in C++17 and C23.
+ * C++17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif


/*!
@@ -595,18 +810,17 @@ typedef uint64_t XXH64_hash_t;

/*!
* @} * - * @defgroup xxh64_family XXH64 family + * @defgroup XXH64_family XXH64 family * @ingroup public * @{ * Contains functions used in the classic 64-bit xxHash algorithm. * * @note * XXH3 provides competitive speed for both 32-bit and 64-bit systems, - * and offers true 64/128 bit hash results. It provides a superior level of - * dispersion, and greatly reduces the risks of collisions. + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. */ - /*! * @brief Calculates the 64-bit hash of @p input using xxHash64. * @@ -630,32 +844,35 @@ typedef uint64_t XXH64_hash_t; * @see * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. */ -XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); /******* Streaming *******/ +#ifndef XXH_NO_STREAM /*! * @brief The opaque state struct for the XXH64 streaming API. * * @see XXH64_state_s for details. */ typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ -XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); - -XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); -XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); -XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 /*! * @} * ************************************************************************ - * @defgroup xxh3_family XXH3 family + * @defgroup XXH3_family XXH3 family * @ingroup public * @{ * @@ -675,12 +892,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src * * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, * but does not require it. - * Any 32-bit and 64-bit targets that can run XXH32 smoothly - * can run XXH3 at competitive speeds, even without vector support. - * Further details are explained in the implementation. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. 
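+ *
+ * As a quick sketch, one-shot hashing reduces to a single call
+ * (`data`/`size` here are placeholders for the input buffer):
+ * @code{.c}
+ * XXH64_hash_t  const h64  = XXH3_64bits(data, size);
+ * XXH128_hash_t const h128 = XXH3_128bits(data, size);
+ * @endcode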
 *
 * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
- * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
+ * ZVector and scalar targets. This can be controlled via the @ref XXH_VECTOR
+ * macro. For the x86 family, an automatic dispatcher is included separately
+ * in @ref xxh_x86dispatch.c.
 *
 * XXH3 implementation is portable:
 * it has a generic C90 formulation that can be compiled on any platform,
@@ -696,24 +915,42 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 */
-
/*-**********************************************************************
* XXH3 64-bit variant
************************************************************************/

-/* XXH3_64bits():
- * default 64-bit variant, using default secret and default seed of 0.
- * It's the fastest variant. */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+/*!
+ * @brief 64-bit unseeded variant of XXH3.
+ *
+ * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms
+ * @see
+ * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see
+ * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);

-/*
- * XXH3_64bits_withSeed():
- * This variant generates a custom secret on the fly
- * based on default secret altered using the `seed` value.
+/*!
+ * @brief 64-bit seeded variant of XXH3.
+ *
+ * This variant generates a custom secret on the fly based on the default secret
+ * altered using the `seed` value.
+ *
 * While this operation is decently fast, note that it's not completely free.
- * Note: seed==0 produces the same results as XXH3_64bits().
+ *
+ * @note
+ * seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * @param input The data to hash
+ * @param length The length of @p input, in bytes
+ * @param seed The 64-bit seed to alter the state.
 */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);

/*!
 * The bare minimum size for a custom secret.
@@ -724,23 +961,29 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X
 */
#define XXH3_SECRET_SIZE_MIN 136

-/*
- * XXH3_64bits_withSecret():
+/*!
+ * @brief 64-bit variant of XXH3 with a custom "secret".
+ *
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
- * However, the quality of produced hash values depends on secret's entropy.
- * Technically, the secret must look like a bunch of random bytes.
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
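+ *
+ * For example, assuming `kSecret` already holds at least XXH3_SECRET_SIZE_MIN
+ * bytes of high-entropy data (all names here are placeholders):
+ * @code{.c}
+ * XXH64_hash_t const h = XXH3_64bits_withSecret(data, size,
+ *                                               kSecret, sizeof(kSecret));
+ * @endcode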
- * Whenever unsure about the "randomness" of the blob of bytes, - * consider relabelling it as a "custom seed" instead, - * and employ "XXH3_generateSecret()" (see below) - * to generate a high entropy secret derived from the custom seed. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); /******* Streaming *******/ +#ifndef XXH_NO_STREAM /* * Streaming requires state maintenance. * This operation costs memory and CPU. @@ -754,23 +997,23 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, * @see XXH3_state_s for details. */ typedef struct XXH3_state_s XXH3_state_t; -XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); -XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); /* * XXH3_64bits_reset(): * Initialize with default parameters. * digest will be equivalent to `XXH3_64bits()`. */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); /* * XXH3_64bits_reset_withSeed(): * Generate a custom secret from `seed`, and store it into `statePtr`. * digest will be equivalent to `XXH3_64bits_withSeed()`. */ -XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); -/* +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! * XXH3_64bits_reset_withSecret(): * `secret` is referenced, it _must outlive_ the hash streaming session. * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, @@ -779,10 +1022,11 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, * When in doubt about the randomness of a candidate `secret`, * consider employing `XXH3_generateSecret()` instead (see below). 
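+ *
+ * For illustration, a sketch of a streaming session with a custom secret,
+ * assuming `secret`/`secretSize` satisfy the conditions above and outlive
+ * the session:
+ * @code{.c}
+ * XXH3_state_t* const state = XXH3_createState();
+ * XXH3_64bits_reset_withSecret(state, secret, secretSize);
+ * XXH3_64bits_update(state, input, inputSize);
+ * XXH64_hash_t const h = XXH3_64bits_digest(state);
+ * XXH3_freeState(state);
+ * @endcode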
 */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);

-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */

/* note : canonical representation of XXH3 is the same as XXH64
 * since they both produce XXH64_hash_t values */
@@ -803,11 +1047,31 @@ typedef struct {
 XXH64_hash_t high64; /*!< `value >> 64` */
} XXH128_hash_t;

-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+/*!
+ * @brief Unseeded 128-bit variant of XXH3.
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
+ * @see
+ * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see
+ * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);

/******* Streaming *******/
+#ifndef XXH_NO_STREAM
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
@@ -820,12 +1084,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t le
 * All reset and streaming functions have the same meaning as their 64-bit counterparts.
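+ *
+ * For instance, a minimal sketch mirroring the 64-bit flow
+ * (`input`/`inputSize` are placeholders):
+ * @code{.c}
+ * XXH3_state_t* const state = XXH3_createState();
+ * XXH3_128bits_reset(state);
+ * XXH3_128bits_update(state, input, inputSize);
+ * XXH128_hash_t const h = XXH3_128bits_digest(state);
+ * XXH3_freeState(state);
+ * @endcode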
 */
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);

-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */

/* Following helper functions make it possible to compare XXH128_hash_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
@@ -835,26 +1100,26 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
 * XXH128_isEqual():
 * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
 */
-XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);

/*!
- * XXH128_cmp():
- *
+ * @brief Compares two @ref XXH128_hash_t.
 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
 *
- * return: >0 if *h128_1 > *h128_2
- * =0 if *h128_1 == *h128_2
- * <0 if *h128_1 < *h128_2
+ * @return >0 if *h128_1 > *h128_2
+ *         =0 if *h128_1 == *h128_2
+ *         <0 if *h128_1 < *h128_2
 */
-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);


/******* Canonical representation *******/
typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
-XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
-XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);

+#endif /* !XXH_NO_XXH3 */
#endif /* XXH_NO_LONG_LONG */

/*!
@@ -895,13 +1160,10 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t*
struct XXH32_state_s {
 XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
 XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
- XXH32_hash_t v1; /*!< First accumulator lane */
- XXH32_hash_t v2; /*!< Second accumulator lane */
- XXH32_hash_t v3; /*!< Third accumulator lane */
- XXH32_hash_t v4; /*!< Fourth accumulator lane */
+ XXH32_hash_t v[4]; /*!< Accumulator lanes */
 XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
 XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
- XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may be removed. */
+ XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it. */
}; /* typedef'd to XXH32_state_t */

@@ -921,19 +1183,21 @@ struct XXH32_state_s {
 */
struct XXH64_state_s {
 XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
- XXH64_hash_t v1; /*!< First accumulator lane */
- XXH64_hash_t v2; /*!< Second accumulator lane */
- XXH64_hash_t v3; /*!< Third accumulator lane */
- XXH64_hash_t v4; /*!< Fourth accumulator lane */
+ XXH64_hash_t v[4]; /*!< Accumulator lanes */
 XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
 XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
 XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
- XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it may be removed. */
+ XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */
}; /* typedef'd to XXH64_state_t */

-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
# include <stdalign.h>
# define XXH_ALIGN(n) alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+# define XXH_ALIGN(n) alignas(n)
#elif defined(__GNUC__)
# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
@@ -944,6 +1208,7 @@ struct XXH64_state_s {

/* Old GCC versions only accept the attribute after the type in structures. */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
 && defined(__GNUC__)
# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif
@@ -973,16 +1238,18 @@ struct XXH64_state_s {
 * @brief Structure for XXH3 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
- * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
- * an opaque type. This allows fields to safely be changed.
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with a dynamic library.
+ * This allows fields to safely be changed in the future.
 *
- * @note **This structure has a strict alignment requirement of 64 bytes.** Do
- * not allocate this with `malloc()` or `new`, it will not be sufficiently
- * aligned. Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack
- * allocation.
+ * @note **This structure has a strict alignment requirement of 64 bytes!**
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
 *
 * Typedef'd to @ref XXH3_state_t.
- * Do not access the members of this struct directly.
+ * Never access the members of this struct directly.
 *
 * @see XXH3_INITSTATE() for stack initialization.
 * @see XXH3_createState(), XXH3_freeState().
@@ -990,14 +1257,14 @@ struct XXH64_state_s {
 */
struct XXH3_state_s {
 XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
- /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
+ /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
 XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
 /*!< Used to store a custom secret generated from a seed. */
 XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
 /*!< The internal buffer.
@see XXH32_state_s::mem32 */ XXH32_hash_t bufferedSize; /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ - XXH32_hash_t reserved32; + XXH32_hash_t useSeed; /*!< Reserved field. Needed for padding on 64-bit. */ size_t nbStripesSoFar; /*!< Number or stripes processed. */ @@ -1033,45 +1300,156 @@ struct XXH3_state_s { #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; } +/*! + * simple alias to pre-selected XXH3_128bits variant + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + /* === Experimental API === */ /* Symbols defined below must be considered tied to a specific library version. */ -/* +/*! * XXH3_generateSecret(): * * Derive a high-entropy secret from any user-defined content, named customSeed. * The generated secret can be used in combination with `*_withSecret()` functions. - * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed, - * as it becomes much more difficult for an external actor to guess how to impact the calculation logic. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. * * The function accepts as input a custom seed of any length and any content, - * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE - * into an already allocated buffer secretBuffer. - * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long. + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. * * The generated secret can then be used with any `*_withSecret()` variant. - * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`, - * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()` + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() * are part of this list. They all accept a `secret` parameter - * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN) + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) * _and_ feature very high entropy (consist of random-looking bytes). - * These conditions can be a high bar to meet, so - * this function can be used to generate a secret of proper quality. + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes. The resulting `secret` will nonetheless provide all required qualities. * - * customSeed can be anything. It can have any size, even small ones, - * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes. - * The resulting `secret` will nonetheless provide all expected qualities. + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. * - * Supplying NULL as the customSeed copies the default secret into `secretBuffer`. - * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior. 
+ *
+ * Example code:
+ * @code{.c}
+ * #include <stdio.h>
+ * #include <string.h>
+ * #include <stdlib.h>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Hashes argv[2] using the entropy from argv[1].
+ * int main(int argc, char* argv[])
+ * {
+ *     char secret[XXH3_SECRET_SIZE_MIN];
+ *     if (argc != 3) { return 1; }
+ *     XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *     XXH64_hash_t h = XXH3_64bits_withSecret(
+ *          argv[2], strlen(argv[2]),
+ *          secret, sizeof(secret)
+ *     );
+ *     printf("%016llx\n", (unsigned long long) h);
+ * }
+ * @endcode
 */
-XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);

+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ * #include <string>
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ * #include "xxhash.h"
+ * // Slow, seeds each time
+ * class HashSlow {
+ *     XXH64_hash_t seed;
+ * public:
+ *     HashSlow(XXH64_hash_t s) : seed{s} {}
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *     }
+ * };
+ * // Fast, caches the seeded secret for future uses.
+ * class HashFast {
+ *     unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ * public:
+ *     HashFast(XXH64_hash_t s) {
+ *         XXH3_generateSecret_fromSeed(secret, s);
+ *     }
+ *     size_t operator()(const std::string& x) const {
+ *         return size_t{
+ *             XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *         };
+ *     }
+ * };
+ * @endcode
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
+ * @param seed The 64-bit seed to derive the secret from.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);

-/* simple short-cut to pre-selected XXH3_128bits variant */
-XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
-
+/*!
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but the cost can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as the `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact on the seed.
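+ *
+ * For illustration, a sketch of that second scenario (names are placeholders):
+ * @code{.c}
+ * // hash the secret itself to derive the seed, then supply both
+ * XXH64_hash_t const seed = XXH3_64bits(secret, secretSize);
+ * XXH64_hash_t const h = XXH3_64bits_withSecretandSeed(data, size,
+ *                                                      secret, secretSize,
+ *                                                      seed);
+ * @endcode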
+ *
+ * That bit-swap property is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
 */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+#endif /* !XXH_NO_XXH3 */
#endif /* XXH_NO_LONG_LONG */
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
# define XXH_IMPLEMENTATION
@@ -1125,7 +1503,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
/*!
 * @brief Define this to disable 64-bit code.
 *
- * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
 */
# define XXH_NO_LONG_LONG
# undef XXH_NO_LONG_LONG /* don't actually */
@@ -1148,7 +1526,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
 * Use `memcpy()`. Safe and portable. Note that most modern compilers will
 * eliminate the function call and treat it as an unaligned access.
 *
- * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
+ * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
 * @par
 * Depends on compiler extensions and is therefore not portable.
 * This method is safe _if_ your compiler supports it,
@@ -1175,22 +1553,40 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
 * care, as what works on one compiler/platform/optimization level may cause
 * another to read garbage data or even crash.
 *
- * See https://stackoverflow.com/a/32095106/646947 for details.
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
 *
 * Prefer these methods in priority order (0 > 3 > 1 > 2)
 */
# define XXH_FORCE_MEMORY_ACCESS 0
+
/*!
- * @def XXH_ACCEPT_NULL_INPUT_POINTER
- * @brief Whether to add explicit `NULL` checks.
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
 *
- * If the input pointer is `NULL` and the length is non-zero, xxHash's default
- * behavior is to dereference it, triggering a segfault.
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
 *
- * When this macro is enabled, xxHash actively checks the input for a null pointer.
- * If it is, the result for null input pointers is the same as a zero-length input.
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *   comes first.
+ * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *   conservative and disables hacks that increase code size. It implies the
+ *   options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *   and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *   Performance may cry. For example, the single shot functions just use the
+ *   streaming API.
 */
-# define XXH_ACCEPT_NULL_INPUT_POINTER 0
+# define XXH_SIZE_OPT 0
+
/*!
 * @def XXH_FORCE_ALIGN_CHECK
 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
@@ -1212,9 +1608,11 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
- * Align check is automatically disabled on x86, x64 & arm64,
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
 * which are platforms known to offer good unaligned memory accesses performance.
 *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
 * This option does not affect XXH3 (only XXH32 and XXH64).
 */
# define XXH_FORCE_ALIGN_CHECK 0
@@ -1236,24 +1634,22 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
 * compiler full control on whether to inline or not.
 *
- * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
- * -fno-inline with GCC or Clang, this will automatically be defined.
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
 */
# define XXH_NO_INLINE_HINTS 0

/*!
- * @def XXH_REROLL
- * @brief Whether to reroll `XXH32_finalize` and `XXH64_finalize`.
- *
- * For performance, `XXH32_finalize` and `XXH64_finalize` use an unrolled loop
- * in the form of a switch statement.
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
 *
- * This is not always desirable, as it generates larger code, and depending on
- * the architecture, may even be slower
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on the exact architecture, a jump may be preferable.
 *
- * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
+ * This setting can only make a difference for very small inputs.
 */
-# define XXH_REROLL 0
+# define XXH32_ENDJMP 0

/*!
 * @internal
@@ -1264,27 +1660,46 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
 */
# define XXH_OLD_NAMES
# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant-folded copies of itself.
+ */
+# define XXH_NO_STREAM
+# undef XXH_NO_STREAM /* don't actually */
#endif /* XXH_DOXYGEN */
/*!
 * @}
 */

#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
 /* prefer __packed__ structures (method 1) for GCC
 * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy,
 * which for some reason does unaligned loads. */
# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
# define XXH_FORCE_MEMORY_ACCESS 1
# endif
#endif

-#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
-# define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#ifndef XXH_SIZE_OPT
+ /* default to 1 for -Os or -Oz */
+# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+# define XXH_SIZE_OPT 1
+# else
+# define XXH_SIZE_OPT 0
+# endif
#endif

#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
-# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
- || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
+ /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+# if XXH_SIZE_OPT >= 1 || \
+ defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) \
+ || defined(__loongarch64) /* visual */
# define XXH_FORCE_ALIGN_CHECK 0
# else
# define XXH_FORCE_ALIGN_CHECK 1
# endif
@@ -1292,20 +1707,16 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
#endif

#ifndef XXH_NO_INLINE_HINTS
-# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
- || defined(__NO_INLINE__) /* -O0, -fno-inline */
+# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */
# define XXH_NO_INLINE_HINTS 1
# else
# define XXH_NO_INLINE_HINTS 0
# endif
#endif

-#ifndef XXH_REROLL
-# if defined(__OPTIMIZE_SIZE__)
-# define XXH_REROLL 1
-# else
-# define XXH_REROLL 0
-# endif
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+# define XXH32_ENDJMP 0
#endif

/*!
@@ -1317,6 +1728,24 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s

/* *************************************
* Includes & Memory related functions
***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded, or otherwise limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#else
+
/*
 * Modify the local functions below should you wish to use
 * different memory routines for malloc() and free()
 */
@@ -1327,7 +1756,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
 * @internal
 * @brief Modify this function to use a different routine than malloc().
 */
-static void* XXH_malloc(size_t s) { return malloc(s); }
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }

/*!
 * @internal
@@ -1335,6 +1764,8 @@ static void* XXH_malloc(size_t s) { return malloc(s); }
 */
static void XXH_free(void* p) { free(p); }

+#endif /* XXH_NO_STDLIB */
+
#include <string.h>

/*!
@@ -1357,19 +1788,19 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
#endif

#if XXH_NO_INLINE_HINTS /* disable inlining hints */
-# if defined(__GNUC__)
+# if defined(__GNUC__) || defined(__clang__)
# define XXH_FORCE_INLINE static __attribute__((unused))
# else
# define XXH_FORCE_INLINE static
# endif
# define XXH_NO_INLINE static
/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+# define XXH_NO_INLINE static __attribute__((noinline))
#elif defined(_MSC_VER) /* Visual Studio */
# define XXH_FORCE_INLINE static __forceinline
# define XXH_NO_INLINE static __declspec(noinline)
-#elif defined(__GNUC__)
-# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
-# define XXH_NO_INLINE static __attribute__((noinline))
#elif defined (__cplusplus) \
 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
# define XXH_FORCE_INLINE static inline
@@ -1404,11 +1835,20 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
# include <assert.h> /* note: can still be disabled with NDEBUG */
# define XXH_ASSERT(c) assert(c)
#else
-# define XXH_ASSERT(c) ((void)0)
+# define XXH_ASSERT(c) XXH_ASSUME(c)
#endif

/* note: use after variable declarations */
-#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+#ifndef XXH_STATIC_ASSERT
+# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
+# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
+# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+# else
+# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+# endif
+# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif

/*!
 * @internal
 * @def XXH_rotl32(x,r)
@@ -1426,12 +1866,18 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
 * We also use it to prevent unwanted constant folding for AArch64 in
 * XXH3_initCustomSecret_scalar().
 */
-#ifdef __GNUC__
+#if defined(__GNUC__) || defined(__clang__)
# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
#else
# define XXH_COMPILER_GUARD(var) ((void)0)
#endif

+#if defined(__GNUC__) || defined(__clang__)
+# define XXH_COMPILER_GUARD_W(var) __asm__ __volatile__("" : "+w" (var))
+#else
+# define XXH_COMPILER_GUARD_W(var) ((void)0)
+#endif
+
/* *************************************
* Basic Types
***************************************/
@@ -1519,30 +1965,31 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr;
#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))

/*
- * __pack instructions are safer but compiler specific, hence potentially
- * problematic for some compilers.
- *
- * Currently only defined for GCC and ICC.
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
 */
#ifdef XXH_OLD_NAMES
typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
#endif
static xxh_u32 XXH_read32(const void* ptr)
{
- typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
- return ((const xxh_unalign*)ptr)->u32;
+ typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
+ return *((const xxh_unalign32*)ptr);
}

#else

/*
 * Portable and safe solution. Generally efficient.
- * see: https://stackoverflow.com/a/32095106/646947
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
static xxh_u32 XXH_read32(const void* memPtr)
{
 xxh_u32 val;
- memcpy(&val, memPtr, sizeof(val));
+ XXH_memcpy(&val, memPtr, sizeof(val));
 return val;
}

@@ -1550,6 +1997,7 @@ static xxh_u32 XXH_read32(const void* memPtr)

/* *** Endianness *** */
+
/*!
 * @ingroup tuning
 * @def XXH_CPU_LITTLE_ENDIAN
 * @brief Whether the target is little endian.
 *
 * Defined to 1 if the target is little endian, or 0 if it is big endian.
 * It can be defined externally, for example on the compiler command line.
 *
- * If it is not defined, a runtime check (which is usually constant folded)
- * is used instead.
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
 *
 * @note
 * This is not necessarily defined to an integer constant.
@@ -1612,6 +2060,29 @@ static int XXH_isLittleEndian(void)
# define XXH_HAS_BUILTIN(x) 0
#endif

+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L)
+/* C23 and future versions have standard "unreachable()" */
+# include <stddef.h>
+# define XXH_UNREACHABLE() unreachable()
+
+#elif defined(__cplusplus) && (__cplusplus > 202002L)
+/* C++23 and future versions have std::unreachable() */
+# include <utility> /* std::unreachable() */
+# define XXH_UNREACHABLE() std::unreachable()
+
+#elif XXH_HAS_BUILTIN(__builtin_unreachable)
+# define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+# define XXH_UNREACHABLE() __assume(0)
+
+#else
+# define XXH_UNREACHABLE()
+#endif
+
+#define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+
/*!
 * @internal
 * @def XXH_rotl32(x,r)
@@ -1734,8 +2205,10 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
*********************************************************************/
/*!
 * @}
- * @defgroup xxh32_impl XXH32 implementation
+ * @defgroup XXH32_impl XXH32 implementation
 * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
 * @{
 */
/* #define instead of static const, to be used as initializers */
@@ -1815,17 +2288,17 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
 * The final mix ensures that all input bits have a chance to impact any bit in
 * the output digest, resulting in an unbiased distribution.
 *
- * @param h32 The hash to avalanche.
+ * @param hash The hash to avalanche.
 * @return The avalanched hash.
 */
-static xxh_u32 XXH32_avalanche(xxh_u32 h32)
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
{
- h32 ^= h32 >> 15;
- h32 *= XXH_PRIME32_2;
- h32 ^= h32 >> 13;
- h32 *= XXH_PRIME32_3;
- h32 ^= h32 >> 16;
- return(h32);
+ hash ^= hash >> 15;
+ hash *= XXH_PRIME32_2;
+ hash ^= hash >> 13;
+ hash *= XXH_PRIME32_3;
+ hash ^= hash >> 16;
+ return hash;
}

#define XXH_get32bits(p) XXH_readLE32_align(p, align)
@@ -1838,28 +2311,31 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32)
 * This final stage will digest them to ensure that all input bytes are present
 * in the final mix.
 *
- * @param h32 The hash to finalize.
+ * @param hash The hash to finalize.
 * @param ptr The pointer to the remaining input.
* @param len The remaining length, modulo 16. * @param align Whether @p ptr is aligned. * @return The finalized hash. + * @see XXH64_finalize(). */ -static xxh_u32 -XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) { -#define XXH_PROCESS1 do { \ - h32 += (*ptr++) * XXH_PRIME32_5; \ - h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ } while (0) -#define XXH_PROCESS4 do { \ - h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ - ptr += 4; \ - h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ } while (0) - /* Compact rerolled version */ - if (XXH_REROLL) { + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { len &= 15; while (len >= 4) { XXH_PROCESS4; @@ -1869,49 +2345,49 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) XXH_PROCESS1; --len; } - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); } else { switch(len&15) /* or switch(bEnd - p) */ { case 12: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 8: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 4: XXH_PROCESS4; - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); case 13: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 9: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 5: XXH_PROCESS4; XXH_PROCESS1; - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); case 14: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 10: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 6: XXH_PROCESS4; XXH_PROCESS1; XXH_PROCESS1; - return XXH32_avalanche(h32); + return XXH32_avalanche(hash); case 15: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 11: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 7: XXH_PROCESS4; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 3: XXH_PROCESS1; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 2: XXH_PROCESS1; - FALLTHROUGH_INTENDED; + XXH_FALLTHROUGH; /* fallthrough */ case 1: XXH_PROCESS1; - FALLTHROUGH_INTENDED; - case 0: return XXH32_avalanche(h32); + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); } XXH_ASSERT(0); - return h32; /* reaching this point is deemed impossible */ + return hash; /* reaching this point is deemed impossible */ } } @@ -1927,24 +2403,19 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) * @internal * @brief The implementation for @ref XXH32(). * - * @param input, len, seed Directly passed from @ref XXH32(). + * @param input , len , seed Directly passed from @ref XXH32(). * @param align Whether @p input is aligned. * @return The calculated hash. */ -XXH_FORCE_INLINE xxh_u32 +XXH_FORCE_INLINE XXH_PUREF xxh_u32 XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) { - const xxh_u8* bEnd = input ? 
input + len : NULL; xxh_u32 h32; -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - if (input==NULL) { - len=0; - bEnd=input=(const xxh_u8*)(size_t)16; - } -#endif + if (input==NULL) XXH_ASSERT(len == 0); if (len>=16) { + const xxh_u8* const bEnd = input + len; const xxh_u8* const limit = bEnd - 15; xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; xxh_u32 v2 = seed + XXH_PRIME32_2; @@ -1969,10 +2440,10 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment return XXH32_finalize(h32, input, len&15, align); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) { -#if 0 +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH32_state_t state; XXH32_reset(&state, seed); @@ -1991,51 +2462,46 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t s /******* Hash streaming *******/ -/*! - * @ingroup xxh32_family - */ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) { return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) { XXH_free(statePtr); return XXH_OK; } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) { - memcpy(dstState, srcState, sizeof(*dstState)); + XXH_memcpy(dstState, srcState, sizeof(*dstState)); } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) { - XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)); - state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; - state.v2 = seed + XXH_PRIME32_2; - state.v3 = seed + 0; - state.v4 = seed - XXH_PRIME32_1; - /* do not write into reserved, planned to be removed in a future version */ - memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; return XXH_OK; } -/*! @ingroup xxh32_family */ +/*! 
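The `XXH_SIZE_OPT >= 2` branch above spells out one-shot XXH32 in terms of the streaming API, which is also the easiest way to see their equivalence. Usage sketch (assumes the xxhash.h from this patch is on the include path):

    #include <assert.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        const char* msg = "hello world";
        XXH32_hash_t oneshot = XXH32(msg, strlen(msg), 0 /* seed */);

        /* The streaming path used when XXH_SIZE_OPT >= 2. */
        XXH32_state_t* st = XXH32_createState();
        XXH32_reset(st, 0);
        XXH32_update(st, msg, strlen(msg));
        assert(XXH32_digest(st) == oneshot);
        XXH32_freeState(st);
        return 0;
    }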
@ingroup XXH32_family */ XXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state, const void* input, size_t len) { - if (input==NULL) -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + XXH_ASSERT(len == 0); return XXH_OK; -#else - return XXH_ERROR; -#endif + } { const xxh_u8* p = (const xxh_u8*)input; const xxh_u8* const bEnd = p + len; @@ -2052,35 +2518,25 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len) if (state->memsize) { /* some data left from previous update */ XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); { const xxh_u32* p32 = state->mem32; - state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; - state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; - state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; - state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); } p += 16-state->memsize; state->memsize = 0; } - /* uintptr_t casts avoid UB or compiler warning on out-of-bounds - * pointer arithmetic */ - if ((uintptr_t)p <= (uintptr_t)bEnd - 16) { - const uintptr_t limit = (uintptr_t)bEnd - 16; - xxh_u32 v1 = state->v1; - xxh_u32 v2 = state->v2; - xxh_u32 v3 = state->v3; - xxh_u32 v4 = state->v4; + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; do { - v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; - v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; - v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; - v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; - } while ((uintptr_t)p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + } if (p < bEnd) { @@ -2093,30 +2549,30 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len) } -/*! @ingroup xxh32_family */ +/*! @ingroup XXH32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) { xxh_u32 h32; if (state->large_len) { - h32 = XXH_rotl32(state->v1, 1) - + XXH_rotl32(state->v2, 7) - + XXH_rotl32(state->v3, 12) - + XXH_rotl32(state->v4, 18); + h32 = XXH_rotl32(state->v[0], 1) + + XXH_rotl32(state->v[1], 7) + + XXH_rotl32(state->v[2], 12) + + XXH_rotl32(state->v[3], 18); } else { - h32 = state->v3 /* == seed */ + XXH_PRIME32_5; + h32 = state->v[2] /* == seed */ + XXH_PRIME32_5; } h32 += state->total_len_32; return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); } - +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ /*! - * @ingroup xxh32_family + * @ingroup XXH32_family * The default return values from XXH functions are unsigned 32 and 64 bit * integers. * @@ -2133,9 +2589,9 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t { XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); - memcpy(dst, &hash, sizeof(*dst)); + XXH_memcpy(dst, &hash, sizeof(*dst)); } -/*! @ingroup xxh32_family */ +/*! 
@ingroup XXH32_family */ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) { return XXH_readBE32(src); @@ -2176,30 +2632,31 @@ static xxh_u64 XXH_read64(const void* memPtr) #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) /* - * __pack instructions are safer, but compiler specific, hence potentially - * problematic for some compilers. - * - * Currently only defined for GCC and ICC. + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. */ #ifdef XXH_OLD_NAMES typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; #endif static xxh_u64 XXH_read64(const void* ptr) { - typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; - return ((const xxh_unalign64*)ptr)->u64; + typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); } #else /* * Portable and safe solution. Generally efficient. - * see: https://stackoverflow.com/a/32095106/646947 + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html */ static xxh_u64 XXH_read64(const void* memPtr) { xxh_u64 val; - memcpy(&val, memPtr, sizeof(val)); + XXH_memcpy(&val, memPtr, sizeof(val)); return val; } @@ -2278,8 +2735,10 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align) /******* xxh64 *******/ /*! * @} - * @defgroup xxh64_impl XXH64 implementation + * @defgroup XXH64_impl XXH64 implementation * @ingroup impl + * + * Details on the XXH64 implementation. * @{ */ /* #define rather that static const, to be used as initializers */ @@ -2297,6 +2756,7 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align) # define PRIME64_5 XXH_PRIME64_5 #endif +/*! @copydoc XXH32_round */ static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) { acc += input * XXH_PRIME64_2; @@ -2313,42 +2773,59 @@ static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) return acc; } -static xxh_u64 XXH64_avalanche(xxh_u64 h64) +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) { - h64 ^= h64 >> 33; - h64 *= XXH_PRIME64_2; - h64 ^= h64 >> 29; - h64 *= XXH_PRIME64_3; - h64 ^= h64 >> 32; - return h64; + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; } #define XXH_get64bits(p) XXH_readLE64_align(p, align) -static xxh_u64 -XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) -{ - len &= 31; - while (len >= 8) { - xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). 
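The canonical-representation helpers near the start of the span above exist so a hash can be stored as big-endian bytes regardless of host endianness. A small usage sketch (assumes xxhash.h):

    #include <assert.h>
    #include "xxhash.h"

    int main(void)
    {
        XXH32_hash_t h = XXH32("abc", 3, 0);

        /* Serialize in the endian-independent canonical (big-endian) form,
         * e.g. for an on-disk format, then round-trip it back. */
        XXH32_canonical_t canon;
        XXH32_canonicalFromHash(&canon, h);
        assert(XXH32_hashFromCanonical(&canon) == h);
        return 0;
    }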
+ */ +static XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); ptr += 8; - h64 ^= k1; - h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; len -= 8; } if (len >= 4) { - h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; ptr += 4; - h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; len -= 4; } while (len > 0) { - h64 ^= (*ptr++) * XXH_PRIME64_5; - h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; --len; } - return XXH64_avalanche(h64); + return XXH64_avalanche(hash); } #ifdef XXH_OLD_NAMES @@ -2361,21 +2838,23 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) # undef XXH_PROCESS8_64 #endif -XXH_FORCE_INLINE xxh_u64 +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) { - const xxh_u8* bEnd = input ? input + len : NULL; xxh_u64 h64; - -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) - if (input==NULL) { - len=0; - bEnd=input=(const xxh_u8*)(size_t)32; - } -#endif + if (input==NULL) XXH_ASSERT(len == 0); if (len>=32) { - const xxh_u8* const limit = bEnd - 32; + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; xxh_u64 v2 = seed + XXH_PRIME64_2; xxh_u64 v3 = seed + 0; @@ -2386,7 +2865,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; - } while (input<=limit); + } while (input= 2 /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ XXH64_state_t state; XXH64_reset(&state, seed); @@ -2425,49 +2904,45 @@ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t s } /******* Hash Streaming *******/ - -/*! @ingroup xxh64_family*/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) { return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); } -/*! @ingroup xxh64_family */ +/*! @ingroup XXH64_family */ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) { XXH_free(statePtr); return XXH_OK; } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) { - memcpy(dstState, srcState, sizeof(*dstState)); + XXH_memcpy(dstState, srcState, sizeof(*dstState)); } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +/*! 
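One subtle change above: the stripe loop now computes `limit = bEnd - 31` and iterates while `input < limit`, where the old code used `bEnd - 32` with `<=`; both run the 32-byte round exactly while at least 32 bytes remain. A tiny self-check of the new bound, with a hypothetical helper:

    #include <assert.h>
    #include <stddef.h>

    /* Stripe rounds executed for a given length; call only with len >= 32. */
    static size_t stripe_rounds(size_t len)
    {
        size_t input = 0, limit = len - 31;  /* new form of the bound */
        size_t rounds = 0;
        while (input < limit) { input += 32; rounds++; }
        return rounds;
    }

    int main(void)
    {
        assert(stripe_rounds(32) == 1);
        assert(stripe_rounds(63) == 1);  /* 31 leftover bytes go to XXH64_finalize */
        assert(stripe_rounds(64) == 2);
        return 0;
    }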
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) { - XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ - memset(&state, 0, sizeof(state)); - state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; - state.v2 = seed + XXH_PRIME64_2; - state.v3 = seed + 0; - state.v4 = seed - XXH_PRIME64_1; - /* do not write into reserved64, might be removed in a future version */ - memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr->v[1] = seed + XXH_PRIME64_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME64_1; return XXH_OK; } -/*! @ingroup xxh64_family */ +/*! @ingroup XXH64_family */ XXH_PUBLIC_API XXH_errorcode -XXH64_update (XXH64_state_t* state, const void* input, size_t len) +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) { - if (input==NULL) -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + XXH_ASSERT(len == 0); return XXH_OK; -#else - return XXH_ERROR; -#endif + } { const xxh_u8* p = (const xxh_u8*)input; const xxh_u8* const bEnd = p + len; @@ -2482,34 +2957,24 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len) if (state->memsize) { /* tmp buffer is full */ XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); - state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); - state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); - state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); - state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); p += 32 - state->memsize; state->memsize = 0; } - /* uintptr_t casts avoid UB or compiler warning on out-of-bounds - * pointer arithmetic */ - if ((uintptr_t)p + 32 <= (uintptr_t)bEnd) { - const uintptr_t limit = (uintptr_t)bEnd - 32; - xxh_u64 v1 = state->v1; - xxh_u64 v2 = state->v2; - xxh_u64 v3 = state->v3; - xxh_u64 v4 = state->v4; + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; do { - v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; - v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; - v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; - v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; - } while ((uintptr_t)p<=limit); - - state->v1 = v1; - state->v2 = v2; - state->v3 = v3; - state->v4 = v4; + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + } if (p < bEnd) { @@ -2522,44 +2987,39 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len) } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) { xxh_u64 h64; if (state->total_len >= 32) { - xxh_u64 const v1 = state->v1; - xxh_u64 const v2 = state->v2; - xxh_u64 const v3 = state->v3; - xxh_u64 const v4 = state->v4; - - h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); - h64 = XXH64_mergeRound(h64, v1); - h64 = XXH64_mergeRound(h64, v2); - h64 = XXH64_mergeRound(h64, v3); - h64 = XXH64_mergeRound(h64, v4); + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); } else { - h64 = state->v3 /*seed*/ + XXH_PRIME64_5; + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; } h64 += (xxh_u64) state->total_len; return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); } - +#endif /* !XXH_NO_STREAM */ /******* Canonical representation *******/ -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); - memcpy(dst, &hash, sizeof(*dst)); + XXH_memcpy(dst, &hash, sizeof(*dst)); } -/*! @ingroup xxh64_family */ -XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) { return XXH_readBE64(src); } @@ -2572,7 +3032,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src ************************************************************************ */ /*! * @} - * @defgroup xxh3_impl XXH3 implementation + * @defgroup XXH3_impl XXH3 implementation * @ingroup impl * @{ */ @@ -2598,17 +3058,23 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src # define XXH_unlikely(x) (x) #endif -#if defined(__GNUC__) -# if defined(__AVX2__) -# include <immintrin.h> -# elif defined(__SSE2__) -# include <emmintrin.h> -# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include <arm_sve.h> +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || defined(__aarch64__) || defined(_M_ARM) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) # define inline __inline__ /* circumvent a clang bug */ # include <arm_neon.h> # undef inline +# elif defined(__AVX2__) +# include <immintrin.h> +# elif defined(__SSE2__) +# include <emmintrin.h> # endif -#elif defined(_MSC_VER) +#endif + +#if defined(_MSC_VER) # include <intrin.h> #endif @@ -2722,12 +3188,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ { XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ }; /*! * @ingroup tuning * @brief Selects the minimum alignment for XXH3's accumulators. * - * When using SIMD, this should match the alignment reqired for said vector + * When using SIMD, this should match the alignment required for said vector * type, so, for example, 32 for AVX2. * * Default: Auto detected.
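With the reordered detection above, NEON and SVE now take priority over AVX/SSE when several are advertised. A quick way to check which backend a translation unit actually picked — a sketch assuming the single-header XXH_INLINE_ALL mode, which pulls in the implementation where XXH_VECTOR is defined:

    #include <stdio.h>
    #define XXH_INLINE_ALL  /* include the implementation so XXH_VECTOR is visible */
    #include "xxhash.h"

    int main(void)
    {
        /* 0=SCALAR 1=SSE2 2=AVX2 3=AVX512 4=NEON 5=VSX 6=SVE, per the enum above */
        printf("XXH_VECTOR = %d\n", (int)XXH_VECTOR);
        return 0;
    }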
@@ -2743,20 +3210,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_AVX512 3 # define XXH_NEON 4 # define XXH_VSX 5 +# define XXH_SVE 6 #endif #ifndef XXH_VECTOR /* can be defined on command line */ -# if defined(__AVX512F__) +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) # define XXH_VECTOR XXH_AVX512 # elif defined(__AVX2__) # define XXH_VECTOR XXH_AVX2 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) # define XXH_VECTOR XXH_SSE2 -# elif defined(__GNUC__) /* msvc support maybe later */ \ - && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ - && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) -# define XXH_VECTOR XXH_NEON # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ || (defined(__s390x__) && defined(__VEC__)) \ && defined(__GNUC__) /* TODO: IBM XL */ @@ -2766,6 +3239,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # endif #endif +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + /* * Controls the alignment of the accumulator, * for compatibility with aligned vector loads, which are usually faster. @@ -2785,12 +3269,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ { # define XXH_ACC_ALIGN 16 # elif XXH_VECTOR == XXH_AVX512 /* avx512 */ # define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 # endif #endif #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 # define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN #else # define XXH_SEC_ALIGN 8 #endif @@ -2818,7 +3306,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ { */ #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ - && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ # pragma GCC push_options # pragma GCC optimize("-O2") #endif @@ -2906,8 +3394,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ { * } */ # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ - && defined(__GNUC__) \ - && !defined(__aarch64__) && !defined(__arm64__) + && (defined(__GNUC__) || defined(__clang__)) \ + && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM)) # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ do { \ /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ @@ -2924,6 +3412,78 @@ enum XXH_VECTOR_TYPE /* fake enum */ { (outHi) = vshrn_n_u64 ((in), 32); \ } while (0) # endif + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). 
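Since XXH_VECTOR "can be defined on command line", and the new guard demotes an SVE request to scalar when the compiler lacks __ARM_FEATURE_SVE, forcing a particular backend is a one-liner. Hypothetical forced-scalar build, e.g. for debugging or a baseline benchmark:

    /* On the command line:
     *     cc -O2 -DXXH_VECTOR=0 bench.c
     * or, in header-only mode, before pulling in the implementation
     * (0 is XXH_SCALAR; the numeric values are listed in the block above): */
    #define XXH_VECTOR 0
    #define XXH_INLINE_ALL
    #include "xxhash.h"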
+ * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(uint64x2_t const*)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and + * 2 lanes on scalar by default (except on Apple platforms, as Apple CPUs benefit + * from only using NEON). + * + * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the + * emulated 64-bit arithmetic is too slow. + * + * Modern ARM CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't + * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions, + * you are only using 2/3 of the CPU bandwidth. + * + * This is even more noticeable on the more advanced cores like the A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the + * remaining lanes will use scalar instructions. This improves the bandwidth + * and also gives the integer pipelines something to do besides twiddling loop + * counters and pointers. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif #endif /* XXH_VECTOR == XXH_NEON */ /* @@ -2935,23 +3495,33 @@ enum XXH_VECTOR_TYPE /* fake enum */ { * inconsistent intrinsics, spotty coverage, and multiple endiannesses. */ #if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. 
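XXH3_NEON_LANES above is deliberately overridable: per the dispatch table, the 6:2 NEON/scalar split wins on large out-of-order cores, while Apple designs and ARMv7 prefer all 8 lanes in NEON. A hypothetical override for comparing the two on a given core:

    /* Run all 8 XXH3 lanes through NEON instead of the 6:2 hybrid.
     * Per the documentation above, the value must be 2, 4, 6 or 8,
     * and it must be defined before the implementation is included. */
    #define XXH3_NEON_LANES 8
    #define XXH_INLINE_ALL
    #include "xxhash.h"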
*/ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + # if defined(__s390x__) # include <s390intrin.h> # else -/* gcc's altivec.h can have the unwanted consequence to unconditionally - * #define bool, vector, and pixel keywords, - * with bad consequences for programs already using these keywords for other purposes. - * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined. - * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler, - * but it seems that, in some cases, it isn't. - * Force the build macro to be defined, so that keywords are not altered. - */ -# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) -# define __APPLE_ALTIVEC__ -# endif # include <altivec.h> # endif +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + typedef __vector unsigned long long xxh_u64x2; typedef __vector unsigned char xxh_u8x16; typedef __vector unsigned xxh_u32x4; @@ -2990,7 +3560,7 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) { xxh_u64x2 ret; - memcpy(&ret, ptr, sizeof(xxh_u64x2)); + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); # if XXH_VSX_BE ret = XXH_vec_revb(ret); # endif @@ -3007,8 +3577,9 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) /* s390x is always big endian, no issue on this platform */ # define XXH_vec_mulo vec_mulo # define XXH_vec_mule vec_mule -# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) /* Clang has a better way to control this, we can just use the builtin which doesn't swap.
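The push_macro/pop_macro fence above is a reusable pattern for any third-party header that leaks object-like macros. An isolated illustration — MY_BOOL stands in for whatever definition the including project already had:

    /* Save the caller's macro, let the noisy header define its own, restore. */
    #define bool MY_BOOL            /* pretend the includer already had this */

    #pragma push_macro("bool")
    #undef bool
    /* #include <altivec.h> would go here; it may #define bool itself */
    #pragma pop_macro("bool")

    /* From here on, 'bool' expands to MY_BOOL again. */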
*/ + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ # define XXH_vec_mulo __builtin_altivec_vmulouw # define XXH_vec_mule __builtin_altivec_vmuleuw # else @@ -3029,13 +3600,29 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) # endif /* XXH_vec_mulo, XXH_vec_mule */ #endif /* XXH_VECTOR == XXH_VSX */ +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + /* prefetch * can be disabled, by declaring XXH_NO_PREFETCH build macro */ #if defined(XXH_NO_PREFETCH) # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ #else -# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) @@ -3100,7 +3687,6 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y) return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); } #elif defined(_MSC_VER) && defined(_M_IX86) -# include <intrin.h> # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) #else /* @@ -3119,7 +3705,7 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y) * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar * version. * - * @param lhs, rhs The 64-bit integers to be multiplied + * @param lhs , rhs The 64-bit integers to be multiplied * @return The 128-bit result represented in an @ref XXH128_hash_t. */ static XXH128_hash_t @@ -3140,7 +3726,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) * In that case it is best to use the portable one. * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 */ -#if defined(__GNUC__) && !defined(__wasm__) \ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ && defined(__SIZEOF_INT128__) \ || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) @@ -3157,7 +3743,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) * * This compiles to single operand MUL on x64. */ -#elif defined(_M_X64) || defined(_M_IA64) +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) #ifndef _MSC_VER # pragma intrinsic(_umul128) #endif @@ -3169,6 +3755,21 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) r128.high64 = product_high; return r128; + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + #else /* * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
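XXH_mult32to64, defined in the middle of that hunk, leans on the compiler recognizing a masked 64-bit multiply as a single widening 32x32->64 instruction (MUL on x64, UMULL on ARM). A standalone sketch with a quick check; the function name is invented:

    #include <assert.h>
    #include <stdint.h>

    /* The masks prove to the compiler that both operands fit in 32 bits. */
    static uint64_t mult32to64(uint64_t x, uint64_t y)
    {
        return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
    }

    int main(void)
    {
        /* (2^32 - 1)^2 = 0xFFFFFFFE00000001 */
        assert(mult32to64(0xFFFFFFFFULL, 0xFFFFFFFFULL) == 0xFFFFFFFE00000001ULL);
        return 0;
    }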
@@ -3237,7 +3838,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) * The reason for the separate function is to prevent passing too many structs * around by value. This will hopefully inline the multiply, but we don't force it. * - * @param lhs, rhs The 64-bit integers to multiply + * @param lhs , rhs The 64-bit integers to multiply * @return The low 64 bits of the product XOR'd by the high 64 bits. * @see XXH_mult64to128() */ @@ -3249,7 +3850,7 @@ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) } /*! Seems to produce slightly better code on GCC for some reason. */ -XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) { XXH_ASSERT(0 <= shift && shift < 64); return v64 ^ (v64 >> shift); @@ -3316,7 +3917,7 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) * * This adds an extra layer of strength for custom secrets. */ -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -3338,7 +3939,7 @@ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h } } -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -3354,7 +3955,7 @@ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h } } -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -3371,7 +3972,7 @@ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ } } -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(len <= 16); @@ -3441,7 +4042,7 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, } /* For mid range keys, XXH3 uses a Mum-hash variant. */ -XXH_FORCE_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -3449,29 +4050,39 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; XXH_ASSERT(16 < len && len <= 128); - { xxh_u64 acc = len * XXH_PRIME64_1; + { xxh_u64 acc = len * XXH_PRIME64_1, acc_end; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
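The "portable scalar method" that the previous hunk falls back on builds the 128-bit product from four 32x32->64 partial products, summed so that no individual addition can overflow. A sketch of that long multiplication, mirroring the header's fallback path (names are invented):

    #include <stdint.h>

    typedef struct { uint64_t low64, high64; } u128;

    static u128 mult64to128_portable(uint64_t lhs, uint64_t rhs)
    {
        uint64_t lo_lo = (lhs & 0xFFFFFFFF) * (rhs & 0xFFFFFFFF);
        uint64_t hi_lo = (lhs >> 32)        * (rhs & 0xFFFFFFFF);
        uint64_t lo_hi = (lhs & 0xFFFFFFFF) * (rhs >> 32);
        uint64_t hi_hi = (lhs >> 32)        * (rhs >> 32);

        /* The cross sum cannot overflow 64 bits: two terms are < 2^32 and
         * their total with lo_hi stays at or below 2^64 - 1. */
        uint64_t cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
        uint64_t upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
        uint64_t lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

        u128 r; r.low64 = lower; r.high64 = upper; return r;
    }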
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); + acc_end = 0; +#else + acc += XXH3_mix16B(input+0, secret+0, seed); + acc_end = XXH3_mix16B(input+len-16, secret+16, seed); if (len > 32) { + acc += XXH3_mix16B(input+16, secret+32, seed); + acc_end += XXH3_mix16B(input+len-32, secret+48, seed); if (len > 64) { + acc += XXH3_mix16B(input+32, secret+64, seed); + acc_end += XXH3_mix16B(input+len-48, secret+80, seed); + if (len > 96) { acc += XXH3_mix16B(input+48, secret+96, seed); - acc += XXH3_mix16B(input+len-64, secret+112, seed); + acc_end += XXH3_mix16B(input+len-64, secret+112, seed); } - acc += XXH3_mix16B(input+32, secret+64, seed); - acc += XXH3_mix16B(input+len-48, secret+80, seed); } - acc += XXH3_mix16B(input+16, secret+32, seed); - acc += XXH3_mix16B(input+len-32, secret+48, seed); } - acc += XXH3_mix16B(input+0, secret+0, seed); - acc += XXH3_mix16B(input+len-16, secret+16, seed); - - return XXH3_avalanche(acc); +#endif + return XXH3_avalanche(acc + acc_end); } } #define XXH3_MIDSIZE_MAX 240 -XXH_NO_INLINE XXH64_hash_t +XXH_NO_INLINE XXH_PUREF XXH64_hash_t XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -3483,13 +4094,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, #define XXH3_MIDSIZE_LASTOFFSET 17 { xxh_u64 acc = len * XXH_PRIME64_1; - int const nbRounds = (int)len / 16; - int i; + xxh_u64 acc_end; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); for (i=0; i<8; i++) { acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); } - acc = XXH3_avalanche(acc); + /* last bytes */ + acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); XXH_ASSERT(nbRounds >= 8); + acc = XXH3_avalanche(acc); #if defined(__clang__) /* Clang */ \ && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ @@ -3516,11 +4131,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, #pragma clang loop vectorize(disable) #endif for (i=8 ; i < nbRounds; i++) { - acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + /* + * Prevents clang for unrolling the acc loop and interleaving with this one. + */ + XXH_COMPILER_GUARD(acc); + acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); } - /* last bytes */ - acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); - return XXH3_avalanche(acc); + return XXH3_avalanche(acc + acc_end); } } @@ -3536,10 +4153,51 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, # define ACC_NB XXH_ACC_NB #endif +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * These macros are to generate an XXH3_accumulate() function. + * The two arguments select the name suffix and target attribute. + * + * The name of this symbol is XXH3_accumulate_() and it calls + * XXH3_accumulate_512_(). 
+ * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. + */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) { if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); - memcpy(dst, &v64, sizeof(v64)); + XXH_memcpy(dst, &v64, sizeof(v64)); } /* Several intrinsic functions below are supposed to accept __int64 as argument, @@ -3556,6 +4214,7 @@ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) typedef long long xxh_i64; #endif + /* * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. * @@ -3591,7 +4250,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { - XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + __m512i* const xacc = (__m512i *) acc; XXH_ASSERT((((size_t)acc) & 63) == 0); XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); @@ -3603,7 +4262,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, /* data_key = data_vec ^ key_vec; */ __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); /* xacc[0] += swap(data_vec); */ @@ -3613,6 +4272,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, *xacc = _mm512_add_epi64(product, sum); } } +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) /* * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. 
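To make the template above concrete: for the scalar backend, XXH3_ACCUMULATE_TEMPLATE(scalar) expands to roughly the following — one 64-byte stripe per iteration, the secret advancing XXH_SECRET_CONSUME_RATE bytes per stripe, and a prefetch issued XXH_PREFETCH_DIST bytes ahead (expansion shown for illustration; all identifiers are the header's own):

    void XXH3_accumulate_scalar(xxh_u64* XXH_RESTRICT acc,
                                const xxh_u8* XXH_RESTRICT input,
                                const xxh_u8* XXH_RESTRICT secret,
                                size_t nbStripes)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            const xxh_u8* const in = input + n * XXH_STRIPE_LEN;
            XXH_PREFETCH(in + XXH_PREFETCH_DIST);
            XXH3_accumulate_512_scalar(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
        }
    }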
@@ -3640,19 +4300,18 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 63) == 0); XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); - { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + { __m512i* const xacc = (__m512i*) acc; const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); /* xacc[0] ^= (xacc[0] >> 47) */ __m512i const acc_vec = *xacc; __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); - __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); /* xacc[0] ^= secret; */ __m512i const key_vec = _mm512_loadu_si512 (secret); - __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); /* xacc[0] *= XXH_PRIME32_1; */ - __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); @@ -3667,20 +4326,16 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) XXH_ASSERT(((size_t)customSecret & 63) == 0); (void)(&XXH_writeLE64); { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); - __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); - XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret; - XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret; + const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); for (i=0; i < nbRounds; ++i) { - /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*', - * this will warn "discards 'const' qualifier". */ - union { - XXH_ALIGN(64) const __m512i* cp; - XXH_ALIGN(64) void* p; - } remote_const_void; - remote_const_void.cp = src + i; - dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); } } } @@ -3699,7 +4354,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 31) == 0); - { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + { __m256i* const xacc = (__m256i *) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
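In the AVX-512 scramble above, `_mm512_ternarylogic_epi32(..., 0x96)` fuses the two XORs of the old code into a single vpternlog; 0x96 is simply the 8-entry truth table of a ^ b ^ c packed into an immediate. A tiny standalone derivation of that constant:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* Bit i of the immediate is f(a,b,c), where a, b, c are the bits of i. */
        uint8_t imm = 0;
        for (int i = 0; i < 8; i++) {
            int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
            imm |= (uint8_t)((a ^ b ^ c) << i);
        }
        assert(imm == 0x96);
        return 0;
    }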
*/ const __m256i* const xinput = (const __m256i *) input; @@ -3716,7 +4371,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, /* data_key = data_vec ^ key_vec; */ __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* data_key_lo = data_key >> 32; */ - __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); /* xacc[i] += swap(data_vec); */ @@ -3726,12 +4381,13 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, xacc[i] = _mm256_add_epi64(product, sum); } } } +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 31) == 0); - { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + { __m256i* const xacc = (__m256i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ const __m256i* const xsecret = (const __m256i *) secret; @@ -3748,7 +4404,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1; */ - __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); @@ -3765,8 +4421,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR XXH_PREFETCH(customSecret); { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); - XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret; - XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret; + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; # if defined(__GNUC__) || defined(__clang__) /* @@ -3776,14 +4432,16 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR */ XXH_COMPILER_GUARD(dest); # endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); /* GCC -O2 need unroll loop manually */ - dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); - dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); - dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); - dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); - dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); - dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); } } @@ -3803,7 +4461,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, { /* SSE2 is just a half-scale version of the AVX2 version. 
*/ XXH_ASSERT((((size_t)acc) & 15) == 0); - { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + { __m128i* const xacc = (__m128i *) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i* const xinput = (const __m128i *) input; @@ -3830,12 +4488,13 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, xacc[i] = _mm_add_epi64(product, sum); } } } +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 15) == 0); - { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + { __m128i* const xacc = (__m128i*) acc; /* Unaligned. This is mainly for pointer arithmetic, and because * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ const __m128i* const xsecret = (const __m128i *) secret; @@ -3867,7 +4526,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); # if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - // MSVC 32bit mode does not support _mm_set_epi64x before 2015 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); # else @@ -3875,19 +4534,21 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR # endif int i; - XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret; - XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret; + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; # if defined(__GNUC__) || defined(__clang__) /* * On GCC & Clang, marking 'dest' as modified will cause the compiler: * - do not extract the secret from sse registers in the internal loop * - use less common registers, and avoid pushing these reg into stack */ - XXH_COMPILER_GUARD(dest); + XXH_COMPILER_GUARD(dst16); # endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); for (i=0; i < nbRounds; ++i) { - dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed); + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); } } } @@ -3895,42 +4556,112 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR #if (XXH_VECTOR == XXH_NEON) +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. 
+ */ XXH_FORCE_INLINE void XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); { - XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + uint64x2_t* const xacc = (uint64x2_t *) acc; /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ uint8_t const* const xinput = (const uint8_t *) input; uint8_t const* const xsecret = (const uint8_t *) secret; size_t i; - for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + uint64x2_t acc_vec1 = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec1 = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec1 = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_21 = vextq_u64(data_vec1, data_vec1, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key1 = veorq_u64(data_vec1, key_vec1); + + uint64x2_t acc_vec2 = xacc[i+1]; /* data_vec = xinput[i]; */ - uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + uint64x2_t data_vec2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); /* key_vec = xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t key_vec2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_22 = vextq_u64(data_vec2, data_vec2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key2 = veorq_u64(data_vec2, key_vec2); + + /* data_key_lo = {(data_key1 & 0xFFFFFFFF), (data_key2 & 0xFFFFFFFF)}; + * data_key_hi = {(data_key1 >> 32), (data_key2 >> 32)}; + */ + uint32x4x2_t zipped = vuzpq_u32(vreinterpretq_u32_u64(data_key1), vreinterpretq_u32_u64(data_key2)); + uint32x4_t data_key_lo = zipped.val[0]; + uint32x4_t data_key_hi = zipped.val[1]; + + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_21 = vmlal_u32 (acc_vec_21, vget_low_u32(data_key_lo), vget_low_u32(data_key_hi)); + XXH_COMPILER_GUARD_W(acc_vec_21); + /* xacc[i] += acc_vec_2; */ + acc_vec1 = vaddq_u64 (acc_vec1, acc_vec_21); + xacc[i] = acc_vec1; + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_22 = vmlal_u32 (acc_vec_22, vget_high_u32(data_key_lo), vget_high_u32(data_key_hi)); + XXH_COMPILER_GUARD_W(acc_vec_22); + /* xacc[i] += acc_vec_2; */ + acc_vec2 = vaddq_u64 (acc_vec2, acc_vec_22); + xacc[i+1] = acc_vec2; + } + for (; i < XXH3_NEON_LANES / 2; i++) { + uint64x2_t acc_vec = xacc[i]; + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); uint64x2_t data_key; uint32x2_t data_key_lo, data_key_hi; - /* xacc[i] += swap(data_vec); */ - uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); - uint64x2_t const swapped = vextq_u64(data64, data64, 1); - xacc[i] = vaddq_u64 (xacc[i], swapped); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1); /* data_key = data_vec ^ key_vec; */ - data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + data_key = veorq_u64(data_vec, key_vec); /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); * data_key_hi = (uint32x2_t) 
(data_key >> 32); * data_key = UNDEFINED; */ XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); - /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ - xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); - + /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi); + XXH_COMPILER_GUARD_W(acc_vec_2); + /* xacc[i] += acc_vec_2; */ + acc_vec = vaddq_u64 (acc_vec, acc_vec_2); + xacc[i] = acc_vec; } + } } +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) XXH_FORCE_INLINE void XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) @@ -3942,15 +4673,19 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); size_t i; - for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { /* xacc[i] ^= (xacc[i] >> 47); */ uint64x2_t acc_vec = xacc[i]; - uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); - uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); /* xacc[i] ^= xsecret[i]; */ - uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); - uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64 (data_vec, key_vec); /* xacc[i] *= XXH_PRIME32_1 */ uint32x2_t data_key_lo, data_key_hi; @@ -3978,11 +4713,12 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) */ uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); /* xacc[i] = prod_hi << 32; */ - xacc[i] = vshlq_n_u64(prod_hi, 32); + prod_hi = vshlq_n_u64(prod_hi, 32); /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ - xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime); } - } } + } + } } #endif @@ -3994,7 +4730,8 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { - xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + /* presumed aligned */ + unsigned int* const xacc = (unsigned int*) acc; xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ xxh_u64x2 const v32 = { 32, 32 }; @@ -4009,16 +4746,21 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); - xacc[i] += product; + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i); + acc_vec += product; /* swap high and low halves */ #ifdef __s390x__ - xacc[i] += vec_permi(data_vec, data_vec, 2); + acc_vec += vec_permi(data_vec, data_vec, 2); #else - xacc[i] += vec_xxpermdi(data_vec, data_vec, 2); + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); #endif + /* xacc[i] = acc_vec; */ + vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i); } } +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) XXH_FORCE_INLINE void XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) @@ -4052,40 
+4794,202 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) #endif +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + /* scalar variants - universal */ +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ XXH_FORCE_INLINE void XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ - const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ - const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ size_t i; - XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif for (i=0; i < XXH_ACC_NB; i++) { - xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); - xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); - xacc[i ^ 1] += data_val; /* swap adjacent lanes */ - xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + XXH3_scalarRound(acc, input, secret, i); } } +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ XXH_FORCE_INLINE void -XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) { - XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ - size_t i; XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); - for (i=0; i < XXH_ACC_NB; i++) { - xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); - xxh_u64 acc64 = xacc[i]; + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; acc64 = XXH_xorshift64(acc64, 47); acc64 ^= key64; acc64 *= XXH_PRIME32_1; - xacc[i] = acc64; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); } } @@ -4107,8 +5011,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) * placed sequentially, in order, at the top of the unrolled loop. 
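The scalar round factored out above is small enough to show in isolation. Below is a minimal, self-contained C++ sketch of one 64-byte accumulation stripe as just described (lane swap plus 32x32->64 multiply); the helper names, the little-endian shortcut, and the demo main are illustrative, not part of xxhash.h:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Illustrative stand-in for XXH3_scalarRound / XXH3_accumulate_512_scalar.
static uint64_t read_le64(const uint8_t* p) {
    uint64_t v;
    std::memcpy(&v, p, sizeof v);  // assumes a little-endian host for brevity
    return v;
}

static void accumulate_stripe(uint64_t acc[8], const uint8_t input[64],
                              const uint8_t secret[64]) {
    for (size_t lane = 0; lane < 8; lane++) {
        uint64_t data_val = read_le64(input + lane * 8);
        uint64_t data_key = data_val ^ read_le64(secret + lane * 8);
        acc[lane ^ 1] += data_val;                                // swap adjacent lanes
        acc[lane] += (data_key & 0xFFFFFFFF) * (data_key >> 32);  // 32x32->64 multiply
    }
}

int main() {
    uint64_t acc[8] = {0};
    uint8_t input[64], secret[64];
    for (int i = 0; i < 64; i++) { input[i] = (uint8_t)i; secret[i] = (uint8_t)(255 - i); }
    accumulate_stripe(acc, input, secret);
    std::printf("acc[0] = %016llx\n", (unsigned long long)acc[0]);
    return 0;
}
```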
* * While MOVK is great for generating constants (2 cycles for a 64-bit - * constant compared to 4 cycles for LDR), long MOVK chains stall the - * integer pipelines: + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S * MOVK * MOVK * MOVK * MOVK * ADD * SUB STR * STR @@ -4125,6 +5030,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) * ADD LDR * SUB STR * STR + * + * See XXH3_NEON_LANES for details on the pipeline. + * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 * without hack: 2654.4 MB/s * with hack: 3202.9 MB/s @@ -4154,7 +5062,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) } -typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); @@ -4162,82 +5070,63 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); #if (XXH_VECTOR == XXH_AVX512) #define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 #elif (XXH_VECTOR == XXH_AVX2) #define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 #elif (XXH_VECTOR == XXH_SSE2) #define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 #elif (XXH_VECTOR == XXH_NEON) #define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon #define XXH3_scrambleAcc XXH3_scrambleAcc_neon #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar #elif (XXH_VECTOR == XXH_VSX) #define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + #else /* scalar */ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar #endif - - -#ifndef XXH_PREFETCH_DIST -# ifdef __clang__ -# define XXH_PREFETCH_DIST 320 -# else -# if (XXH_VECTOR == XXH_AVX512) -# define XXH_PREFETCH_DIST 512 -# else -# define XXH_PREFETCH_DIST 384 -# endif -# endif /* __clang__ */ -#endif /* XXH_PREFETCH_DIST */ -/* - * XXH3_accumulate() - * Loops over XXH3_accumulate_512().
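For context on what XXH3_initCustomSecret_scalar is computing while the comment above worries about instruction scheduling: each 16-byte segment of the base secret absorbs the seed, added into the low 8 bytes and subtracted from the high 8 bytes. This C++ sketch paraphrases that effect under a little-endian assumption; kSecretSize and the function names are stand-ins, not the header's identifiers:

```cpp
#include <cstdint>
#include <cstring>

// Sketch of the effect of XXH3_initCustomSecret_scalar: derive a per-seed
// secret from a base secret, 16 bytes at a time (endianness ignored here;
// kSecretSize stands in for XXH_SECRET_DEFAULT_SIZE).
static const int kSecretSize = 192;

void init_custom_secret(uint8_t* customSecret, const uint8_t* baseSecret,
                        uint64_t seed) {
    for (int i = 0; i < kSecretSize / 16; i++) {
        uint64_t lo, hi;
        std::memcpy(&lo, baseSecret + 16 * i, 8);
        std::memcpy(&hi, baseSecret + 16 * i + 8, 8);
        lo += seed;  // low half of each segment: + seed
        hi -= seed;  // high half of each segment: - seed
        std::memcpy(customSecret + 16 * i, &lo, 8);
        std::memcpy(customSecret + 16 * i + 8, &hi, 8);
    }
}
```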
- * Assumption: nbStripes will not overflow the secret size - */ -XXH_FORCE_INLINE void -XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, - const xxh_u8* XXH_RESTRICT input, - const xxh_u8* XXH_RESTRICT secret, - size_t nbStripes, - XXH3_f_accumulate_512 f_acc512) -{ - size_t n; - for (n = 0; n < nbStripes; n++ ) { - const xxh_u8* const in = input + n*XXH_STRIPE_LEN; - XXH_PREFETCH(in + XXH_PREFETCH_DIST); - f_acc512(acc, - in, - secret + n*XXH_SECRET_CONSUME_RATE); - } -} +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif XXH_FORCE_INLINE void XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; @@ -4249,7 +5138,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); for (n = 0; n < nb_blocks; n++) { - XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); } @@ -4257,12 +5146,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, XXH_ASSERT(len > XXH_STRIPE_LEN); { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); - XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); /* last stripe */ { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ - f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); } } } @@ -4307,12 +5196,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, const void* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble); + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); @@ -4323,29 +5212,30 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, } /* - * It's important for performance that XXH3_hashLong is not inlined. + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. 
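The block geometry driving XXH3_hashLong_internal_loop can be made concrete with the default constants defined elsewhere in this header (64-byte stripes, 8-byte secret consume rate, 192-byte default secret); the sketch below simply evaluates the same two expressions:

```cpp
#include <cstdio>

int main() {
    const size_t kStripeLen = 64;    // XXH_STRIPE_LEN
    const size_t kConsumeRate = 8;   // XXH_SECRET_CONSUME_RATE
    const size_t kSecretSize = 192;  // XXH_SECRET_DEFAULT_SIZE

    // One block consumes the secret at 8 bytes per stripe until one stripe
    // length from the end, after which the accumulators are scrambled.
    size_t nbStripesPerBlock = (kSecretSize - kStripeLen) / kConsumeRate;
    size_t blockLen = kStripeLen * nbStripesPerBlock;
    std::printf("stripes/block = %zu, block = %zu bytes\n",
                nbStripesPerBlock, blockLen);  // 16 stripes, 1024 bytes
    return 0;
}
```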
*/ -XXH_NO_INLINE XXH64_hash_t +XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; - return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); } /* - * It's important for performance that XXH3_hashLong is not inlined. - * Since the function is not inlined, the compiler may not be able to understand that, - * in some scenarios, its `secret` argument is actually a compile time constant. - * This variant enforces that the compiler can detect that, - * and uses this opportunity to streamline the generated code for better performance. + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier on the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. */ -XXH_NO_INLINE XXH64_hash_t +XXH_NO_INLINE XXH_PUREF XXH64_hash_t XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; (void)secret; (void)secretLen; - return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); } /* @@ -4362,18 +5252,20 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, XXH_FORCE_INLINE XXH64_hash_t XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, XXH64_hash_t seed, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble, XXH3_f_initCustomSecret f_initSec) { +#if XXH_SIZE_OPT <= 0 if (seed == 0) return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc512, f_scramble); + f_acc, f_scramble); +#endif { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; f_initSec(secret, seed); return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), - f_acc512, f_scramble); + f_acc, f_scramble); } } @@ -4381,12 +5273,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, * It's important for performance that XXH3_hashLong is not inlined. */ XXH_NO_INLINE XXH64_hash_t -XXH3_hashLong_64b_withSeed(const void* input, size_t len, - XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) { (void)secret; (void)secretLen; return XXH3_hashLong_64b_withSeed_internal(input, len, seed, - XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); } @@ -4418,29 +5310,37 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, /* === Public entry point === */ -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +/*!
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) { - return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) { - return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH64_hash_t -XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) { - return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); } +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} -/* === XXH3 streaming === */ +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM /* * Malloc's a pointer that is always aligned to align. * @@ -4464,7 +5364,7 @@ XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) * * Align must be a power of 2 and 8 <= align <= 128. */ -static void* XXH_alignedMalloc(size_t s, size_t align) +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) { XXH_ASSERT(align <= 128 && align >= 8); /* range check */ XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ @@ -4506,7 +5406,7 @@ static void XXH_alignedFree(void* p) XXH_free(base); } } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) { XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); @@ -4515,24 +5415,24 @@ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) return state; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) { XXH_alignedFree(statePtr); return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! 
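Typical one-shot usage of these entry points, as a sketch. XXH3_64bits_withSecretandSeed sits in the experimental, static-linking-only part of the API, so the define below is an assumption about how the header is consumed:

```cpp
#define XXH_STATIC_LINKING_ONLY  // withSecretandSeed is in the static-only API
#include "xxhash.h"
#include <cstdio>
#include <cstring>

int main() {
    const char msg[] = "hello xxh3";
    size_t len = std::strlen(msg);

    XXH64_hash_t h1 = XXH3_64bits(msg, len);
    XXH64_hash_t h2 = XXH3_64bits_withSeed(msg, len, /*seed=*/42);

    // A custom secret must be at least XXH3_SECRET_SIZE_MIN (136) bytes.
    unsigned char secret[XXH3_SECRET_SIZE_MIN];
    std::memset(secret, 0x5c, sizeof secret);
    XXH64_hash_t h3 =
        XXH3_64bits_withSecretandSeed(msg, len, secret, sizeof secret, 42);

    std::printf("%016llx %016llx %016llx\n", (unsigned long long)h1,
                (unsigned long long)h2, (unsigned long long)h3);
    return 0;
}
```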
@ingroup XXH3_family */ XXH_PUBLIC_API void -XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) { - memcpy(dst_state, src_state, sizeof(*dst_state)); + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); } static void XXH3_reset_internal(XXH3_state_t* statePtr, - XXH64_hash_t seed, - const void* secret, size_t secretSize) + XXH64_hash_t seed, + const void* secret, size_t secretSize) { size_t const initStart = offsetof(XXH3_state_t, bufferedSize); size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; @@ -4549,24 +5449,25 @@ XXH3_reset_internal(XXH3_state_t* statePtr, statePtr->acc[6] = XXH_PRIME64_5; statePtr->acc[7] = XXH_PRIME32_1; statePtr->seed = seed; + statePtr->useSeed = (seed != 0); statePtr->extSecret = (const unsigned char*)secret; XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset(XXH3_state_t* statePtr) +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) { if (statePtr == NULL) return XXH_ERROR; XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) { if (statePtr == NULL) return XXH_ERROR; XXH3_reset_internal(statePtr, 0, secret, secretSize); @@ -4575,17 +5476,30 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) { if (statePtr == NULL) return XXH_ERROR; if (seed==0) return XXH3_64bits_reset(statePtr); - if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); return XXH_OK; } +/*! 
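The streaming life cycle implied by these functions - create, reset, update, digest, free - in a minimal sketch (standalone xxhash.h usage assumed; error handling trimmed to the essentials):

```cpp
#include "xxhash.h"
#include <cstdio>
#include <cstring>

int main() {
    XXH3_state_t* state = XXH3_createState();
    if (state == NULL) return 1;
    if (XXH3_64bits_reset_withSeed(state, /*seed=*/7) != XXH_OK) return 1;

    // Feed input in arbitrary chunks; the state buffers partial stripes.
    const char part1[] = "hello ";
    const char part2[] = "streaming world";
    XXH3_64bits_update(state, part1, std::strlen(part1));
    XXH3_64bits_update(state, part2, std::strlen(part2));

    XXH64_hash_t h = XXH3_64bits_digest(state);  // state remains usable
    std::printf("%016llx\n", (unsigned long long)h);
    XXH3_freeState(state);
    return 0;
}
```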
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + /* Note : when XXH3_consumeStripes() is invoked, * there must be a guarantee that at least one more byte must be consumed from input * so that the function can blindly consume all stripes using the "normal" secret segment */ @@ -4594,7 +5508,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, const xxh_u8* XXH_RESTRICT input, size_t nbStripes, const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ @@ -4603,45 +5517,58 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, /* need a scrambling operation */ size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); + f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock); f_scramble(acc, secret + secretLimit); - XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); + f_acc(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock); *nbStripesSoFarPtr = nbStripesAfterBlock; } else { - XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes); *nbStripesSoFarPtr += nbStripes; } } +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif /* * Both XXH3_64bits_update and XXH3_128bits_update use this routine. */ XXH_FORCE_INLINE XXH_errorcode -XXH3_update(XXH3_state_t* state, - const xxh_u8* input, size_t len, - XXH3_f_accumulate_512 f_acc512, +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { - if (input==NULL) -#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + XXH_ASSERT(len == 0); return XXH_OK; -#else - return XXH_ERROR; -#endif + } + XXH_ASSERT(state != NULL); { const xxh_u8* const bEnd = input + len; const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; - +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. 
+ * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif state->totalLen += len; XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); - if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + /* small input : just fill in tmp buffer */ + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { XXH_memcpy(state->buffer + state->bufferedSize, input, len); state->bufferedSize += (XXH32_hash_t)len; return XXH_OK; } - /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ @@ -4653,45 +5580,82 @@ XXH3_update(XXH3_state_t* state, size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); input += loadSize; - XXH3_consumeStripes(state->acc, + XXH3_consumeStripes(acc, &state->nbStripesSoFar, state->nbStripesPerBlock, state->buffer, XXH3_INTERNALBUFFER_STRIPES, secret, state->secretLimit, - f_acc512, f_scramble); + f_acc, f_scramble); state->bufferedSize = 0; } XXH_ASSERT(input < bEnd); - /* Consume input by a multiple of internal buffer size */ - if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) { - const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; - do { - XXH3_consumeStripes(state->acc, - &state->nbStripesSoFar, state->nbStripesPerBlock, - input, XXH3_INTERNALBUFFER_STRIPES, - secret, state->secretLimit, - f_acc512, f_scramble); - input += XXH3_INTERNALBUFFER_SIZE; - } while (input<limit); - /* for last partial stripe */ - memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + /* large input to consume : ingest per full block */ + if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar); + /* join to current block's end */ + { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar; + XXH_ASSERT(nbStripesToEnd <= nbStripes); + f_acc(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd); + f_scramble(acc, secret + state->secretLimit); + state->nbStripesSoFar = 0; + input += nbStripesToEnd * XXH_STRIPE_LEN; + nbStripes -= nbStripesToEnd; + } + /* consume per entire blocks */ + while(nbStripes >= state->nbStripesPerBlock) { + f_acc(acc, input, secret, state->nbStripesPerBlock); + f_scramble(acc, secret + state->secretLimit); + input += state->nbStripesPerBlock * XXH_STRIPE_LEN; + nbStripes -= state->nbStripesPerBlock; + } + /* consume last partial block */ + f_acc(acc, input, secret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; + XXH_ASSERT(input < bEnd); /* at least some bytes left */ + state->nbStripesSoFar = nbStripes; + /* buffer predecessor of last partial stripe */ + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN); + } else { + /* content to consume <= block size */ + /* Consume input by a multiple of internal buffer size */ + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE; + do { + XXH3_consumeStripes(acc, + &state->nbStripesSoFar,
state->nbStripesPerBlock, + input, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + input += XXH3_INTERNALBUFFER_SIZE; + } while (input<limit); + /* for last partial stripe */ + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + } } - XXH_ASSERT(input < bEnd); /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + memcpy(state->acc, acc, sizeof(acc)); +#endif } return XXH_OK; } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len) +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) { return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } @@ -4704,7 +5668,7 @@ XXH3_digest_long (XXH64_hash_t* acc, * Digest on a local copy. This way, the state remains unaltered, and it can * continue ingesting more input afterwards. */ - memcpy(acc, state->acc, sizeof(state->acc)); + XXH_memcpy(acc, state->acc, sizeof(state->acc)); if (state->bufferedSize >= XXH_STRIPE_LEN) { size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; size_t nbStripesSoFar = state->nbStripesSoFar; @@ -4712,7 +5676,7 @@ XXH3_digest_long (XXH64_hash_t* acc, &nbStripesSoFar, state->nbStripesPerBlock, state->buffer, nbStripes, secret, state->secretLimit, - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); /* last stripe */ XXH3_accumulate_512(acc, state->buffer + state->bufferedSize - XXH_STRIPE_LEN, @@ -4721,16 +5685,16 @@ XXH3_digest_long (XXH64_hash_t* acc, xxh_u8 lastStripe[XXH_STRIPE_LEN]; size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ - memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); - memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); XXH3_accumulate_512(acc, lastStripe, secret + state->secretLimit - XXH_SECRET_LASTACC_START); } } -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) { const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { @@ -4741,57 +5705,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state) (xxh_u64)state->totalLen * XXH_PRIME64_1); } /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ - if (state->seed) + if (state->useSeed) return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); } - - -#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) - -/*!
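A useful property of the rewritten update path is that chunking must not change the result. The sketch below feeds the same buffer once in one shot and once in irregular chunks and asserts the digests match; the buffer contents and chunk schedule are arbitrary:

```cpp
#include "xxhash.h"
#include <cassert>
#include <vector>

int main() {
    // 10 KiB of deterministic bytes: long enough to hit both the
    // full-block ingestion path and the buffered path.
    std::vector<unsigned char> data(10 * 1024);
    for (size_t i = 0; i < data.size(); i++)
        data[i] = (unsigned char)((i * 2654435761u) >> 24);

    XXH64_hash_t one_shot = XXH3_64bits(data.data(), data.size());

    XXH3_state_t* state = XXH3_createState();
    XXH3_64bits_reset(state);
    size_t pos = 0, chunk = 1;  // irregular chunk sizes on purpose
    while (pos < data.size()) {
        size_t n = chunk < data.size() - pos ? chunk : data.size() - pos;
        XXH3_64bits_update(state, data.data() + pos, n);
        pos += n;
        chunk = chunk * 2 + 1;
    }
    assert(XXH3_64bits_digest(state) == one_shot);  // streaming == one-shot
    XXH3_freeState(state);
    return 0;
}
```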
@ingroup xxh3_family */ -XXH_PUBLIC_API void -XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize) -{ - XXH_ASSERT(secretBuffer != NULL); - if (customSeedSize == 0) { - memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); - return; - } - XXH_ASSERT(customSeed != NULL); - - { size_t const segmentSize = sizeof(XXH128_hash_t); - size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize; - XXH128_canonical_t scrambler; - XXH64_hash_t seeds[12]; - size_t segnb; - XXH_ASSERT(nbSegments == 12); - XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */ - XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); - - /* - * Copy customSeed to seeds[], truncating or repeating as necessary. - */ - { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds)); - size_t filled = toFill; - memcpy(seeds, customSeed, toFill); - while (filled < sizeof(seeds)) { - toFill = XXH_MIN(filled, sizeof(seeds) - filled); - memcpy((char*)seeds + filled, seeds, toFill); - filled += toFill; - } } - - /* generate secret */ - memcpy(secretBuffer, &scrambler, sizeof(scrambler)); - for (segnb=1; segnb < nbSegments; segnb++) { - size_t const segmentStart = segnb * segmentSize; - XXH128_canonical_t segment; - XXH128_canonicalFromHash(&segment, - XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) ); - memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment)); - } } -} +#endif /* !XXH_NO_STREAM */ /* ========================================== @@ -4811,7 +5730,7 @@ XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSee * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). */ -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { /* A doubled version of 1to3_64b with different constants. */ @@ -4840,7 +5759,7 @@ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ } } -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -4867,7 +5786,7 @@ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_ } } -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(input != NULL); @@ -4942,7 +5861,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64 /* * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN */ -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) { XXH_ASSERT(len <= 16); @@ -4973,7 +5892,7 @@ XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, } -XXH_FORCE_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -4984,6 +5903,16 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, { XXH128_hash_t acc; acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. 
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else if (len > 32) { if (len > 64) { if (len > 96) { @@ -4994,6 +5923,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); } acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif { XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; h128.high64 = (acc.low64 * XXH_PRIME64_1) @@ -5006,7 +5936,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, } } -XXH_NO_INLINE XXH128_hash_t +XXH_NO_INLINE XXH_PUREF XXH128_hash_t XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, XXH64_hash_t seed) @@ -5015,25 +5945,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); { XXH128_hash_t acc; - int const nbRounds = (int)len / 32; - int i; + unsigned i; acc.low64 = len * XXH_PRIME64_1; acc.high64 = 0; - for (i=0; i<4; i++) { + /* + * We set `i` to offset + 32. We do this so that unchanged + * `len` can be used as upper bound. This reaches a sweet spot + * where both x86 and aarch64 get simple agen and good codegen + * for the loop. + */ + for (i = 32; i < 160; i += 32) { acc = XXH128_mix32B(acc, - input + (32 * i), - input + (32 * i) + 16, - secret + (32 * i), + input + i - 32, + input + i - 16, + secret + i - 32, seed); } acc.low64 = XXH3_avalanche(acc.low64); acc.high64 = XXH3_avalanche(acc.high64); - XXH_ASSERT(nbRounds >= 4); - for (i=4 ; i < nbRounds; i++) { + /* + * NB: `i <= len` will duplicate the last 32 bytes if + * len % 32 was zero. This is an unfortunate necessity to keep + * the hash result stable. + */ + for (i=160; i <= len; i += 32) { acc = XXH128_mix32B(acc, - input + (32 * i), - input + (32 * i) + 16, - secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + input + i - 32, + input + i - 16, + secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, seed); } /* last bytes */ @@ -5041,7 +5980,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, input + len - 16, input + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, - 0ULL - seed); + (XXH64_hash_t)0 - seed); { XXH128_hash_t h128; h128.low64 = acc.low64 + acc.high64; @@ -5058,12 +5997,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble) { XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; - XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble); + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); /* converge into final hash */ XXH_STATIC_ASSERT(sizeof(acc) == 64); @@ -5081,46 +6020,47 @@ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, } /* - * It's important for performance that XXH3_hashLong is not inlined. + * It's important for performance that XXH3_hashLong() is not inlined.
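To see that the new offset-based bounds visit the same input positions as the old round counter, here is a plain-arithmetic comparison (no hashing involved); 224 is just a sample length in (128, 240]:

```cpp
#include <cstdio>

int main() {
    unsigned len = 224;  // sample length in (128, 240]

    // Old formulation: round counter, offsets 32*i.
    std::printf("old:");
    int nbRounds = (int)len / 32;
    for (int i = 4; i < nbRounds; i++) std::printf(" %d", 32 * i);

    // New formulation: byte offset starting at 160, bound i <= len.
    std::printf("\nnew:");
    for (unsigned i = 160; i <= len; i += 32) std::printf(" %u", i - 32);
    std::printf("\n");
    // Both lines print "128 160 192". The i <= len bound reproduces the
    // old rounds exactly; when len % 32 == 0 the final round covers the
    // last 32 bytes, which the closing mix rereads -- kept for stability.
    return 0;
}
```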
*/ -XXH_NO_INLINE XXH128_hash_t +XXH_NO_INLINE XXH_PUREF XXH128_hash_t XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; (void)secret; (void)secretLen; return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } /* - * It's important for performance that XXH3_hashLong is not inlined. + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. */ -XXH_NO_INLINE XXH128_hash_t +XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) { (void)seed64; return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } XXH_FORCE_INLINE XXH128_hash_t XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, XXH64_hash_t seed64, - XXH3_f_accumulate_512 f_acc512, + XXH3_f_accumulate f_acc, XXH3_f_scrambleAcc f_scramble, XXH3_f_initCustomSecret f_initSec) { if (seed64 == 0) return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), - f_acc512, f_scramble); + f_acc, f_scramble); { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; f_initSec(secret, seed64); return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), - f_acc512, f_scramble); + f_acc, f_scramble); } } @@ -5133,7 +6073,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len, { (void)secret; (void)secretLen; return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, - XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); } typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, @@ -5163,88 +6103,94 @@ XXH3_128bits_internal(const void* input, size_t len, /* === Public XXH128 API === */ -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) { return XXH3_128bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_default); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) { return XXH3_128bits_internal(input, len, 0, (const xxh_u8*)secret, secretSize, XXH3_hashLong_128b_withSecret); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) { return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_128b_withSeed); } -/*! @ingroup xxh3_family */ +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH128(const void* input, size_t len, XXH64_hash_t seed) +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) { return XXH3_128bits_withSeed(input, len, seed); } /* === XXH3 128-bit streaming === */ - +#ifndef XXH_NO_STREAM /* - * All the functions are actually the same as for 64-bit streaming variant. + * All initialization and update functions are identical to the 64-bit streaming variant. * The only difference is the finalization routine. */ -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset(XXH3_state_t* statePtr) +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) { - if (statePtr == NULL) return XXH_ERROR; - XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; + return XXH3_64bits_reset(statePtr); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) { - if (statePtr == NULL) return XXH_ERROR; - XXH3_reset_internal(statePtr, 0, secret, secretSize); - if (secret == NULL) return XXH_ERROR; - if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; - return XXH_OK; + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) { - if (statePtr == NULL) return XXH_ERROR; - if (seed==0) return XXH3_128bits_reset(statePtr); - if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); - XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); - return XXH_OK; + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH_errorcode -XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) { return XXH3_update(state, (const xxh_u8*)input, len, - XXH3_accumulate_512, XXH3_scrambleAcc); + XXH3_accumulate, XXH3_scrambleAcc); } -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) { const unsigned char* const secret = (state->extSecret == NULL) ?
state->customSecret : state->extSecret; if (state->totalLen > XXH3_MIDSIZE_MAX) { @@ -5268,13 +6214,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), secret, state->secretLimit + XXH_STRIPE_LEN); } - +#endif /* !XXH_NO_STREAM */ /* 128-bit utility functions */ #include <string.h> /* memcmp, memcpy */ /* return : 1 is equal, 0 if different */ -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) { /* note : XXH128_hash_t is compact, it has no padding byte */ @@ -5282,11 +6228,11 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) } /* This prototype is compatible with stdlib's qsort(). - * return : >0 if *h128_1 > *h128_2 - * <0 if *h128_1 < *h128_2 - * =0 if *h128_1 == *h128_2 */ -/*! @ingroup xxh3_family */ -XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) { XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; @@ -5298,22 +6244,22 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) /*====== Canonical representation ======*/ -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API void -XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) { XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); if (XXH_CPU_LITTLE_ENDIAN) { hash.high64 = XXH_swap64(hash.high64); hash.low64 = XXH_swap64(hash.low64); } - memcpy(dst, &hash.high64, sizeof(hash.high64)); - memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); } -/*! @ingroup xxh3_family */ +/*! @ingroup XXH3_family */ XXH_PUBLIC_API XXH128_hash_t -XXH128_hashFromCanonical(const XXH128_canonical_t* src) +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) { XXH128_hash_t h; h.high64 = XXH_readBE64(src); @@ -5321,10 +6267,81 @@ XXH128_hashFromCanonical(const XXH128_canonical_t* src) return h; } + + +/* ========================================== + * Secret generators + * ========================================== + */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*!
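Round-tripping through the canonical (big-endian) representation and the qsort-style comparator, as a usage sketch of the functions above:

```cpp
#include "xxhash.h"
#include <cassert>
#include <cstring>

int main() {
    const char msg[] = "canonical round trip";
    XXH128_hash_t h = XXH3_128bits(msg, std::strlen(msg));

    // The canonical form is big-endian bytes: stable across platforms,
    // suitable for storage or transmission.
    XXH128_canonical_t canon;
    XXH128_canonicalFromHash(&canon, h);
    XXH128_hash_t back = XXH128_hashFromCanonical(&canon);
    assert(XXH128_isEqual(h, back));

    // XXH128_cmp is qsort-compatible; equal hashes compare as 0.
    assert(XXH128_cmp(&h, &back) == 0);
    return 0;
}
```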
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n<nbSeg16; n++) { + XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n); + XXH3_combine16((char*)secretBuffer + n*16, h128); + } + /* last segment */ + XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler)); + } + return XXH_OK; +} /* END RocksDB customizations */ +// clang-format off #if defined (__cplusplus) extern "C" { #endif diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc index 877a884eb02..d8f7d928a08 100644 --- a/utilities/backup/backup_engine.cc +++ b/utilities/backup/backup_engine.cc @@ -13,11 +13,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -88,9 +90,7 @@ const std::string kSharedChecksumDirSlash = kSharedChecksumDirName + "/"; void BackupStatistics::IncrementNumberSuccessBackup() { number_success_backup++; } -void BackupStatistics::IncrementNumberFailBackup() { - number_fail_backup++; -} +void BackupStatistics::IncrementNumberFailBackup() { number_fail_backup++; } uint32_t BackupStatistics::GetNumberSuccessBackup() const { return number_success_backup; @@ -157,16 +157,10 @@ class BackupEngineImpl { void GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids) const; - IOStatus RestoreDBFromBackup(const RestoreOptions& options, - BackupID backup_id, const std::string& db_dir, - const std::string& wal_dir) const; - - IOStatus RestoreDBFromLatestBackup(const RestoreOptions& options, - const std::string& db_dir, - const std::string& wal_dir) const { - // Note: don't read latest_valid_backup_id_ outside of lock - return RestoreDBFromBackup(options, kLatestBackupIDMarker, db_dir, wal_dir); - } + IOStatus RestoreDBFromBackup( + const RestoreOptions& options, BackupID backup_id, + const std::string& db_dir, const std::string& wal_dir, + const std::list<const BackupEngineImpl*>& locked_restore_from_dirs) const; IOStatus VerifyBackup(BackupID backup_id, bool verify_with_checksum = false) const; @@ -222,6 +216,7 @@ class BackupEngineImpl { FileInfo& operator=(const FileInfo&) = delete; int refs; + // Relative path from backup dir const std::string filename; const uint64_t size; // crc32c checksum as hex.
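Sketch of the new XXH3_generateSecret() contract - explicit output size, error code on bad arguments - feeding the generated secret straight into XXH3_64bits_withSecret. The static-linking-only define is an assumption about how the experimental API is exposed:

```cpp
#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"
#include <cstdio>
#include <cstring>

int main() {
    // Derive a full-size secret from low-entropy seed material.
    const char seed_material[] = "not a great password";
    unsigned char secret[XXH3_SECRET_SIZE_MIN];  // >= 136 bytes required
    if (XXH3_generateSecret(secret, sizeof secret, seed_material,
                            std::strlen(seed_material)) != XXH_OK) {
        return 1;
    }
    const char msg[] = "payload";
    XXH64_hash_t h =
        XXH3_64bits_withSecret(msg, std::strlen(msg), secret, sizeof secret);
    std::printf("%016llx\n", (unsigned long long)h);
    return 0;
}
```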
empty == unknown / unavailable @@ -235,7 +230,7 @@ class BackupEngineImpl { const std::string db_session_id; Temperature temp; - std::string GetDbFileName() { + std::string GetDbFileName() const { std::string rv; // extract the filename part size_t slash = filename.find_last_of('/'); @@ -399,12 +394,8 @@ class BackupEngineImpl { timestamp_ = /* something clearly fabricated */ 1; } } - int64_t GetTimestamp() const { - return timestamp_; - } - uint64_t GetSize() const { - return size_; - } + int64_t GetTimestamp() const { return timestamp_; } + uint64_t GetSize() const { return size_; } uint32_t GetNumberFiles() const { return static_cast<uint32_t>(files_.size()); } @@ -421,6 +412,10 @@ class BackupEngineImpl { IOStatus AddFile(std::shared_ptr<FileInfo> file_info); + void AddExcludedFile(const std::string& relative_file) { + excluded_files_.emplace_back(relative_file); + } + IOStatus Delete(bool delete_meta = true); bool Empty() const { return files_.empty(); } @@ -437,6 +432,10 @@ class BackupEngineImpl { return files_; } + const std::vector<BackupExcludedFileInfo>& GetExcludedFiles() const { + return excluded_files_; + } + // @param abs_path_to_size Pre-fetched file sizes (bytes). IOStatus LoadFromFile( const std::string& backup_dir, @@ -494,6 +493,7 @@ class BackupEngineImpl { std::string const meta_tmp_filename_; // files with relative paths (without "/" prefix!!) std::vector<std::shared_ptr<FileInfo>> files_; + std::vector<BackupExcludedFileInfo> excluded_files_; std::unordered_map<std::string, std::shared_ptr<FileInfo>>* file_infos_; Env* env_; mutable std::shared_ptr<Env> env_for_open_; @@ -506,12 +506,11 @@ class BackupEngineImpl { bool include_file_details) const; inline std::string GetAbsolutePath( - const std::string &relative_path = "") const { + const std::string& relative_path = "") const { assert(relative_path.size() == 0 || relative_path[0] != '/'); return options_.backup_dir + "/" + relative_path; } - inline std::string GetPrivateFileRel(BackupID backup_id, - bool tmp = false, + inline std::string GetPrivateFileRel(BackupID backup_id, bool tmp = false, const std::string& file = "") const { assert(file.size() == 0 || file[0] != '/'); return kPrivateDirSlash + std::to_string(backup_id) + (tmp ?
".tmp" : "") + @@ -688,17 +687,19 @@ class BackupEngineImpl { return *this; } - CopyOrCreateWorkItem( - std::string _src_path, std::string _dst_path, - const Temperature _src_temperature, const Temperature _dst_temperature, - std::string _contents, Env* _src_env, Env* _dst_env, - EnvOptions _src_env_options, bool _sync, RateLimiter* _rate_limiter, - uint64_t _size_limit, Statistics* _stats, - std::function _progress_callback = []() {}, - const std::string& _src_checksum_func_name = - kUnknownFileChecksumFuncName, - const std::string& _src_checksum_hex = "", - const std::string& _db_id = "", const std::string& _db_session_id = "") + CopyOrCreateWorkItem(std::string _src_path, std::string _dst_path, + const Temperature _src_temperature, + const Temperature _dst_temperature, + std::string _contents, Env* _src_env, Env* _dst_env, + EnvOptions _src_env_options, bool _sync, + RateLimiter* _rate_limiter, uint64_t _size_limit, + Statistics* _stats, + std::function _progress_callback = {}, + const std::string& _src_checksum_func_name = + kUnknownFileChecksumFuncName, + const std::string& _src_checksum_hex = "", + const std::string& _db_id = "", + const std::string& _db_session_id = "") : src_path(std::move(_src_path)), dst_path(std::move(_dst_path)), src_temperature(_src_temperature), @@ -727,12 +728,12 @@ class BackupEngineImpl { std::string dst_path; std::string dst_relative; BackupAfterCopyOrCreateWorkItem() - : shared(false), - needed_to_copy(false), - backup_env(nullptr), - dst_path_tmp(""), - dst_path(""), - dst_relative("") {} + : shared(false), + needed_to_copy(false), + backup_env(nullptr), + dst_path_tmp(""), + dst_path(""), + dst_relative("") {} BackupAfterCopyOrCreateWorkItem( BackupAfterCopyOrCreateWorkItem&& o) noexcept { @@ -765,6 +766,9 @@ class BackupEngineImpl { dst_relative(std::move(_dst_relative)) {} }; + using BackupWorkItemPair = + std::pair; + struct RestoreAfterCopyOrCreateWorkItem { std::future result; std::string from_file; @@ -814,13 +818,14 @@ class BackupEngineImpl { // @param contents If non-empty, the file will be created with these contents. IOStatus AddBackupFileWorkItem( std::unordered_set& live_dst_paths, - std::vector& backup_items_to_finish, - BackupID backup_id, bool shared, const std::string& src_dir, + std::deque& backup_items_to_finish, + std::deque* excludable_items, BackupID backup_id, + bool shared, const std::string& src_dir, const std::string& fname, // starts with "/" const EnvOptions& src_env_options, RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes, Statistics* stats, uint64_t size_limit = 0, bool shared_checksum = false, - std::function progress_callback = []() {}, + std::function progress_callback = {}, const std::string& contents = std::string(), const std::string& src_checksum_func_name = kUnknownFileChecksumFuncName, const std::string& src_checksum_str = kUnknownFileChecksum, @@ -832,8 +837,8 @@ class BackupEngineImpl { std::map> backups_; std::map>> corrupt_backups_; - std::unordered_map> backuped_file_infos_; + std::unordered_map> + backuped_file_infos_; std::atomic stop_backup_; // options data @@ -930,8 +935,37 @@ class BackupEngineImplThreadSafe : public BackupEngine, IOStatus RestoreDBFromBackup(const RestoreOptions& options, BackupID backup_id, const std::string& db_dir, const std::string& wal_dir) const override { - ReadLock lock(&mutex_); - return impl_.RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + // TSAN reports a lock inversion (potential deadlock) if we acquire read + // locks in different orders. 
Assuming the implementation of RWMutex + // allows simultaneous read locks, there should be no deadlock, because + // there is no write lock involved here. Nevertheless, to appease TSAN and + // in case of degraded RWMutex implementation, we lock the BackupEngines + // including this one and those in options.alternate_dirs in a consistent + // order. + // However, locked_restore_from_dirs is kept in "search" order. + std::list<const BackupEngineImpl*> locked_restore_from_dirs; + std::vector<port::RWMutex*> mutexes; + + // Add `this` + locked_restore_from_dirs.emplace_back(&impl_); + mutexes.push_back(&mutex_); + + // Add alternates + for (BackupEngineReadOnlyBase* be : options.alternate_dirs) { + BackupEngineImplThreadSafe* bets = + static_cast_with_check<BackupEngineImplThreadSafe>( + be->AsBackupEngine()); + locked_restore_from_dirs.emplace_back(&bets->impl_); + mutexes.push_back(&bets->mutex_); + } + + // Acquire read locks in pointer order + std::sort(mutexes.begin(), mutexes.end()); + std::vector<ReadLock> locks(mutexes.begin(), mutexes.end()); + + // Impl + return impl_.RestoreDBFromBackup(options, backup_id, db_dir, wal_dir, + locked_restore_from_dirs); } using BackupEngine::RestoreDBFromLatestBackup; @@ -948,6 +982,8 @@ class BackupEngineImplThreadSafe : public BackupEngine, return impl_.VerifyBackup(backup_id, verify_with_checksum); } + BackupEngine* AsBackupEngine() override { return this; } + // Not public API but needed IOStatus Initialize() { // No locking needed @@ -973,6 +1009,7 @@ class BackupEngineImplThreadSafe : public BackupEngine, mutable port::RWMutex mutex_; BackupEngineImpl impl_; }; + } // namespace IOStatus BackupEngine::Open(const BackupEngineOptions& options, Env* env, @@ -1044,8 +1081,8 @@ IOStatus BackupEngineImpl::Initialize() { options_.max_valid_backups_to_open = std::numeric_limits<int32_t>::max(); ROCKS_LOG_WARN( options_.info_log, - "`max_valid_backups_to_open` is not set to the default value. Ignoring " - "its value since BackupEngine is not read-only."); + "`max_valid_backups_to_open` is not set to the default value. " + "Ignoring its value since BackupEngine is not read-only."); } // gather the list of directories that we need to create @@ -1147,8 +1184,7 @@ IOStatus BackupEngineImpl::Initialize() { // load the backups if any, until valid_backups_to_open of the latest // non-corrupted backups have been successfully opened. int valid_backups_to_open = options_.max_valid_backups_to_open; - for (auto backup_iter = backups_.rbegin(); - backup_iter != backups_.rend(); + for (auto backup_iter = backups_.rbegin(); backup_iter != backups_.rend(); ++backup_iter) { assert(latest_backup_id_ == 0 || latest_backup_id_ > backup_iter->first); if (latest_backup_id_ == 0) { @@ -1308,6 +1344,12 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( return IOStatus::InvalidArgument("App metadata too large"); } + bool maybe_exclude_items = bool{options.exclude_files_callback}; + if (maybe_exclude_items && options_.schema_version < 2) { + return IOStatus::InvalidArgument( + "exclude_files_callback requires schema_version >= 2"); + } + if (options.decrease_background_thread_cpu_priority) { if (options.background_thread_cpu_priority < threads_cpu_priority_) { threads_cpu_priority_.store(options.background_thread_cpu_priority); @@ -1372,7 +1414,8 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( // live file. 
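The pointer-sorted read-lock acquisition above is the standard remedy for lock-order inversion: every thread takes its set of locks in one global order (here, mutex addresses), so overlapping lock sets cannot deadlock. A minimal standalone sketch of the same idea, using std::shared_mutex in place of RocksDB's port::RWMutex and ReadLock (illustrative names, not RocksDB API):

#include <algorithm>
#include <shared_mutex>
#include <vector>

// Acquire read (shared) locks on an arbitrary set of mutexes in address
// order, so two threads locking overlapping sets can never deadlock.
std::vector<std::shared_lock<std::shared_mutex>> LockAllShared(
    std::vector<std::shared_mutex*> mutexes) {
  std::sort(mutexes.begin(), mutexes.end());
  // Locking the same mutex twice would self-deadlock; the engine code above
  // can assume distinct engines, but this generic sketch deduplicates.
  mutexes.erase(std::unique(mutexes.begin(), mutexes.end()), mutexes.end());
  std::vector<std::shared_lock<std::shared_mutex>> locks;
  locks.reserve(mutexes.size());
  for (std::shared_mutex* m : mutexes) {
    locks.emplace_back(*m);  // blocks until the shared lock is held
  }
  return locks;  // RAII: all locks release when the vector is destroyed
}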
std::unordered_set live_dst_paths; - std::vector backup_items_to_finish; + std::deque excludable_items; + std::deque backup_items_to_finish; // Add a CopyOrCreateWorkItem to the channel for each live file Status disabled = db->DisableFileDeletions(); DBOptions db_options = db->GetDBOptions(); @@ -1444,7 +1487,8 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( break; } io_st = AddBackupFileWorkItem( - live_dst_paths, backup_items_to_finish, new_backup_id, + live_dst_paths, backup_items_to_finish, + maybe_exclude_items ? &excludable_items : nullptr, new_backup_id, options_.share_table_files && (type == kTableFile || type == kBlobFile), src_dirname, fname, src_env_options, rate_limiter, type, @@ -1459,7 +1503,8 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( FileType type) { Log(options_.info_log, "add file for backup %s", fname.c_str()); return AddBackupFileWorkItem( - live_dst_paths, backup_items_to_finish, new_backup_id, + live_dst_paths, backup_items_to_finish, + maybe_exclude_items ? &excludable_items : nullptr, new_backup_id, false /* shared */, "" /* src_dir */, fname, EnvOptions() /* src_env_options */, rate_limiter, type, contents.size(), db_options.statistics.get(), 0 /* size_limit */, @@ -1472,7 +1517,46 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( new_backup->SetSequenceNumber(sequence_number); } } - ROCKS_LOG_INFO(options_.info_log, "add files for backup done, wait finish."); + ROCKS_LOG_INFO(options_.info_log, "add files for backup done."); + if (io_s.ok() && maybe_exclude_items) { + assert(options.exclude_files_callback); + size_t count = excludable_items.size(); + std::vector maybe_exclude_files; + maybe_exclude_files.reserve(count); + for (auto& e : excludable_items) { + maybe_exclude_files.emplace_back( + BackupExcludedFileInfo(e.second.dst_relative)); + } + if (count > 0) { + try { + options.exclude_files_callback( + &maybe_exclude_files.front(), + /*end pointer*/ &maybe_exclude_files.back() + 1); + } catch (const std::exception& exn) { + io_s = IOStatus::Aborted("Exception in exclude_files_callback: " + + std::string(exn.what())); + } catch (...) 
{ + io_s = IOStatus::Aborted("Unknown exception in exclude_files_callback"); + } + } + if (io_s.ok()) { + for (size_t i = 0; i < count; ++i) { + auto& e = excludable_items[i]; + if (maybe_exclude_files[i].exclude_decision) { + new_backup.get()->AddExcludedFile(e.second.dst_relative); + } else { + files_to_copy_or_create_.write(std::move(e.first)); + backup_items_to_finish.push_back(std::move(e.second)); + } + } + } + excludable_items.clear(); + } else { + assert(!options.exclude_files_callback); + assert(excludable_items.empty()); + } + ROCKS_LOG_INFO(options_.info_log, + "dispatch files for backup done, wait for finish."); IOStatus item_io_status; for (auto& item : backup_items_to_finish) { item.result.wait(); @@ -1643,6 +1727,11 @@ IOStatus BackupEngineImpl::DeleteBackupNoGC(BackupID backup_id) { return io_s; } backups_.erase(backup); + if (backups_.empty()) { + latest_valid_backup_id_ = 0; + } else { + latest_valid_backup_id_ = backups_.rbegin()->first; + } } else { auto corrupt = corrupt_backups_.find(backup_id); if (corrupt == corrupt_backups_.end()) { @@ -1702,7 +1791,7 @@ void BackupEngineImpl::SetBackupInfoFromBackupMeta( auto& file_details = backup_info->file_details; file_details.reserve(meta.GetFiles().size()); for (auto& file_ptr : meta.GetFiles()) { - BackupFileInfo& finfo = *file_details.emplace(file_details.end()); + BackupFileInfo& finfo = file_details.emplace_back(); finfo.relative_filename = file_ptr->filename; finfo.size = file_ptr->size; finfo.directory = dir; @@ -1714,7 +1803,10 @@ void BackupEngineImpl::SetBackupInfoFromBackupMeta( finfo.file_type = type; } // TODO: temperature, file_checksum, file_checksum_func_name + // finfo.temperature = file_ptr->temp; } + backup_info->excluded_files = meta.GetExcludedFiles(); + backup_info->name_for_open = GetAbsolutePath(GetPrivateFileRel(id)); backup_info->name_for_open.pop_back(); // remove trailing '/' backup_info->env_for_open = meta.GetEnvForOpen(); @@ -1772,7 +1864,8 @@ void BackupEngineImpl::GetCorruptedBackups( IOStatus BackupEngineImpl::RestoreDBFromBackup( const RestoreOptions& options, BackupID backup_id, - const std::string& db_dir, const std::string& wal_dir) const { + const std::string& db_dir, const std::string& wal_dir, + const std::list& locked_restore_from_dirs) const { assert(initialized_); if (backup_id == kLatestBackupIDMarker) { // Note: Read latest_valid_backup_id_ inside of lock @@ -1832,6 +1925,37 @@ IOStatus BackupEngineImpl::RestoreDBFromBackup( DeleteChildren(db_dir); } + // Files to restore, and from where (taking into account excluded files) + std::vector> + restore_file_infos; + restore_file_infos.reserve(backup->GetFiles().size() + + backup->GetExcludedFiles().size()); + + for (const auto& ef : backup->GetExcludedFiles()) { + const std::string& file = ef.relative_file; + + bool found = false; + for (auto be : locked_restore_from_dirs) { + auto it = be->backuped_file_infos_.find(file); + if (it != backuped_file_infos_.end()) { + restore_file_infos.emplace_back(be, &*it->second); + found = true; + break; + } + } + if (!found) { + return IOStatus::InvalidArgument( + "Excluded file " + file + " not found in other backups nor in " + + std::to_string(locked_restore_from_dirs.size() - 1) + + " alternate backup directories"); + } + } + + // Non-excluded files + for (const auto& file_info_shared : backup->GetFiles()) { + restore_file_infos.emplace_back(this, &*file_info_shared); + } + IOStatus io_s; std::vector restore_items_to_finish; std::string temporary_current_file; @@ -1839,8 +1963,13 @@ IOStatus 
BackupEngineImpl::RestoreDBFromBackup( std::unique_ptr db_dir_for_fsync; std::unique_ptr wal_dir_for_fsync; - for (const auto& file_info : backup->GetFiles()) { + for (const auto& engine_and_file_info : restore_file_infos) { + const FileInfo* file_info = engine_and_file_info.second; const std::string& file = file_info->filename; + std::string absolute_file = + engine_and_file_info.first->GetAbsolutePath(file); + Env* src_env = engine_and_file_info.first->backup_env_; + // 1. get DB filename std::string dst = file_info->GetDbFileName(); @@ -1884,8 +2013,8 @@ IOStatus BackupEngineImpl::RestoreDBFromBackup( ROCKS_LOG_INFO(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str()); CopyOrCreateWorkItem copy_or_create_work_item( - GetAbsolutePath(file), dst, Temperature::kUnknown /* src_temp */, - file_info->temp, "" /* contents */, backup_env_, db_env_, + absolute_file, dst, Temperature::kUnknown /* src_temp */, + file_info->temp, "" /* contents */, src_env, db_env_, EnvOptions() /* src_env_options */, options_.sync, options_.restore_rate_limiter.get(), file_info->size, nullptr /* stats */); @@ -2125,8 +2254,19 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( while (*bytes_toward_next_callback >= options_.callback_trigger_interval_size) { *bytes_toward_next_callback -= options_.callback_trigger_interval_size; - std::lock_guard lock(byte_report_mutex_); - progress_callback(); + if (progress_callback) { + std::lock_guard lock(byte_report_mutex_); + try { + progress_callback(); + } catch (const std::exception& exn) { + io_s = IOStatus::Aborted("Exception in progress_callback: " + + std::string(exn.what())); + break; + } catch (...) { + io_s = IOStatus::Aborted("Unknown exception in progress_callback"); + break; + } + } } } while (io_s.ok() && contents.empty() && data.size() > 0 && size_limit > 0); @@ -2147,11 +2287,12 @@ IOStatus BackupEngineImpl::CopyOrCreateFile( // fname will always start with "/" IOStatus BackupEngineImpl::AddBackupFileWorkItem( std::unordered_set& live_dst_paths, - std::vector& backup_items_to_finish, - BackupID backup_id, bool shared, const std::string& src_dir, - const std::string& fname, const EnvOptions& src_env_options, - RateLimiter* rate_limiter, FileType file_type, uint64_t size_bytes, - Statistics* stats, uint64_t size_limit, bool shared_checksum, + std::deque& backup_items_to_finish, + std::deque* excludable_items, BackupID backup_id, + bool shared, const std::string& src_dir, const std::string& fname, + const EnvOptions& src_env_options, RateLimiter* rate_limiter, + FileType file_type, uint64_t size_bytes, Statistics* stats, + uint64_t size_limit, bool shared_checksum, std::function progress_callback, const std::string& contents, const std::string& src_checksum_func_name, const std::string& src_checksum_str, const Temperature src_temperature) { @@ -2345,8 +2486,6 @@ IOStatus BackupEngineImpl::AddBackupFileWorkItem( // Step 3: Add work item if (!contents.empty() || need_to_copy) { - ROCKS_LOG_INFO(options_.info_log, "Copying %s to %s", fname.c_str(), - copy_dest_path->c_str()); CopyOrCreateWorkItem copy_or_create_work_item( src_dir.empty() ? 
"" : src_path, *copy_dest_path, src_temperature, Temperature::kUnknown /*dst_temp*/, contents, db_env_, backup_env_, @@ -2356,8 +2495,21 @@ IOStatus BackupEngineImpl::AddBackupFileWorkItem( BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( copy_or_create_work_item.result.get_future(), shared, need_to_copy, backup_env_, temp_dest_path, final_dest_path, dst_relative); - files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); - backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item)); + if (excludable_items != nullptr && shared && shared_checksum && + need_to_copy) { + ROCKS_LOG_INFO(options_.info_log, "Copying (if not excluded) %s to %s", + fname.c_str(), copy_dest_path->c_str()); + excludable_items->emplace_back(std::move(copy_or_create_work_item), + std::move(after_copy_or_create_work_item)); + } else { + // For files known not excluded, can start copying even before finishing + // the checkpoint + ROCKS_LOG_INFO(options_.info_log, "Copying %s to %s", fname.c_str(), + copy_dest_path->c_str()); + files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); + backup_items_to_finish.push_back( + std::move(after_copy_or_create_work_item)); + } } else { std::promise promise_result; BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( @@ -2748,6 +2900,7 @@ const std::string kAppMetaDataFieldName{"metadata"}; const std::string kFileCrc32cFieldName{"crc32"}; const std::string kFileSizeFieldName{"size"}; const std::string kTemperatureFieldName{"temp"}; +const std::string kExcludedFieldName{"ni::excluded"}; // Marks a (future) field that should cause failure if not recognized. // Other fields are assumed to be ignorable. For example, in the future @@ -2768,8 +2921,7 @@ const std::string kNonIgnorableFieldPrefix{"ni::"}; // ... 
//---------------------------------------------------------- // -// For schema version 2.x (not in public APIs, but -// forward-compatibility started): +// For schema version 2.x: //---------------------------------------------------------- // schema_version // @@ -2912,20 +3064,6 @@ IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( const std::string& filename = components[0]; - uint64_t actual_size; - const std::shared_ptr<FileInfo> file_info = GetFile(filename); - if (file_info) { - actual_size = file_info->size; - } else { - std::string abs_path = backup_dir + "/" + filename; - auto e = abs_path_to_size.find(abs_path); - if (e == abs_path_to_size.end()) { - return IOStatus::Corruption( - "Pathname in meta file not found on disk: " + abs_path); - } - actual_size = e->second; - } - if (schema_major_version >= 2) { if (components.size() % 2 != 1) { return IOStatus::Corruption( @@ -2947,8 +3085,10 @@ IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( } } + std::optional<uint64_t> expected_size{}; std::string checksum_hex; Temperature temp = Temperature::kUnknown; + bool excluded = false; for (unsigned i = 1; i < components.size(); i += 2) { const std::string& field_name = components[i]; const std::string& field_data = components[i + 1]; if (field_name == kFileCrc32cFieldName) { @@ -2962,14 +3102,7 @@ IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( } checksum_hex = ChecksumInt32ToHex(checksum_value); } else if (field_name == kFileSizeFieldName) { - uint64_t ex_size = - std::strtoull(field_data.c_str(), nullptr, /*base*/ 10); - if (ex_size != actual_size) { - return IOStatus::Corruption( - "For file " + filename + " expected size " + - std::to_string(ex_size) + " but found size" + - std::to_string(actual_size)); - } + expected_size = std::strtoull(field_data.c_str(), nullptr, /*base*/ 10); } else if (field_name == kTemperatureFieldName) { auto iter = temperature_string_map.find(field_data); if (iter != temperature_string_map.end()) { @@ -2980,6 +3113,15 @@ IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( // be safe. 
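Since an excluded file has no local copy, the size field can no longer be validated eagerly while parsing; it is parsed into a std::optional and checked only for files actually expected on disk. A small sketch of that deferred check, with illustrative names (not RocksDB API):

#include <cstdint>
#include <map>
#include <optional>
#include <string>

// Validate the optional "size" field from a meta-file line. Excluded files
// are skipped: there is nothing on disk to compare against.
bool CheckMetaSizeField(const std::string& filename, bool excluded,
                        const std::optional<uint64_t>& expected_size,
                        const std::map<std::string, uint64_t>& on_disk_sizes,
                        std::string* err) {
  if (excluded) {
    return true;
  }
  auto it = on_disk_sizes.find(filename);
  if (it == on_disk_sizes.end()) {
    *err = "Pathname in meta file not found on disk: " + filename;
    return false;
  }
  if (expected_size.has_value() && *expected_size != it->second) {
    *err = "For file " + filename + " expected size " +
           std::to_string(*expected_size) + " but found size " +
           std::to_string(it->second);
    return false;
  }
  return true;
}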
temp = Temperature::kUnknown; } + } else if (field_name == kExcludedFieldName) { + if (field_data == "true") { + excluded = true; + } else if (field_data == "false") { + excluded = false; + } else { + return IOStatus::NotSupported("Unrecognized value \"" + field_data + + "\" for field " + field_name); + } } else if (StartsWith(field_name, kNonIgnorableFieldPrefix)) { return IOStatus::NotSupported("Unrecognized non-ignorable file field " + field_name + " (from future version?)"); @@ -2992,8 +3134,29 @@ IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( } } - files.emplace_back(new FileInfo(filename, actual_size, checksum_hex, - /*id*/ "", /*sid*/ "", temp)); + if (excluded) { + excluded_files_.emplace_back(filename); + } else { + // Verify file exists, with expected size + std::string abs_path = backup_dir + "/" + filename; + auto e = abs_path_to_size.find(abs_path); + if (e == abs_path_to_size.end()) { + return IOStatus::Corruption( + "Pathname in meta file not found on disk: " + abs_path); + } + uint64_t actual_size = e->second; + if (expected_size.has_value() && *expected_size != actual_size) { + return IOStatus::Corruption("For file " + filename + " expected size " + + std::to_string(*expected_size) + + " but found size " + + std::to_string(actual_size)); + } + + // NOTE: FileInfo will be coalesced for sharing later (AddFile below) + files.emplace_back( + std::make_shared<FileInfo>(filename, actual_size, checksum_hex, + /*id*/ "", /*sid*/ "", temp)); + } } if (footer_present) { @@ -3049,7 +3212,7 @@ IOStatus BackupEngineImpl::BackupMeta::LoadFromFile( const std::vector<std::string> minor_version_strings{ "", // invalid major version 0 "", // implicit major version 1 - "2.0", + "2.1", }; IOStatus BackupEngineImpl::BackupMeta::StoreToFile( @@ -3127,6 +3290,11 @@ IOStatus BackupEngineImpl::BackupMeta::StoreToFile( buf << "\n"; } + for (const auto& file : excluded_files_) { + assert(schema_version >= 2); + buf << file.relative_file << " " << kExcludedFieldName << " true\n"; + } + if (schema_test_options && !schema_test_options->footer_fields.empty()) { buf << kFooterMarker << "\n"; for (auto& e : schema_test_options->footer_fields) { diff --git a/utilities/backup/backup_engine_impl.h b/utilities/backup/backup_engine_impl.h index 0ac218cb24d..398f47f2725 100644 --- a/utilities/backup/backup_engine_impl.h +++ b/utilities/backup/backup_engine_impl.h @@ -23,7 +23,7 @@ struct TEST_BackupMetaSchemaOptions { // unpublished schema version 2, for the life of this object (not backup_dir). // TEST_BackupMetaSchemaOptions offers some customization for testing. 
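The "ni::" prefix on kExcludedFieldName is what keeps older readers safe: a reader may silently skip unknown fields, but must fail on an unknown field whose name is marked non-ignorable, since treating an excluded file as a regular one would corrupt a restore. A sketch of that dispatch rule (illustrative, not the actual RocksDB parser):

#include <string>

enum class FieldAction { kHandled, kIgnorable, kMustFail };

// Classify a metadata field name encountered while parsing a backup META
// file, honoring the non-ignorable "ni::" prefix.
FieldAction ClassifyField(const std::string& field_name, bool recognized) {
  static const std::string kNonIgnorablePrefix{"ni::"};
  if (recognized) {
    return FieldAction::kHandled;
  }
  if (field_name.compare(0, kNonIgnorablePrefix.size(),
                         kNonIgnorablePrefix) == 0) {
    return FieldAction::kMustFail;  // unknown field from a future version
  }
  return FieldAction::kIgnorable;  // unknown, but safe to skip
}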
void TEST_SetBackupMetaSchemaOptions( - BackupEngine *engine, const TEST_BackupMetaSchemaOptions &options); + BackupEngine* engine, const TEST_BackupMetaSchemaOptions& options); // Modifies the BackupEngine(Impl) to use specified clocks for backup and // restore rate limiters created by default if not specified by users for diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index b145c3a8c33..d780a1b2b26 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -63,8 +65,11 @@ class DummyDB : public StackableDB { public: /* implicit */ DummyDB(const Options& options, const std::string& dbname) - : StackableDB(nullptr), options_(options), dbname_(dbname), - deletions_enabled_(true), sequence_number_(0) {} + : StackableDB(nullptr), + options_(options), + dbname_(dbname), + deletions_enabled_(true), + sequence_number_(0) {} SequenceNumber GetLatestSequenceNumber() const override { return ++sequence_number_; @@ -139,7 +144,7 @@ class DummyDB : public StackableDB { std::string dbname_; bool deletions_enabled_; mutable SequenceNumber sequence_number_; -}; // DummyDB +}; // DummyDB class TestFs : public FileSystemWrapper { public: @@ -545,7 +550,7 @@ class FileManager : public EnvWrapper { private: Random rnd_; -}; // FileManager +}; // FileManager // utility functions namespace { @@ -608,8 +613,8 @@ class BackupEngineTest : public testing::Test { kShareWithChecksum, }; - const std::vector kAllShareOptions = { - kNoShare, kShareNoChecksum, kShareWithChecksum}; + const std::vector kAllShareOptions = {kNoShare, kShareNoChecksum, + kShareWithChecksum}; BackupEngineTest() { // set up files @@ -632,7 +637,7 @@ class BackupEngineTest : public testing::Test { // set up db options options_.create_if_missing = true; options_.paranoid_checks = true; - options_.write_buffer_size = 1 << 17; // 128KB + options_.write_buffer_size = 1 << 17; // 128KB options_.wal_dir = dbname_; options_.enable_blob_files = true; @@ -751,7 +756,7 @@ class BackupEngineTest : public testing::Test { void CloseBackupEngine() { backup_engine_.reset(nullptr); } // cross-cutting test of GetBackupInfo - void AssertBackupInfoConsistency() { + void AssertBackupInfoConsistency(bool allow_excluded = false) { std::vector backup_info; backup_engine_->GetBackupInfo(&backup_info, /*with file details*/ true); std::map file_sizes; @@ -771,6 +776,9 @@ class BackupEngineTest : public testing::Test { sum_for_backup += file.size; } ASSERT_EQ(backup.size, sum_for_backup); + if (!allow_excluded) { + ASSERT_EQ(backup.excluded_files.size(), 0); + } } std::vector corrupt_backup_ids; @@ -1210,6 +1218,10 @@ TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) { // check backup 5 AssertBackupConsistency(5, 0, max_key); + // check that "latest backup" still works after deleting latest + ASSERT_OK(backup_engine_->DeleteBackup(5)); + AssertBackupConsistency(0, 0, 3 * keys_iteration, max_key); + CloseBackupEngine(); } #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) @@ -3087,10 +3099,25 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) { TEST_F(BackupEngineTest, ProgressCallbackDuringBackup) { DestroyDBWithoutCheck(dbname_, options_); - // Too big for this small DB - engine_options_->callback_trigger_interval_size = 100000; + OpenDBAndBackupEngine(true); FillDB(db_.get(), 0, 100); + + // First test exception handling + // Easily small 
enough for this small DB + engine_options_->callback_trigger_interval_size = 1000; + OpenBackupEngine(); + ASSERT_TRUE( + backup_engine_->CreateNewBackup(db_.get(), true, []() { throw 42; }) + .IsAborted()); + ASSERT_TRUE(backup_engine_ + ->CreateNewBackup(db_.get(), true, + []() { throw std::out_of_range("blah"); }) + .IsAborted()); + + // Too big for this small DB + engine_options_->callback_trigger_interval_size = 100000; + OpenBackupEngine(); bool is_callback_invoked = false; ASSERT_OK(backup_engine_->CreateNewBackup( db_.get(), true, @@ -3540,107 +3567,106 @@ TEST_F(BackupEngineTest, Concurrency) { std::array restore_verify_threads; for (uint32_t i = 0; i < read_threads.size(); ++i) { uint32_t sleep_micros = rng() % 100000; - read_threads[i] = - std::thread([this, i, sleep_micros, &db_opts, &be_opts, - &restore_verify_threads, &limiter] { - test_db_env_->SleepForMicroseconds(sleep_micros); - - // Whether to also re-open the BackupEngine, potentially seeing - // additional backups - bool reopen = i == 3; - // Whether we are going to restore "latest" - bool latest = i > 1; - - BackupEngine* my_be; - if (reopen) { - ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &my_be)); - } else { - my_be = backup_engine_.get(); - } + read_threads[i] = std::thread([this, i, sleep_micros, &db_opts, &be_opts, + &restore_verify_threads, &limiter] { + test_db_env_->SleepForMicroseconds(sleep_micros); - // Verify metadata (we don't receive updates from concurrently - // creating a new backup) - std::vector infos; - my_be->GetBackupInfo(&infos); - const uint32_t count = static_cast(infos.size()); - infos.clear(); - if (reopen) { - ASSERT_GE(count, 2U); - ASSERT_LE(count, 4U); - fprintf(stderr, "Reopen saw %u backups\n", count); - } else { - ASSERT_EQ(count, 2U); - } - std::vector ids; - my_be->GetCorruptedBackups(&ids); - ASSERT_EQ(ids.size(), 0U); - - // (Eventually, see below) Restore one of the backups, or "latest" - std::string restore_db_dir = dbname_ + "/restore" + std::to_string(i); - DestroyDir(test_db_env_.get(), restore_db_dir).PermitUncheckedError(); - BackupID to_restore; - if (latest) { - to_restore = count; - } else { - to_restore = i + 1; - } + // Whether to also re-open the BackupEngine, potentially seeing + // additional backups + bool reopen = i == 3; + // Whether we are going to restore "latest" + bool latest = i > 1; - // Open restored DB to verify its contents, but test atomic restore - // by doing it async and ensuring we either get OK or InvalidArgument - restore_verify_threads[i] = - std::thread([this, &db_opts, restore_db_dir, to_restore] { - DB* restored; - Status s; - for (;;) { - s = DB::Open(db_opts, restore_db_dir, &restored); - if (s.IsInvalidArgument()) { - // Restore hasn't finished - test_db_env_->SleepForMicroseconds(1000); - continue; - } else { - // We should only get InvalidArgument if restore is - // incomplete, or OK if complete - ASSERT_OK(s); - break; - } - } - int factor = std::min(static_cast(to_restore), max_factor); - AssertExists(restored, 0, factor * keys_iteration); - AssertEmpty(restored, factor * keys_iteration, - (factor + 1) * keys_iteration); - delete restored; - }); - - // (Ok now) Restore one of the backups, or "latest" - if (latest) { - ASSERT_OK(my_be->RestoreDBFromLatestBackup(restore_db_dir, - restore_db_dir)); - } else { - ASSERT_OK(my_be->VerifyBackup(to_restore, true)); - ASSERT_OK(my_be->RestoreDBFromBackup(to_restore, restore_db_dir, - restore_db_dir)); - } + BackupEngine* my_be; + if (reopen) { + 
ASSERT_OK(BackupEngine::Open(test_db_env_.get(), be_opts, &my_be)); + } else { + my_be = backup_engine_.get(); + } - // Test for race condition in reconfiguring limiter - // FIXME: this could set to a different value in all threads, except - // GenericRateLimiter::SetBytesPerSecond has a write-write race - // reported by TSAN - if (i == 0) { - limiter->SetBytesPerSecond(2000000000); - } + // Verify metadata (we don't receive updates from concurrently + // creating a new backup) + std::vector infos; + my_be->GetBackupInfo(&infos); + const uint32_t count = static_cast(infos.size()); + infos.clear(); + if (reopen) { + ASSERT_GE(count, 2U); + ASSERT_LE(count, 4U); + fprintf(stderr, "Reopen saw %u backups\n", count); + } else { + ASSERT_EQ(count, 2U); + } + std::vector ids; + my_be->GetCorruptedBackups(&ids); + ASSERT_EQ(ids.size(), 0U); + + // (Eventually, see below) Restore one of the backups, or "latest" + std::string restore_db_dir = dbname_ + "/restore" + std::to_string(i); + DestroyDir(test_db_env_.get(), restore_db_dir).PermitUncheckedError(); + BackupID to_restore; + if (latest) { + to_restore = count; + } else { + to_restore = i + 1; + } - // Re-verify metadata (we don't receive updates from concurrently - // creating a new backup) - my_be->GetBackupInfo(&infos); - ASSERT_EQ(infos.size(), count); - my_be->GetCorruptedBackups(&ids); - ASSERT_EQ(ids.size(), 0); - // fprintf(stderr, "Finished read thread\n"); + // Open restored DB to verify its contents, but test atomic restore + // by doing it async and ensuring we either get OK or InvalidArgument + restore_verify_threads[i] = + std::thread([this, &db_opts, restore_db_dir, to_restore] { + DB* restored; + Status s; + for (;;) { + s = DB::Open(db_opts, restore_db_dir, &restored); + if (s.IsInvalidArgument()) { + // Restore hasn't finished + test_db_env_->SleepForMicroseconds(1000); + continue; + } else { + // We should only get InvalidArgument if restore is + // incomplete, or OK if complete + ASSERT_OK(s); + break; + } + } + int factor = std::min(static_cast(to_restore), max_factor); + AssertExists(restored, 0, factor * keys_iteration); + AssertEmpty(restored, factor * keys_iteration, + (factor + 1) * keys_iteration); + delete restored; + }); + + // (Ok now) Restore one of the backups, or "latest" + if (latest) { + ASSERT_OK( + my_be->RestoreDBFromLatestBackup(restore_db_dir, restore_db_dir)); + } else { + ASSERT_OK(my_be->VerifyBackup(to_restore, true)); + ASSERT_OK(my_be->RestoreDBFromBackup(to_restore, restore_db_dir, + restore_db_dir)); + } - if (reopen) { - delete my_be; - } - }); + // Test for race condition in reconfiguring limiter + // FIXME: this could set to a different value in all threads, except + // GenericRateLimiter::SetBytesPerSecond has a write-write race + // reported by TSAN + if (i == 0) { + limiter->SetBytesPerSecond(2000000000); + } + + // Re-verify metadata (we don't receive updates from concurrently + // creating a new backup) + my_be->GetBackupInfo(&infos); + ASSERT_EQ(infos.size(), count); + my_be->GetCorruptedBackups(&ids); + ASSERT_EQ(ids.size(), 0); + // fprintf(stderr, "Finished read thread\n"); + + if (reopen) { + delete my_be; + } + }); } BackupEngine* alt_be; @@ -4174,7 +4200,7 @@ TEST_F(BackupEngineTest, FileTemperatures) { &info, /*include_file_details*/ true)); ASSERT_GT(info.file_details.size(), 2); for (auto& e : info.file_details) { - ASSERT_EQ(expected_temps[e.file_number], e.temperature); + EXPECT_EQ(expected_temps[e.file_number], e.temperature); } // Restore backup to another virtual (tiered) dir 
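For context on the ExcludeFiles test that follows: CreateBackupOptions::exclude_files_callback hands the application a contiguous [files_begin, files_end) range of MaybeExcludeBackupFile entries, and the application sets exclude_decision on each. A sketch of a plausible production callback that excludes files already stored elsewhere (already_backed_up is an illustrative caller-maintained set, not part of the API):

#include <string>
#include <unordered_set>

#include "rocksdb/utilities/backup_engine.h"

using ROCKSDB_NAMESPACE::CreateBackupOptions;
using ROCKSDB_NAMESPACE::MaybeExcludeBackupFile;

// The set must outlive the CreateNewBackup() call; the lambda captures it
// by reference.
void SetUpExcludes(
    CreateBackupOptions& cbo,
    const std::unordered_set<std::string>& already_backed_up) {
  cbo.exclude_files_callback = [&already_backed_up](
                                   MaybeExcludeBackupFile* files_begin,
                                   MaybeExcludeBackupFile* files_end) {
    for (auto* f = files_begin; f != files_end; ++f) {
      // relative_file is the backup-relative path, e.g. under
      // shared_checksum/ for shared table and blob files.
      f->exclude_decision = already_backed_up.count(f->info.relative_file) > 0;
    }
  };
}

At restore time, the directory holding the excluded files is supplied through RestoreOptions::alternate_dirs, as the test exercises:

#include <string>

#include "rocksdb/utilities/backup_engine.h"

using ROCKSDB_NAMESPACE::BackupEngine;
using ROCKSDB_NAMESPACE::BackupEngineReadOnlyBase;
using ROCKSDB_NAMESPACE::IOStatus;
using ROCKSDB_NAMESPACE::RestoreOptions;

// Restore from `primary`, fetching any excluded files from `fallback`.
// Fails with InvalidArgument if an excluded file is in none of the dirs.
IOStatus RestoreWithFallback(BackupEngine* primary,
                             BackupEngineReadOnlyBase* fallback,
                             const std::string& db_dir) {
  RestoreOptions ro;
  ro.alternate_dirs.push_front(fallback);
  return primary->RestoreDBFromLatestBackup(db_dir, /*wal_dir=*/db_dir, ro);
}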
@@ -4196,7 +4222,178 @@ TEST_F(BackupEngineTest, FileTemperatures) { } } -} // anon namespace +TEST_F(BackupEngineTest, ExcludeFiles) { + // Required for excluding files + engine_options_->schema_version = 2; + + // Need a sufficient set of file numbers + options_.level0_file_num_compaction_trigger = 100; + + OpenDBAndBackupEngine(true, false, kShareWithChecksum); + // Need a sufficient set of file numbers + const int keys_iteration = 5000; + FillDB(db_.get(), 0, keys_iteration / 3); + FillDB(db_.get(), keys_iteration / 3, keys_iteration * 2 / 3); + FillDB(db_.get(), keys_iteration * 2 / 3, keys_iteration); + CloseAndReopenDB(); + + BackupEngine* alt_backup_engine; + BackupEngineOptions alt_engine_options{*engine_options_}; + // Use an alternate Env to test that support + std::string backup_alt_chroot = test::PerThreadDBPath("db_alt_backups"); + EXPECT_OK(Env::Default()->CreateDirIfMissing(backup_alt_chroot)); + alt_engine_options.backup_dir = "/altbk"; + std::shared_ptr<FileSystem> alt_fs{ + NewChrootFileSystem(FileSystem::Default(), backup_alt_chroot)}; + std::unique_ptr<Env> alt_env{new CompositeEnvWrapper(Env::Default(), alt_fs)}; + alt_engine_options.backup_env = alt_env.get(); + + ASSERT_OK(BackupEngine::Open(test_db_env_.get(), alt_engine_options, + &alt_backup_engine)); + + // Ensure each backup is the same set of files + db_.reset(); + DB* db = nullptr; + ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db)); + + // A callback that throws should cleanly fail the backup creation. + // Do this early to ensure later operations still work. + CreateBackupOptions cbo; + cbo.exclude_files_callback = [](MaybeExcludeBackupFile* /*files_begin*/, + MaybeExcludeBackupFile* /*files_end*/) { + throw 42; + }; + ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db).IsAborted()); + cbo.exclude_files_callback = [](MaybeExcludeBackupFile* /*files_begin*/, + MaybeExcludeBackupFile* /*files_end*/) { + throw std::out_of_range("blah"); + }; + ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db).IsAborted()); + + // Include files only in given bucket, based on modulus and remainder + constexpr int modulus = 4; + int remainder = 0; + + cbo.exclude_files_callback = [&remainder](MaybeExcludeBackupFile* files_begin, + MaybeExcludeBackupFile* files_end) { + for (auto* f = files_begin; f != files_end; ++f) { + std::string s = StringSplit(f->info.relative_file, '/').back(); + s = s.substr(0, s.find("_")); + int64_t num = std::strtoll(s.c_str(), nullptr, /*base*/ 10); + // Exclude if not a match + f->exclude_decision = (num % modulus) != remainder; + } + }; + + BackupID first_id{}; + BackupID last_alt_id{}; + remainder = 0; + ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db, &first_id)); + AssertBackupInfoConsistency(/*allow excluded*/ true); + remainder = 1; + ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db)); + AssertBackupInfoConsistency(/*allow excluded*/ true); + remainder = 2; + ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db)); + AssertBackupInfoConsistency(/*allow excluded*/ true); + remainder = 3; + ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db, &last_alt_id)); + AssertBackupInfoConsistency(/*allow excluded*/ true); + + // Close DB + ASSERT_OK(db->Close()); + delete db; + db = nullptr; + + for (auto be_pair : + {std::make_pair(backup_engine_.get(), alt_backup_engine), + std::make_pair(alt_backup_engine, backup_engine_.get())}) { + DestroyDB(dbname_, options_); + RestoreOptions ro; + // Fails without alternate dir + ASSERT_TRUE(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro) + 
.IsInvalidArgument()); + + DestroyDB(dbname_, options_); + // Works with alternate dir + ro.alternate_dirs.push_front(be_pair.second); + ASSERT_OK(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro)); + + // Check DB contents + db = OpenDB(); + AssertExists(db, 0, keys_iteration); + delete db; + } + + // Should still work after close and re-open + CloseBackupEngine(); + OpenBackupEngine(); + + for (auto be_pair : + {std::make_pair(backup_engine_.get(), alt_backup_engine), + std::make_pair(alt_backup_engine, backup_engine_.get())}) { + DestroyDB(dbname_, options_); + RestoreOptions ro; + ro.alternate_dirs.push_front(be_pair.second); + ASSERT_OK(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro)); + } + + // Deletion semantics are tricky when within a single backup dir one backup + // includes a file and the other backup excluded the file. The excluded one + // does not have a persistent record of metadata like file checksum, etc. + // Although it would be possible to amend the backup with the excluded file, + // that is not currently supported (unless you open the backup as read-only + // DB and take another backup of it). The "excluded" reference to the file + // is like a weak reference: it doesn't prevent the file from being deleted + // if all the backups with "included" references to it are deleted. + CloseBackupEngine(); + OpenBackupEngine(); + + AssertBackupInfoConsistency(/*allow excluded*/ true); + + ASSERT_OK(backup_engine_->DeleteBackup(first_id)); + ASSERT_OK(alt_backup_engine->DeleteBackup(last_alt_id)); + + // Includes check for any leaked backup files + AssertBackupInfoConsistency(/*allow excluded*/ true); + + // Excluded file(s) deleted, unable to restore + for (auto be_pair : + {std::make_pair(backup_engine_.get(), alt_backup_engine), + std::make_pair(alt_backup_engine, backup_engine_.get())}) { + RestoreOptions ro; + ro.alternate_dirs.push_front(be_pair.second); + ASSERT_TRUE(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro) + .IsInvalidArgument()); + } + + // Close & Re-open (no crash, etc.) + CloseBackupEngine(); + OpenBackupEngine(); + + AssertBackupInfoConsistency(/*allow excluded*/ true); + + // Excluded file(s) deleted, unable to restore + for (auto be_pair : + {std::make_pair(backup_engine_.get(), alt_backup_engine), + std::make_pair(alt_backup_engine, backup_engine_.get())}) { + RestoreOptions ro; + ro.alternate_dirs.push_front(be_pair.second); + ASSERT_TRUE(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro) + .IsInvalidArgument()); + } + + // Ensure files are not leaked after removing everything. 
+ ASSERT_OK(backup_engine_->DeleteBackup(first_id + 1)); + ASSERT_OK(alt_backup_engine->DeleteBackup(last_alt_id - 1)); + + // Includes check for leaked backups files + AssertBackupInfoConsistency(/*allow excluded*/ false); + + delete alt_backup_engine; +} + +} // namespace } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index ab430f8c0d8..e9d92486f9b 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -155,8 +155,7 @@ class BlobDB : public StackableDB { using ROCKSDB_NAMESPACE::StackableDB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& options, - const std::vector& keys, + const ReadOptions& options, const std::vector& keys, std::vector* values) override = 0; virtual std::vector MultiGet( const ReadOptions& options, @@ -179,8 +178,8 @@ class BlobDB : public StackableDB { PinnableSlice* /*values*/, Status* statuses, const bool /*sorted_input*/ = false) override { for (size_t i = 0; i < num_keys; ++i) { - statuses[i] = Status::NotSupported( - "Blob DB doesn't support batched MultiGet"); + statuses[i] = + Status::NotSupported("Blob DB doesn't support batched MultiGet"); } } diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 4522c193e4e..87e294c5c09 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_db_impl.h" + #include #include #include @@ -1023,9 +1024,8 @@ Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key, return PutUntil(options, key, value, kNoExpiration); } -Status BlobDBImpl::PutWithTTL(const WriteOptions& options, - const Slice& key, const Slice& value, - uint64_t ttl) { +Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, uint64_t ttl) { uint64_t now = EpochNow(); uint64_t expiration = kNoExpiration - now > ttl ? 
now + ttl : kNoExpiration; return PutUntil(options, key, value, expiration); @@ -1385,9 +1385,9 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, return s; } -std::vector BlobDBImpl::MultiGet( - const ReadOptions& read_options, - const std::vector& keys, std::vector* values) { +std::vector BlobDBImpl::MultiGet(const ReadOptions& read_options, + const std::vector& keys, + std::vector* values) { StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS); RecordTick(statistics_, BLOB_DB_NUM_MULTIGET); // Get a snapshot to avoid blob file get deleted between we diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index 07d88113623..0b4dbf5e53b 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -124,8 +124,7 @@ class BlobDBImpl : public BlobDB { using BlobDB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& read_options, - const std::vector& keys, + const ReadOptions& read_options, const std::vector& keys, std::vector* values) override; using BlobDB::Write; diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 9d5591bfa51..1744bda1ff4 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -58,8 +58,7 @@ class BlobDBTest : public testing::Test { }; BlobDBTest() - : dbname_(test::PerThreadDBPath("blob_db_test")), - blob_db_(nullptr) { + : dbname_(test::PerThreadDBPath("blob_db_test")), blob_db_(nullptr) { mock_clock_ = std::make_shared(SystemClock::Default()); mock_env_.reset(new CompositeEnvWrapper(Env::Default(), mock_clock_)); fault_injection_env_.reset(new FaultInjectionTestEnv(Env::Default())); @@ -209,7 +208,7 @@ class BlobDBTest : public testing::Test { void VerifyDB(DB *db, const std::map &data) { // Verify normal Get - auto* cfh = db->DefaultColumnFamily(); + auto *cfh = db->DefaultColumnFamily(); for (auto &p : data) { PinnableSlice value_slice; ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice)); @@ -2383,7 +2382,7 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) { } // namespace ROCKSDB_NAMESPACE // A black-box test for the ttl wrapper around rocksdb -int main(int argc, char** argv) { +int main(int argc, char **argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index 5a628fd211f..1e063299015 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -226,7 +226,9 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob, DumpSlice(Slice(slice.data(), static_cast(key_size)), show_key); if (show_blob != DisplayType::kNone) { fprintf(stdout, " blob : "); - DumpSlice(Slice(slice.data() + static_cast(key_size), static_cast(value_size)), show_blob); + DumpSlice(Slice(slice.data() + static_cast(key_size), + static_cast(value_size)), + show_blob); } if (show_uncompressed_blob != DisplayType::kNone) { fprintf(stdout, " raw blob : "); diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index d092d45f8ab..c68e557c676 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -7,9 +7,9 @@ #include "utilities/blob_db/blob_file.h" #include -#include #include +#include #include #include "db/column_family.h" @@ -210,16 +210,14 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, file_size_ = file_size; } else { ROCKS_LOG_ERROR(info_log_, - "Failed to 
get size of blob file %" PRIu64 - ", status: %s", + "Failed to get size of blob file %" PRIu64 ", status: %s", file_number_, s.ToString().c_str()); return s; } if (file_size < BlobLogHeader::kSize) { - ROCKS_LOG_ERROR(info_log_, - "Incomplete blob file blob file %" PRIu64 - ", size: %" PRIu64, - file_number_, file_size); + ROCKS_LOG_ERROR( + info_log_, "Incomplete blob file %" PRIu64 ", size: %" PRIu64, + file_number_, file_size); return Status::Corruption("Incomplete blob file header."); } @@ -250,10 +248,9 @@ Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs, Env::IO_TOTAL /* rate_limiter_priority */); } if (!s.ok()) { - ROCKS_LOG_ERROR(info_log_, - "Failed to read header of blob file %" PRIu64 - ", status: %s", - file_number_, s.ToString().c_str()); + ROCKS_LOG_ERROR( + info_log_, "Failed to read header of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); return s; } BlobLogHeader header; @@ -294,10 +291,9 @@ Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); } if (!s.ok()) { - ROCKS_LOG_ERROR(info_log_, - "Failed to read footer of blob file %" PRIu64 - ", status: %s", - file_number_, s.ToString().c_str()); + ROCKS_LOG_ERROR( + info_log_, "Failed to read footer of blob file %" PRIu64 ", status: %s", + file_number_, s.ToString().c_str()); return s; } BlobLogFooter footer; diff --git a/utilities/cache_dump_load_impl.cc b/utilities/cache_dump_load_impl.cc index 221634ea036..b5e21291b34 100644 --- a/utilities/cache_dump_load_impl.cc +++ b/utilities/cache_dump_load_impl.cc @@ -7,8 +7,6 @@ #include "table/block_based/block_based_table_reader.h" #ifndef ROCKSDB_LITE -#include "utilities/cache_dump_load_impl.h" - #include "cache/cache_entry_roles.h" #include "file/writable_file_writer.h" #include "port/lang.h" @@ -17,6 +15,7 @@ #include "rocksdb/utilities/ldb_cmd.h" #include "table/format.h" #include "util/crc32c.h" +#include "utilities/cache_dump_load_impl.h" namespace ROCKSDB_NAMESPACE { @@ -68,8 +67,7 @@ IOStatus CacheDumperImpl::DumpCacheEntriesToWriter() { return IOStatus::InvalidArgument("System clock is null"); } clock_ = options_.clock; - // We copy the Cache Deleter Role Map as its member. - role_map_ = CopyCacheDeleterRoleMap(); + // Set the sequence number sequence_num_ = 0; @@ -81,7 +79,8 @@ IOStatus CacheDumperImpl::DumpCacheEntriesToWriter() { // Then, we iterate the block cache and dump out the blocks that are not // filtered out. - cache_->ApplyToAllEntries(DumpOneBlockCallBack(), {}); + std::string buf; + cache_->ApplyToAllEntries(DumpOneBlockCallBack(buf), {}); // Finally, write the footer io_s = WriteFooter(); @@ -106,77 +105,57 @@ bool CacheDumperImpl::ShouldFilterOut(const Slice& key) { // This is the callback function which will be applied to // Cache::ApplyToAllEntries. In this callback function, we will get the block // type, decide if the block needs to be dumped based on the filter, and write -// the block through the provided writer. -std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)> -CacheDumperImpl::DumpOneBlockCallBack() { - return [&](const Slice& key, void* value, size_t /*charge*/, - Cache::DeleterFn deleter) { - // Step 1: get the type of the block from role_map_ - auto e = role_map_.find(deleter); - CacheEntryRole role; - CacheDumpUnitType type = CacheDumpUnitType::kBlockTypeMax; - if (e == role_map_.end()) { - role = CacheEntryRole::kMisc; - } else { - role = e->second; +// the block through the provided writer. `buf` is passed in for efficient +// reuse. 
+std::function<void(const Slice&, Cache::ObjectPtr, size_t, const Cache::CacheItemHelper*)> +CacheDumperImpl::DumpOneBlockCallBack(std::string& buf) { + return [&](const Slice& key, Cache::ObjectPtr value, size_t /*charge*/, + const Cache::CacheItemHelper* helper) { + if (helper == nullptr || helper->size_cb == nullptr || + helper->saveto_cb == nullptr) { + // Not compatible with dumping. Skip this entry. + return; } - bool filter_out = false; - // Step 2: based on the key prefix, check if the block should be filtered out. - if (ShouldFilterOut(key)) { - filter_out = true; - } + CacheEntryRole role = helper->role; + CacheDumpUnitType type = CacheDumpUnitType::kBlockTypeMax; - // Step 3: based on the block type, get the block raw pointer and length. - const char* block_start = nullptr; - size_t block_len = 0; switch (role) { case CacheEntryRole::kDataBlock: type = CacheDumpUnitType::kData; - block_start = (static_cast<Block*>(value))->data(); - block_len = (static_cast<Block*>(value))->size(); break; case CacheEntryRole::kFilterBlock: type = CacheDumpUnitType::kFilter; - block_start = (static_cast<ParsedFullFilterBlock*>(value)) - ->GetBlockContentsData() - .data(); - block_len = (static_cast<ParsedFullFilterBlock*>(value)) - ->GetBlockContentsData() - .size(); break; case CacheEntryRole::kFilterMetaBlock: type = CacheDumpUnitType::kFilterMetaBlock; - block_start = (static_cast<Block*>(value))->data(); - block_len = (static_cast<Block*>(value))->size(); break; case CacheEntryRole::kIndexBlock: type = CacheDumpUnitType::kIndex; - block_start = (static_cast<Block*>(value))->data(); - block_len = (static_cast<Block*>(value))->size(); - break; - case CacheEntryRole::kDeprecatedFilterBlock: - // Obsolete - filter_out = true; - break; - case CacheEntryRole::kMisc: - filter_out = true; - break; - case CacheEntryRole::kOtherBlock: - filter_out = true; - break; - case CacheEntryRole::kWriteBuffer: - filter_out = true; break; default: - filter_out = true; + // Filter out other entries + // FIXME? Do we need the CacheDumpUnitTypes? UncompressionDict? + return; } - // Step 4: if the block should not be filtered out, write the block to the - // CacheDumpWriter - if (!filter_out && block_start != nullptr) { - WriteBlock(type, key, Slice(block_start, block_len)) - .PermitUncheckedError(); + // based on the key prefix, check if the block should be filtered out. + if (ShouldFilterOut(key)) { + return; + } + + assert(type != CacheDumpUnitType::kBlockTypeMax); + + // Use cache item helper to get persistable data + // FIXME: reduce copying + size_t len = helper->size_cb(value); + buf.assign(len, '\0'); + Status s = helper->saveto_cb(value, /*start*/ 0, len, buf.data()); + + if (s.ok()) { + // Write it out + WriteBlock(type, key, buf).PermitUncheckedError(); } }; } @@ -265,8 +244,6 @@ IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() { if (reader_ == nullptr) { return IOStatus::InvalidArgument("CacheDumpReader is null"); } - // we copy the Cache Deleter Role Map as its member. - role_map_ = CopyCacheDeleterRoleMap(); // Step 2: read the header // TODO: we need to check the cache dump format version and RocksDB version @@ -281,7 +258,7 @@ IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() { // Step 3: read out the rest of the blocks from the reader. The loop will stop // either I/O status is not ok or we reach the end. 
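The new callback above relies on the generic size_cb/saveto_cb pair of Cache::CacheItemHelper to turn an opaque cached object into bytes, instead of switching on concrete block types. A sketch of that handshake in isolation (SerializeForDump is an illustrative wrapper, not RocksDB API; the header defining Cache::CacheItemHelper can vary across RocksDB versions):

#include <string>

#include "rocksdb/advanced_cache.h"  // Cache::ObjectPtr, Cache::CacheItemHelper

using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::Status;

// Ask the helper for the persisted size, then have it fill a caller-owned
// buffer with the persistable form of the entry.
Status SerializeForDump(const Cache::CacheItemHelper* helper,
                        Cache::ObjectPtr value, std::string* out) {
  if (helper == nullptr || helper->size_cb == nullptr ||
      helper->saveto_cb == nullptr) {
    return Status::NotSupported("entry is not persistable");
  }
  size_t len = helper->size_cb(value);
  out->assign(len, '\0');
  // saveto_cb writes bytes [offset, offset + length) of the persisted form.
  return helper->saveto_cb(value, /*from_offset=*/0, /*length=*/len,
                           out->data());
}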
- while (io_s.ok() && dump_unit.type != CacheDumpUnitType::kFooter) { + while (io_s.ok()) { dump_unit.reset(); data.clear(); // read the content and store in the dump_unit @@ -289,74 +266,14 @@ IOStatus CacheDumpedLoaderImpl::RestoreCacheEntriesToSecondaryCache() { if (!io_s.ok()) { break; } + if (dump_unit.type == CacheDumpUnitType::kFooter) { + break; + } // Create the uncompressed_block based on the information in the dump_unit // (There is no block trailer here compatible with block-based SST file.) - BlockContents uncompressed_block( - Slice(static_cast(dump_unit.value), dump_unit.value_len)); - Cache::CacheItemHelper* helper = nullptr; - Statistics* statistics = nullptr; - Status s = Status::OK(); - // according to the block type, get the helper callback function and create - // the corresponding block - switch (dump_unit.type) { - case CacheDumpUnitType::kFilter: { - helper = BlocklikeTraits::GetCacheItemHelper( - BlockType::kFilter); - std::unique_ptr block_holder; - block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block), toptions_.read_amp_bytes_per_bit, - statistics, false, toptions_.filter_policy.get())); - if (helper != nullptr) { - s = secondary_cache_->Insert(dump_unit.key, - (void*)(block_holder.get()), helper); - } - break; - } - case CacheDumpUnitType::kData: { - helper = BlocklikeTraits::GetCacheItemHelper(BlockType::kData); - std::unique_ptr block_holder; - block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block), toptions_.read_amp_bytes_per_bit, - statistics, false, toptions_.filter_policy.get())); - if (helper != nullptr) { - s = secondary_cache_->Insert(dump_unit.key, - (void*)(block_holder.get()), helper); - } - break; - } - case CacheDumpUnitType::kIndex: { - helper = BlocklikeTraits::GetCacheItemHelper(BlockType::kIndex); - std::unique_ptr block_holder; - block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block), 0, statistics, false, - toptions_.filter_policy.get())); - if (helper != nullptr) { - s = secondary_cache_->Insert(dump_unit.key, - (void*)(block_holder.get()), helper); - } - break; - } - case CacheDumpUnitType::kFilterMetaBlock: { - helper = BlocklikeTraits::GetCacheItemHelper( - BlockType::kFilterPartitionIndex); - std::unique_ptr block_holder; - block_holder.reset(BlocklikeTraits::Create( - std::move(uncompressed_block), toptions_.read_amp_bytes_per_bit, - statistics, false, toptions_.filter_policy.get())); - if (helper != nullptr) { - s = secondary_cache_->Insert(dump_unit.key, - (void*)(block_holder.get()), helper); - } - break; - } - case CacheDumpUnitType::kFooter: - break; - case CacheDumpUnitType::kDeprecatedFilterBlock: - // Obsolete - break; - default: - continue; - } + Slice content = + Slice(static_cast(dump_unit.value), dump_unit.value_len); + Status s = secondary_cache_->InsertSaved(dump_unit.key, content); if (!s.ok()) { io_s = status_to_io_status(std::move(s)); } diff --git a/utilities/cache_dump_load_impl.h b/utilities/cache_dump_load_impl.h index f45b3360b1e..ad637b00dd7 100644 --- a/utilities/cache_dump_load_impl.h +++ b/utilities/cache_dump_load_impl.h @@ -12,11 +12,11 @@ #include "file/writable_file_writer.h" #include "rocksdb/utilities/cache_dump_load.h" #include "table/block_based/block.h" -#include "table/block_based/block_like_traits.h" #include "table/block_based/block_type.h" #include "table/block_based/cachable_entry.h" #include "table/block_based/parsed_full_filter_block.h" #include "table/block_based/reader_common.h" +#include "util/hash_containers.h" namespace 
ROCKSDB_NAMESPACE { @@ -108,13 +108,13 @@ class CacheDumperImpl : public CacheDumper { IOStatus WriteHeader(); IOStatus WriteFooter(); bool ShouldFilterOut(const Slice& key); - std::function - DumpOneBlockCallBack(); + std::function + DumpOneBlockCallBack(std::string& buf); CacheDumpOptions options_; std::shared_ptr cache_; std::unique_ptr writer_; - UnorderedMap role_map_; SystemClock* clock_; uint32_t sequence_num_; // The cache key prefix filter. Currently, we use db_session_id as the prefix, @@ -128,11 +128,10 @@ class CacheDumperImpl : public CacheDumper { class CacheDumpedLoaderImpl : public CacheDumpedLoader { public: CacheDumpedLoaderImpl(const CacheDumpOptions& dump_options, - const BlockBasedTableOptions& toptions, + const BlockBasedTableOptions& /*toptions*/, const std::shared_ptr& secondary_cache, std::unique_ptr&& reader) : options_(dump_options), - toptions_(toptions), secondary_cache_(secondary_cache), reader_(std::move(reader)) {} ~CacheDumpedLoaderImpl() {} @@ -145,10 +144,8 @@ class CacheDumpedLoaderImpl : public CacheDumpedLoader { IOStatus ReadCacheBlock(std::string* data, DumpUnit* dump_unit); CacheDumpOptions options_; - const BlockBasedTableOptions& toptions_; std::shared_ptr secondary_cache_; std::unique_ptr reader_; - UnorderedMap role_map_; }; // The default implementation of CacheDumpWriter. We write the blocks to a file diff --git a/utilities/cassandra/cassandra_compaction_filter.cc b/utilities/cassandra/cassandra_compaction_filter.cc index d59db47d418..4e48d63aabe 100644 --- a/utilities/cassandra/cassandra_compaction_filter.cc +++ b/utilities/cassandra/cassandra_compaction_filter.cc @@ -40,8 +40,8 @@ CompactionFilter::Decision CassandraCompactionFilter::FilterV2( const Slice& existing_value, std::string* new_value, std::string* /*skip_until*/) const { bool value_changed = false; - RowValue row_value = RowValue::Deserialize( - existing_value.data(), existing_value.size()); + RowValue row_value = + RowValue::Deserialize(existing_value.data(), existing_value.size()); RowValue compacted = options_.purge_ttl_on_expiration ? row_value.RemoveExpiredColumns(&value_changed) @@ -51,7 +51,7 @@ CompactionFilter::Decision CassandraCompactionFilter::FilterV2( compacted = compacted.RemoveTombstones(options_.gc_grace_period_in_seconds); } - if(compacted.Empty()) { + if (compacted.Empty()) { return Decision::kRemove; } diff --git a/utilities/cassandra/cassandra_compaction_filter.h b/utilities/cassandra/cassandra_compaction_filter.h index becadde3222..0325a4c3957 100644 --- a/utilities/cassandra/cassandra_compaction_filter.h +++ b/utilities/cassandra/cassandra_compaction_filter.h @@ -25,18 +25,18 @@ namespace cassandra { * promoted to kValue type after serials of merging in compaction. 
*/ class CassandraCompactionFilter : public CompactionFilter { -public: - explicit CassandraCompactionFilter(bool purge_ttl_on_expiration, - int32_t gc_grace_period_in_seconds); - static const char* kClassName() { return "CassandraCompactionFilter"; } - const char* Name() const override { return kClassName(); } - - virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, - const Slice& existing_value, std::string* new_value, - std::string* skip_until) const override; - -private: - CassandraOptions options_; + public: + explicit CassandraCompactionFilter(bool purge_ttl_on_expiration, + int32_t gc_grace_period_in_seconds); + static const char* kClassName() { return "CassandraCompactionFilter"; } + const char* Name() const override { return kClassName(); } + + virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* skip_until) const override; + + private: + CassandraOptions options_; }; class CassandraCompactionFilterFactory : public CompactionFilterFactory { diff --git a/utilities/cassandra/cassandra_format_test.cc b/utilities/cassandra/cassandra_format_test.cc index 62c6ae508bc..4f12947ad9c 100644 --- a/utilities/cassandra/cassandra_format_test.cc +++ b/utilities/cassandra/cassandra_format_test.cc @@ -5,12 +5,12 @@ #include #include + #include "test_util/testharness.h" #include "utilities/cassandra/format.h" #include "utilities/cassandra/serialize.h" #include "utilities/cassandra/test_utils.h" - namespace ROCKSDB_NAMESPACE { namespace cassandra { @@ -51,8 +51,8 @@ TEST(ColumnTest, Column) { c1->Serialize(&dest); EXPECT_EQ(dest.size(), 2 * c.Size()); - EXPECT_TRUE( - std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == 0); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == + 0); // Verify the ColumnBase::Deserialization. saved_dest = dest; @@ -60,9 +60,8 @@ TEST(ColumnTest, Column) { ColumnBase::Deserialize(saved_dest.c_str(), c.Size()); c2->Serialize(&dest); EXPECT_EQ(dest.size(), 3 * c.Size()); - EXPECT_TRUE( - std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, c.Size()) - == 0); + EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, + c.Size()) == 0); } TEST(ExpiringColumnTest, ExpiringColumn) { @@ -71,8 +70,8 @@ TEST(ExpiringColumnTest, ExpiringColumn) { int8_t index = 3; int64_t timestamp = 1494022807044; int32_t ttl = 3600; - ExpiringColumn c = ExpiringColumn(mask, index, timestamp, - sizeof(data), data, ttl); + ExpiringColumn c = + ExpiringColumn(mask, index, timestamp, sizeof(data), data, ttl); EXPECT_EQ(c.Index(), index); EXPECT_EQ(c.Timestamp(), timestamp); @@ -107,8 +106,8 @@ TEST(ExpiringColumnTest, ExpiringColumn) { c1->Serialize(&dest); EXPECT_EQ(dest.size(), 2 * c.Size()); - EXPECT_TRUE( - std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == 0); + EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == + 0); // Verify the ColumnBase::Deserialization. 
   saved_dest = dest;
@@ -116,23 +115,24 @@ TEST(ExpiringColumnTest, ExpiringColumn) {
       ColumnBase::Deserialize(saved_dest.c_str(), c.Size());
   c2->Serialize(&dest);
   EXPECT_EQ(dest.size(), 3 * c.Size());
-  EXPECT_TRUE(
-      std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, c.Size())
-      == 0);
+  EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+                          c.Size()) == 0);
 }

 TEST(TombstoneTest, TombstoneCollectable) {
   int32_t now = (int32_t)time(nullptr);
   int32_t gc_grace_seconds = 16440;
   int32_t time_delta_seconds = 10;
-  EXPECT_TRUE(Tombstone(ColumnTypeMask::DELETION_MASK, 0,
-                        now - gc_grace_seconds - time_delta_seconds,
-                        ToMicroSeconds(now - gc_grace_seconds - time_delta_seconds))
-              .Collectable(gc_grace_seconds));
-  EXPECT_FALSE(Tombstone(ColumnTypeMask::DELETION_MASK, 0,
-                         now - gc_grace_seconds + time_delta_seconds,
-                         ToMicroSeconds(now - gc_grace_seconds + time_delta_seconds))
-               .Collectable(gc_grace_seconds));
+  EXPECT_TRUE(
+      Tombstone(ColumnTypeMask::DELETION_MASK, 0,
+                now - gc_grace_seconds - time_delta_seconds,
+                ToMicroSeconds(now - gc_grace_seconds - time_delta_seconds))
+          .Collectable(gc_grace_seconds));
+  EXPECT_FALSE(
+      Tombstone(ColumnTypeMask::DELETION_MASK, 0,
+                now - gc_grace_seconds + time_delta_seconds,
+                ToMicroSeconds(now - gc_grace_seconds + time_delta_seconds))
+          .Collectable(gc_grace_seconds));
 }

 TEST(TombstoneTest, Tombstone) {
@@ -140,8 +140,8 @@ TEST(TombstoneTest, Tombstone) {
   int8_t index = 2;
   int32_t local_deletion_time = 1494022807;
   int64_t marked_for_delete_at = 1494022807044;
-  Tombstone c = Tombstone(mask, index, local_deletion_time,
-    marked_for_delete_at);
+  Tombstone c =
+      Tombstone(mask, index, local_deletion_time, marked_for_delete_at);

   EXPECT_EQ(c.Index(), index);
   EXPECT_EQ(c.Timestamp(), marked_for_delete_at);
@@ -170,17 +170,16 @@ TEST(TombstoneTest, Tombstone) {
   c1->Serialize(&dest);
   EXPECT_EQ(dest.size(), 2 * c.Size());
-  EXPECT_TRUE(
-      std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) == 0);
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + c.Size(), c.Size()) ==
+              0);

   // Verify the ColumnBase::Deserialization.
   std::shared_ptr<ColumnBase> c2 =
-    ColumnBase::Deserialize(dest.c_str(), c.Size());
+      ColumnBase::Deserialize(dest.c_str(), c.Size());
   c2->Serialize(&dest);
   EXPECT_EQ(dest.size(), 3 * c.Size());
-  EXPECT_TRUE(
-      std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2, c.Size())
-      == 0);
+  EXPECT_TRUE(std::memcmp(dest.c_str() + c.Size(), dest.c_str() + c.Size() * 2,
+                          c.Size()) == 0);
 }

 class RowValueTest : public testing::Test {};
@@ -213,8 +212,8 @@ TEST(RowValueTest, RowTombstone) {
   r1.Serialize(&dest);
   EXPECT_EQ(dest.size(), 2 * r.Size());
-  EXPECT_TRUE(
-      std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) == 0);
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) ==
+              0);
 }

 TEST(RowValueTest, RowWithColumns) {
@@ -227,23 +226,23 @@ TEST(RowValueTest, RowWithColumns) {
   int64_t e_timestamp = 1494022807044;
   int32_t e_ttl = 3600;
   columns.push_back(std::shared_ptr<ColumnBase>(
-    new ExpiringColumn(ColumnTypeMask::EXPIRATION_MASK, e_index,
-      e_timestamp, sizeof(e_data), e_data, e_ttl)));
+      new ExpiringColumn(ColumnTypeMask::EXPIRATION_MASK, e_index, e_timestamp,
+                         sizeof(e_data), e_data, e_ttl)));
   columns_data_size += columns[0]->Size();

   char c_data[4] = {'d', 'a', 't', 'a'};
   int8_t c_index = 1;
   int64_t c_timestamp = 1494022807048;
   columns.push_back(std::shared_ptr<ColumnBase>(
-    new Column(0, c_index, c_timestamp, sizeof(c_data), c_data)));
+      new Column(0, c_index, c_timestamp, sizeof(c_data), c_data)));
   columns_data_size += columns[1]->Size();

   int8_t t_index = 2;
   int32_t t_local_deletion_time = 1494022801;
   int64_t t_marked_for_delete_at = 1494022807043;
   columns.push_back(std::shared_ptr<ColumnBase>(
-    new Tombstone(ColumnTypeMask::DELETION_MASK,
-      t_index, t_local_deletion_time, t_marked_for_delete_at)));
+      new Tombstone(ColumnTypeMask::DELETION_MASK, t_index,
+                    t_local_deletion_time, t_marked_for_delete_at)));
   columns_data_size += columns[2]->Size();

   RowValue r = RowValue(std::move(columns), last_modified_time);
@@ -260,15 +259,15 @@ TEST(RowValueTest, RowWithColumns) {
   EXPECT_EQ(dest.size(), r.Size());
   std::size_t offset = 0;
   EXPECT_EQ(Deserialize<int32_t>(dest.c_str(), offset),
-      std::numeric_limits<int32_t>::max());
+            std::numeric_limits<int32_t>::max());
   offset += sizeof(int32_t);
   EXPECT_EQ(Deserialize<int64_t>(dest.c_str(), offset),
-      std::numeric_limits<int64_t>::min());
+            std::numeric_limits<int64_t>::min());
   offset += sizeof(int64_t);

   // Column0: ExpiringColumn
   EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset),
-      ColumnTypeMask::EXPIRATION_MASK);
+            ColumnTypeMask::EXPIRATION_MASK);
   offset += sizeof(int8_t);
   EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), e_index);
   offset += sizeof(int8_t);
@@ -295,7 +294,7 @@ TEST(RowValueTest, RowWithColumns) {

   // Column2: Tombstone
   EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset),
-      ColumnTypeMask::DELETION_MASK);
+            ColumnTypeMask::DELETION_MASK);
   offset += sizeof(int8_t);
   EXPECT_EQ(Deserialize<int8_t>(dest.c_str(), offset), t_index);
   offset += sizeof(int8_t);
@@ -311,19 +310,20 @@ TEST(RowValueTest, RowWithColumns) {
   r1.Serialize(&dest);
   EXPECT_EQ(dest.size(), 2 * r.Size());
-  EXPECT_TRUE(
-      std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) == 0);
+  EXPECT_TRUE(std::memcmp(dest.c_str(), dest.c_str() + r.Size(), r.Size()) ==
+              0);
 }

 TEST(RowValueTest, PurgeTtlShouldRemvoeAllColumnsExpired) {
   int64_t now = time(nullptr);

-  auto row_value = CreateTestRowValue({
-    CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
-    CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now - kTtl - 10)), //expired
-    CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now)), // not expired
-    CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))
-  });
+  auto row_value = CreateTestRowValue(
+      {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
+       CreateTestColumnSpec(kExpiringColumn, 1,
+                            ToMicroSeconds(now - kTtl - 10)),  // expired
+       CreateTestColumnSpec(kExpiringColumn, 2,
+                            ToMicroSeconds(now)),  // not expired
+       CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))});

   bool changed = false;
   auto purged = row_value.RemoveExpiredColumns(&changed);
@@ -343,12 +343,13 @@ TEST(RowValueTest, PurgeTtlShouldRemvoeAllColumnsExpired) {

 TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) {
   int64_t now = time(nullptr);

-  auto row_value = CreateTestRowValue({
-    CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
-    CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now - kTtl - 10)), //expired
-    CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now)), // not expired
-    CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))
-  });
+  auto row_value = CreateTestRowValue(
+      {CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now)),
+       CreateTestColumnSpec(kExpiringColumn, 1,
+                            ToMicroSeconds(now - kTtl - 10)),  // expired
+       CreateTestColumnSpec(kExpiringColumn, 2,
+                            ToMicroSeconds(now)),  // not expired
+       CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))});

   bool changed = false;
   auto compacted = row_value.ConvertExpiredColumnsToTombstones(&changed);
@@ -366,7 +367,7 @@ TEST(RowValueTest, ExpireTtlShouldConvertExpiredColumnsToTombstones) {
   compacted.ConvertExpiredColumnsToTombstones(&changed);
   EXPECT_FALSE(changed);
 }
-} // namespace cassandra
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc
index 17ad97262c8..c5be836e8f5 100644
--- a/utilities/cassandra/cassandra_functional_test.cc
+++ b/utilities/cassandra/cassandra_functional_test.cc
@@ -18,7 +18,6 @@
 #include "utilities/cassandra/test_utils.h"
 #include "utilities/merge_operators.h"
-
 namespace ROCKSDB_NAMESPACE {
 namespace cassandra {
@@ -32,7 +31,7 @@ class CassandraStore {
     assert(db);
   }

-  bool Append(const std::string& key, const RowValue& val){
+  bool Append(const std::string& key, const RowValue& val) {
     std::string result;
     val.Serialize(&result);
     Slice valSlice(result.data(), result.size());
@@ -72,14 +71,13 @@ class CassandraStore {
                        db_->DefaultColumnFamily());
   }

-  std::tuple<bool, RowValue> Get(const std::string& key){
+  std::tuple<bool, RowValue> Get(const std::string& key) {
     std::string result;
     auto s = db_->Get(get_option_, key, &result);

     if (s.ok()) {
-      return std::make_tuple(true,
-                             RowValue::Deserialize(result.data(),
-                                                   result.size()));
+      return std::make_tuple(
+          true, RowValue::Deserialize(result.data(), result.size()));
     }

     if (!s.IsNotFound()) {
@@ -98,29 +96,28 @@ class CassandraStore {
 };

 class TestCompactionFilterFactory : public CompactionFilterFactory {
-public:
-  explicit TestCompactionFilterFactory(bool purge_ttl_on_expiration,
-                                       int32_t gc_grace_period_in_seconds)
-      : purge_ttl_on_expiration_(purge_ttl_on_expiration),
-        gc_grace_period_in_seconds_(gc_grace_period_in_seconds) {}
-
-  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& /*context*/) override {
-    return std::unique_ptr<CompactionFilter>(new CassandraCompactionFilter(
-        purge_ttl_on_expiration_, gc_grace_period_in_seconds_));
-  }
+ public:
+  explicit TestCompactionFilterFactory(bool purge_ttl_on_expiration,
+                                       int32_t gc_grace_period_in_seconds)
+      : purge_ttl_on_expiration_(purge_ttl_on_expiration),
+        gc_grace_period_in_seconds_(gc_grace_period_in_seconds) {}
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
+    return std::unique_ptr<CompactionFilter>(new CassandraCompactionFilter(
+        purge_ttl_on_expiration_, gc_grace_period_in_seconds_));
+  }

-  const char* Name() const override { return "TestCompactionFilterFactory"; }
+  const char* Name() const override { return "TestCompactionFilterFactory"; }

-private:
+ private:
   bool purge_ttl_on_expiration_;
   int32_t gc_grace_period_in_seconds_;
 };

-
 // The class for unit-testing
 class CassandraFunctionalTest : public testing::Test {
-public:
+ public:
   CassandraFunctionalTest() {
     EXPECT_OK(
         DestroyDB(kDbName, Options()));  // Start each test with a fresh DB
@@ -130,7 +127,8 @@ class CassandraFunctionalTest : public testing::Test {
     DB* db;
     Options options;
     options.create_if_missing = true;
-    options.merge_operator.reset(new CassandraValueMergeOperator(gc_grace_period_in_seconds_));
+    options.merge_operator.reset(
+        new CassandraValueMergeOperator(gc_grace_period_in_seconds_));
     auto* cf_factory = new TestCompactionFilterFactory(
         purge_ttl_on_expiration_, gc_grace_period_in_seconds_);
     options.compaction_filter_factory.reset(cf_factory);
@@ -148,23 +146,29 @@ TEST_F(CassandraFunctionalTest, SimpleMergeTest) {
   CassandraStore store(OpenDb());
   int64_t now = time(nullptr);

-  store.Append("k1", CreateTestRowValue({
-    CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now + 5)),
-    CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now + 8)),
-    CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now + 5)),
-  }));
-  store.Append("k1",CreateTestRowValue({
-    CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now + 2)),
-    CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now + 5)),
-    CreateTestColumnSpec(kTombstone, 2, ToMicroSeconds(now + 7)),
-    CreateTestColumnSpec(kExpiringColumn, 7, ToMicroSeconds(now + 17)),
-  }));
-  store.Append("k1", CreateTestRowValue({
-    CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now + 6)),
-    CreateTestColumnSpec(kTombstone, 1, ToMicroSeconds(now + 5)),
-    CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now + 4)),
-    CreateTestColumnSpec(kTombstone, 11, ToMicroSeconds(now + 11)),
-  }));
+  store.Append(
+      "k1",
+      CreateTestRowValue({
+          CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now + 5)),
+          CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now + 8)),
+          CreateTestColumnSpec(kExpiringColumn, 2, ToMicroSeconds(now + 5)),
+      }));
+  store.Append(
+      "k1",
+      CreateTestRowValue({
+          CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now + 2)),
+          CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now + 5)),
+          CreateTestColumnSpec(kTombstone, 2, ToMicroSeconds(now + 7)),
+          CreateTestColumnSpec(kExpiringColumn, 7, ToMicroSeconds(now + 17)),
+      }));
+  store.Append(
+      "k1",
+      CreateTestRowValue({
+          CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now + 6)),
+          CreateTestColumnSpec(kTombstone, 1, ToMicroSeconds(now + 5)),
+          CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now + 4)),
+          CreateTestColumnSpec(kTombstone, 11, ToMicroSeconds(now + 11)),
+      }));

   auto ret = store.Get("k1");
@@ -188,7 +192,7 @@ constexpr int64_t kTestTimeoutSecs = 600;
 TEST_F(CassandraFunctionalTest,
        CompactionShouldConvertExpiredColumnsToTombstone) {
   CassandraStore store(OpenDb());
-  int64_t now= time(nullptr);
+  int64_t now = time(nullptr);

   store.Append(
       "k1",
@@ -202,10 +206,12 @@ TEST_F(CassandraFunctionalTest,

   ASSERT_OK(store.Flush());

-  store.Append("k1",CreateTestRowValue({
-    CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 10)), //expired
-    CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))
-  }));
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 10)),  // expired
+           CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))}));

   ASSERT_OK(store.Flush());
   ASSERT_OK(store.Compact());
@@ -224,25 +230,29 @@ TEST_F(CassandraFunctionalTest,
                          ToMicroSeconds(now));
 }

-
 TEST_F(CassandraFunctionalTest,
        CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) {
   purge_ttl_on_expiration_ = true;
   CassandraStore store(OpenDb());
   int64_t now = time(nullptr);

-  store.Append("k1", CreateTestRowValue({
-    CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 20)), //expired
-    CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now)), // not expired
-    CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))
-  }));
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 20)),  // expired
+           CreateTestColumnSpec(kExpiringColumn, 1,
+                                ToMicroSeconds(now)),  // not expired
+           CreateTestColumnSpec(kTombstone, 3, ToMicroSeconds(now))}));

   ASSERT_OK(store.Flush());

-  store.Append("k1",CreateTestRowValue({
-    CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 10)), //expired
-    CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))
-  }));
+  store.Append(
+      "k1",
+      CreateTestRowValue(
+          {CreateTestColumnSpec(kExpiringColumn, 0,
+                                ToMicroSeconds(now - kTtl - 10)),  // expired
+           CreateTestColumnSpec(kColumn, 2, ToMicroSeconds(now))}));

   ASSERT_OK(store.Flush());
   ASSERT_OK(store.Compact());
@@ -266,15 +276,18 @@ TEST_F(CassandraFunctionalTest,
   int64_t now = time(nullptr);

   store.Append("k1", CreateTestRowValue({
-    CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 20)),
-    CreateTestColumnSpec(kExpiringColumn, 1, ToMicroSeconds(now - kTtl - 20)),
-  }));
+                         CreateTestColumnSpec(kExpiringColumn, 0,
+                                              ToMicroSeconds(now - kTtl - 20)),
+                         CreateTestColumnSpec(kExpiringColumn, 1,
+                                              ToMicroSeconds(now - kTtl - 20)),
+                     }));

   ASSERT_OK(store.Flush());

-  store.Append("k1",CreateTestRowValue({
-    CreateTestColumnSpec(kExpiringColumn, 0, ToMicroSeconds(now - kTtl - 10)),
-  }));
+  store.Append("k1", CreateTestRowValue({
+                         CreateTestColumnSpec(kExpiringColumn, 0,
+                                              ToMicroSeconds(now - kTtl - 10)),
+                     }));

   ASSERT_OK(store.Flush());
   ASSERT_OK(store.Compact());
@@ -287,20 +300,21 @@ TEST_F(CassandraFunctionalTest,
   CassandraStore store(OpenDb());
   int64_t now = time(nullptr);

-  store.Append("k1", CreateTestRowValue({
-    CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
-    CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now))
-  }));
+  store.Append("k1",
+               CreateTestRowValue(
+                   {CreateTestColumnSpec(
+                        kTombstone, 0,
+                        ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
+                    CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now))}));

-  store.Append("k2", CreateTestRowValue({
-    CreateTestColumnSpec(kColumn, 0, ToMicroSeconds(now))
-  }));
+  store.Append("k2", CreateTestRowValue({CreateTestColumnSpec(
+                         kColumn, 0, ToMicroSeconds(now))}));

   ASSERT_OK(store.Flush());

-  store.Append("k1",CreateTestRowValue({
-    CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now)),
-  }));
+  store.Append("k1", CreateTestRowValue({
+                         CreateTestColumnSpec(kColumn, 1, ToMicroSeconds(now)),
+                     }));

   ASSERT_OK(store.Flush());
   ASSERT_OK(store.Compact());
@@ -317,9 +331,12 @@ TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) {
   CassandraStore store(OpenDb());
   int64_t now = time(nullptr);

-  store.Put("k1", CreateTestRowValue({
-    CreateTestColumnSpec(kTombstone, 0, ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
-  }));
+  store.Put("k1",
+            CreateTestRowValue({
+                CreateTestColumnSpec(
+                    kTombstone, 0,
+                    ToMicroSeconds(now - gc_grace_period_in_seconds_ - 1)),
+            }));

   ASSERT_OK(store.Flush());
   ASSERT_OK(store.Compact());
@@ -419,7 +436,7 @@ TEST_F(CassandraFunctionalTest, LoadCompactionFilterFactory) {
 }
 #endif  // ROCKSDB_LITE

-} // namespace cassandra
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
diff --git a/utilities/cassandra/cassandra_row_merge_test.cc b/utilities/cassandra/cassandra_row_merge_test.cc
index 6f6b7514be0..0b4a8928717 100644
--- a/utilities/cassandra/cassandra_row_merge_test.cc
+++ b/utilities/cassandra/cassandra_row_merge_test.cc
@@ -4,6 +4,7 @@
 // (found in the LICENSE.Apache file in the root directory).

 #include <memory>
+
 #include "test_util/testharness.h"
 #include "utilities/cassandra/format.h"
 #include "utilities/cassandra/test_utils.h"
@@ -15,31 +16,25 @@ class RowValueMergeTest : public testing::Test {};

 TEST(RowValueMergeTest, Merge) {
   std::vector<RowValue> row_values;
-  row_values.push_back(
-    CreateTestRowValue({
+  row_values.push_back(CreateTestRowValue({
       CreateTestColumnSpec(kTombstone, 0, 5),
       CreateTestColumnSpec(kColumn, 1, 8),
       CreateTestColumnSpec(kExpiringColumn, 2, 5),
-    })
-  );
+  }));

-  row_values.push_back(
-    CreateTestRowValue({
+  row_values.push_back(CreateTestRowValue({
       CreateTestColumnSpec(kColumn, 0, 2),
       CreateTestColumnSpec(kExpiringColumn, 1, 5),
       CreateTestColumnSpec(kTombstone, 2, 7),
       CreateTestColumnSpec(kExpiringColumn, 7, 17),
-    })
-  );
+  }));

-  row_values.push_back(
-    CreateTestRowValue({
+  row_values.push_back(CreateTestRowValue({
      CreateTestColumnSpec(kExpiringColumn, 0, 6),
      CreateTestColumnSpec(kTombstone, 1, 5),
      CreateTestColumnSpec(kColumn, 2, 4),
      CreateTestColumnSpec(kTombstone, 11, 11),
-    })
-  );
+  }));

   RowValue merged = RowValue::Merge(std::move(row_values));
   EXPECT_FALSE(merged.IsTombstone());
@@ -55,33 +50,25 @@ TEST(RowValueMergeTest, MergeWithRowTombstone) {
   std::vector<RowValue> row_values;

   // A row tombstone.
-  row_values.push_back(
-    CreateRowTombstone(11)
-  );
+  row_values.push_back(CreateRowTombstone(11));

   // This row's timestamp is smaller than tombstone.
-  row_values.push_back(
-    CreateTestRowValue({
+  row_values.push_back(CreateTestRowValue({
       CreateTestColumnSpec(kColumn, 0, 5),
       CreateTestColumnSpec(kColumn, 1, 6),
-    })
-  );
+  }));

   // Some of the column's row is smaller, some is larger.
-  row_values.push_back(
-    CreateTestRowValue({
+  row_values.push_back(CreateTestRowValue({
       CreateTestColumnSpec(kColumn, 2, 10),
       CreateTestColumnSpec(kColumn, 3, 12),
-    })
-  );
+  }));

   // All of the column's rows are larger than tombstone.
-  row_values.push_back(
-    CreateTestRowValue({
+  row_values.push_back(CreateTestRowValue({
       CreateTestColumnSpec(kColumn, 4, 13),
       CreateTestColumnSpec(kColumn, 5, 14),
-    })
-  );
+  }));

   RowValue merged = RowValue::Merge(std::move(row_values));
   EXPECT_FALSE(merged.IsTombstone());
@@ -92,20 +79,16 @@ TEST(RowValueMergeTest, MergeWithRowTombstone) {

   // If the tombstone's timestamp is the latest, then it returns a
   // row tombstone.
-  row_values.push_back(
-    CreateRowTombstone(15)
-  );
+  row_values.push_back(CreateRowTombstone(15));

-  row_values.push_back(
-    CreateRowTombstone(17)
-  );
+  row_values.push_back(CreateRowTombstone(17));

   merged = RowValue::Merge(std::move(row_values));
   EXPECT_TRUE(merged.IsTombstone());
   EXPECT_EQ(merged.LastModifiedTime(), 17);
 }

-} // namespace cassandra
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
diff --git a/utilities/cassandra/cassandra_serialize_test.cc b/utilities/cassandra/cassandra_serialize_test.cc
index bd5932c9336..c14d8fd809d 100644
--- a/utilities/cassandra/cassandra_serialize_test.cc
+++ b/utilities/cassandra/cassandra_serialize_test.cc
@@ -6,46 +6,39 @@
 #include "test_util/testharness.h"
 #include "utilities/cassandra/serialize.h"

-
 namespace ROCKSDB_NAMESPACE {
 namespace cassandra {

 TEST(SerializeTest, SerializeI64) {
   std::string dest;
   Serialize<int64_t>(0, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}),
-      dest);
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+                         '\x00'}),
+            dest);

   dest.clear();
   Serialize<int64_t>(1, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x01'}),
-      dest);
-
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+                         '\x01'}),
+            dest);

   dest.clear();
   Serialize<int64_t>(-1, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'}),
-      dest);
+  EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
+                         '\xff'}),
+            dest);

   dest.clear();
   Serialize<int64_t>(9223372036854775807, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x7f', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff'}),
-      dest);
+  EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff', '\xff', '\xff', '\xff',
+                         '\xff'}),
+            dest);

   dest.clear();
   Serialize<int64_t>(-9223372036854775807, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x80', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x01'}),
-      dest);
+  EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00',
+                         '\x01'}),
+            dest);
 }

 TEST(SerializeTest, DeserializeI64) {
@@ -74,39 +67,23 @@ TEST(SerializeTest, SerializeI32) {
   std::string dest;
   Serialize<int32_t>(0, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x00', '\x00', '\x00', '\x00'}),
-      dest);
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x00'}), dest);

   dest.clear();
   Serialize<int32_t>(1, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x00', '\x00', '\x00', '\x01'}),
-      dest);
-
+  EXPECT_EQ(std::string({'\x00', '\x00', '\x00', '\x01'}), dest);

   dest.clear();
   Serialize<int32_t>(-1, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\xff', '\xff', '\xff', '\xff'}),
-      dest);
+  EXPECT_EQ(std::string({'\xff', '\xff', '\xff', '\xff'}), dest);

   dest.clear();
   Serialize<int32_t>(2147483647, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x7f', '\xff', '\xff', '\xff'}),
-      dest);
+  EXPECT_EQ(std::string({'\x7f', '\xff', '\xff', '\xff'}), dest);

   dest.clear();
   Serialize<int32_t>(-2147483648LL, &dest);
-  EXPECT_EQ(
-      std::string(
-          {'\x80', '\x00', '\x00', '\x00'}),
-      dest);
+  EXPECT_EQ(std::string({'\x80', '\x00', '\x00', '\x00'}), dest);
 }

 TEST(SerializeTest, DeserializeI32) {
@@ -141,7 +118,6 @@ TEST(SerializeTest, SerializeI8) {
   Serialize<int8_t>(1, &dest);
   EXPECT_EQ(std::string({'\x01'}), dest);
-
   dest.clear();
   Serialize<int8_t>(-1, &dest);
   EXPECT_EQ(std::string({'\xff'}), dest);
@@ -178,7 +154,7 @@ TEST(SerializeTest, DeserializeI8) {
   EXPECT_EQ(-128, Deserialize<int8_t>(dest.c_str(), offset));
 }

-} // namespace cassandra
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
diff --git a/utilities/cassandra/format.cc b/utilities/cassandra/format.cc
index a767f41e791..cc1dd2f2803 100644
--- a/utilities/cassandra/format.cc
+++ b/utilities/cassandra/format.cc
@@ -14,26 +14,18 @@ namespace ROCKSDB_NAMESPACE {
 namespace cassandra {
 namespace {
-const int32_t kDefaultLocalDeletionTime =
-    std::numeric_limits<int32_t>::max();
-const int64_t kDefaultMarkedForDeleteAt =
-    std::numeric_limits<int64_t>::min();
-}
+const int32_t kDefaultLocalDeletionTime = std::numeric_limits<int32_t>::max();
+const int64_t kDefaultMarkedForDeleteAt = std::numeric_limits<int64_t>::min();
+}  // namespace

 ColumnBase::ColumnBase(int8_t mask, int8_t index)
-  : mask_(mask), index_(index) {}
+    : mask_(mask), index_(index) {}

-std::size_t ColumnBase::Size() const {
-  return sizeof(mask_) + sizeof(index_);
-}
+std::size_t ColumnBase::Size() const { return sizeof(mask_) + sizeof(index_); }

-int8_t ColumnBase::Mask() const {
-  return mask_;
-}
+int8_t ColumnBase::Mask() const { return mask_; }

-int8_t ColumnBase::Index() const {
-  return index_;
-}
+int8_t ColumnBase::Index() const { return index_; }

 void ColumnBase::Serialize(std::string* dest) const {
   ROCKSDB_NAMESPACE::cassandra::Serialize(mask_, dest);
@@ -52,22 +44,18 @@ std::shared_ptr<ColumnBase> ColumnBase::Deserialize(const char* src,
   }
 }

-Column::Column(
-  int8_t mask,
-  int8_t index,
-  int64_t timestamp,
-  int32_t value_size,
-  const char* value
-) : ColumnBase(mask, index), timestamp_(timestamp),
-    value_size_(value_size), value_(value) {}
-
-int64_t Column::Timestamp() const {
-  return timestamp_;
-}
+Column::Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size,
+               const char* value)
+    : ColumnBase(mask, index),
+      timestamp_(timestamp),
+      value_size_(value_size),
+      value_(value) {}
+
+int64_t Column::Timestamp() const { return timestamp_; }

 std::size_t Column::Size() const {
-  return ColumnBase::Size() + sizeof(timestamp_) + sizeof(value_size_)
-    + value_size_;
+  return ColumnBase::Size() + sizeof(timestamp_) + sizeof(value_size_) +
+         value_size_;
 }

 void Column::Serialize(std::string* dest) const {
@@ -77,7 +65,7 @@ void Column::Serialize(std::string* dest) const {
   dest->append(value_, value_size_);
 }

-std::shared_ptr<Column> Column::Deserialize(const char *src,
+std::shared_ptr<Column> Column::Deserialize(const char* src,
                                             std::size_t offset) {
   int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
   offset += sizeof(mask);
@@ -89,19 +77,14 @@ std::shared_ptr<Column> Column::Deserialize(const char *src,
   int32_t value_size =
       ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
   offset += sizeof(value_size);
-  return std::make_shared<Column>(
-    mask, index, timestamp, value_size, src + offset);
+  return std::make_shared<Column>(mask, index, timestamp, value_size,
+                                  src + offset);
 }

-ExpiringColumn::ExpiringColumn(
-  int8_t mask,
-  int8_t index,
-  int64_t timestamp,
-  int32_t value_size,
-  const char* value,
-  int32_t ttl
-) : Column(mask, index, timestamp, value_size, value),
-    ttl_(ttl) {}
+ExpiringColumn::ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp,
+                               int32_t value_size, const char* value,
+                               int32_t ttl)
+    : Column(mask, index, timestamp, value_size, value), ttl_(ttl) {}

 std::size_t ExpiringColumn::Size() const {
   return Column::Size() + sizeof(ttl_);
@@ -112,8 +95,10 @@ void ExpiringColumn::Serialize(std::string* dest) const {
   ROCKSDB_NAMESPACE::cassandra::Serialize(ttl_, dest);
 }

-std::chrono::time_point<std::chrono::system_clock> ExpiringColumn::TimePoint() const {
-  return std::chrono::time_point<std::chrono::system_clock>(std::chrono::microseconds(Timestamp()));
+std::chrono::time_point<std::chrono::system_clock> ExpiringColumn::TimePoint()
+    const {
+  return std::chrono::time_point<std::chrono::system_clock>(
+      std::chrono::microseconds(Timestamp()));
 }

 std::chrono::seconds ExpiringColumn::Ttl() const {
@@ -127,19 +112,16 @@ bool ExpiringColumn::Expired() const {
 std::shared_ptr<Tombstone> ExpiringColumn::ToTombstone() const {
   auto expired_at = (TimePoint() + Ttl()).time_since_epoch();
   int32_t local_deletion_time = static_cast<int32_t>(
-    std::chrono::duration_cast<std::chrono::seconds>(expired_at).count());
+      std::chrono::duration_cast<std::chrono::seconds>(expired_at).count());
   int64_t marked_for_delete_at =
-    std::chrono::duration_cast<std::chrono::microseconds>(expired_at).count();
+      std::chrono::duration_cast<std::chrono::microseconds>(expired_at).count();
   return std::make_shared<Tombstone>(
-    static_cast<int8_t>(ColumnTypeMask::DELETION_MASK),
-    Index(),
-    local_deletion_time,
-    marked_for_delete_at);
+      static_cast<int8_t>(ColumnTypeMask::DELETION_MASK), Index(),
+      local_deletion_time, marked_for_delete_at);
 }

 std::shared_ptr<ExpiringColumn> ExpiringColumn::Deserialize(
-    const char *src,
-    std::size_t offset) {
+    const char* src, std::size_t offset) {
   int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
   offset += sizeof(mask);
@@ -153,25 +135,21 @@ std::shared_ptr<ExpiringColumn> ExpiringColumn::Deserialize(
   const char* value = src + offset;
   offset += value_size;
   int32_t ttl = ROCKSDB_NAMESPACE::cassandra::Deserialize<int32_t>(src, offset);
-  return std::make_shared<ExpiringColumn>(
-    mask, index, timestamp, value_size, value, ttl);
+  return std::make_shared<ExpiringColumn>(mask, index, timestamp, value_size,
+                                          value, ttl);
 }

-Tombstone::Tombstone(
-  int8_t mask,
-  int8_t index,
-  int32_t local_deletion_time,
-  int64_t marked_for_delete_at
-) : ColumnBase(mask, index), local_deletion_time_(local_deletion_time),
-    marked_for_delete_at_(marked_for_delete_at) {}
+Tombstone::Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time,
+                     int64_t marked_for_delete_at)
+    : ColumnBase(mask, index),
+      local_deletion_time_(local_deletion_time),
+      marked_for_delete_at_(marked_for_delete_at) {}

-int64_t Tombstone::Timestamp() const {
-  return marked_for_delete_at_;
-}
+int64_t Tombstone::Timestamp() const { return marked_for_delete_at_; }

 std::size_t Tombstone::Size() const {
-  return ColumnBase::Size() + sizeof(local_deletion_time_)
-    + sizeof(marked_for_delete_at_);
+  return ColumnBase::Size() + sizeof(local_deletion_time_) +
+         sizeof(marked_for_delete_at_);
 }

 void Tombstone::Serialize(std::string* dest) const {
@@ -187,7 +165,7 @@ bool Tombstone::Collectable(int32_t gc_grace_period_in_seconds) const {
   return local_deleted_at + gc_grace_period < std::chrono::system_clock::now();
 }

-std::shared_ptr<Tombstone> Tombstone::Deserialize(const char *src,
+std::shared_ptr<Tombstone> Tombstone::Deserialize(const char* src,
                                                   std::size_t offset) {
   int8_t mask = ROCKSDB_NAMESPACE::cassandra::Deserialize<int8_t>(src, offset);
   offset += sizeof(mask);
@@ -198,26 +176,27 @@ std::shared_ptr<Tombstone> Tombstone::Deserialize(const char *src,
   offset += sizeof(int32_t);
   int64_t marked_for_delete_at =
       ROCKSDB_NAMESPACE::cassandra::Deserialize<int64_t>(src, offset);
-  return std::make_shared<Tombstone>(
-    mask, index, local_deletion_time, marked_for_delete_at);
+  return std::make_shared<Tombstone>(mask, index, local_deletion_time,
+                                     marked_for_delete_at);
 }

 RowValue::RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at)
-  : local_deletion_time_(local_deletion_time),
-    marked_for_delete_at_(marked_for_delete_at), columns_(),
-    last_modified_time_(0) {}
+    : local_deletion_time_(local_deletion_time),
+      marked_for_delete_at_(marked_for_delete_at),
+      columns_(),
+      last_modified_time_(0) {}

-RowValue::RowValue(Columns columns,
-                   int64_t last_modified_time)
-  : local_deletion_time_(kDefaultLocalDeletionTime),
-    marked_for_delete_at_(kDefaultMarkedForDeleteAt),
-    columns_(std::move(columns)), last_modified_time_(last_modified_time) {}
+RowValue::RowValue(Columns columns, int64_t last_modified_time)
+    : local_deletion_time_(kDefaultLocalDeletionTime),
+      marked_for_delete_at_(kDefaultMarkedForDeleteAt),
+      columns_(std::move(columns)),
+      last_modified_time_(last_modified_time) {}

 std::size_t RowValue::Size() const {
-  std::size_t size = sizeof(local_deletion_time_)
-    + sizeof(marked_for_delete_at_);
+  std::size_t size =
+      sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_);
   for (const auto& column : columns_) {
-    size += column -> Size();
+    size += column->Size();
   }
   return size;
 }
@@ -238,7 +217,7 @@ void RowValue::Serialize(std::string* dest) const {
   ROCKSDB_NAMESPACE::cassandra::Serialize(local_deletion_time_, dest);
   ROCKSDB_NAMESPACE::cassandra::Serialize(marked_for_delete_at_, dest);
   for (const auto& column : columns_) {
-    column -> Serialize(dest);
+    column->Serialize(dest);
   }
 }

@@ -246,11 +225,11 @@ RowValue RowValue::RemoveExpiredColumns(bool* changed) const {
   *changed = false;
   Columns new_columns;
   for (auto& column : columns_) {
-    if(column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
+    if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
       std::shared_ptr<ExpiringColumn> expiring_column =
-        std::static_pointer_cast<ExpiringColumn>(column);
+          std::static_pointer_cast<ExpiringColumn>(column);

-      if(expiring_column->Expired()){
+      if (expiring_column->Expired()) {
         *changed = true;
         continue;
       }
@@ -265,11 +244,11 @@ RowValue RowValue::ConvertExpiredColumnsToTombstones(bool* changed) const {
   *changed = false;
   Columns new_columns;
   for (auto& column : columns_) {
-    if(column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
+    if (column->Mask() == ColumnTypeMask::EXPIRATION_MASK) {
       std::shared_ptr<ExpiringColumn> expiring_column =
-        std::static_pointer_cast<ExpiringColumn>(column);
+          std::static_pointer_cast<ExpiringColumn>(column);

-      if(expiring_column->Expired()) {
+      if (expiring_column->Expired()) {
         std::shared_ptr<Tombstone> tombstone = expiring_column->ToTombstone();
         new_columns.push_back(tombstone);
         *changed = true;
@@ -298,11 +277,9 @@ RowValue RowValue::RemoveTombstones(int32_t gc_grace_period) const {
   return RowValue(std::move(new_columns), last_modified_time_);
 }

-bool RowValue::Empty() const {
-  return columns_.empty();
-}
+bool RowValue::Empty() const { return columns_.empty(); }

-RowValue RowValue::Deserialize(const char *src, std::size_t size) {
+RowValue RowValue::Deserialize(const char* src, std::size_t size) {
   std::size_t offset = 0;
   assert(size >= sizeof(local_deletion_time_) + sizeof(marked_for_delete_at_));
   int32_t local_deletion_time =
@@ -321,9 +298,9 @@ RowValue RowValue::Deserialize(const char *src, std::size_t size) {
   int64_t last_modified_time = 0;
   while (offset < size) {
     auto c = ColumnBase::Deserialize(src, offset);
-    offset += c -> Size();
+    offset += c->Size();
     assert(offset <= size);
-    last_modified_time = std::max(last_modified_time, c -> Timestamp());
+    last_modified_time = std::max(last_modified_time, c->Timestamp());
     columns.push_back(std::move(c));
   }

@@ -344,9 +321,9 @@ RowValue RowValue::Merge(std::vector<RowValue>&& values) {
   // Merge columns by their last modified time, and skip once we hit
   // a row tombstone.
   std::sort(values.begin(), values.end(),
-    [](const RowValue& r1, const RowValue& r2) {
-      return r1.LastModifiedTime() > r2.LastModifiedTime();
-    });
+            [](const RowValue& r1, const RowValue& r2) {
+              return r1.LastModifiedTime() > r2.LastModifiedTime();
+            });

   std::map<int8_t, std::shared_ptr<ColumnBase>> merged_columns;
   int64_t tombstone_timestamp = 0;
@@ -373,7 +350,7 @@ RowValue RowValue::Merge(std::vector<RowValue>&& values) {

   int64_t last_modified_time = 0;
   Columns columns;
-  for (auto& pair: merged_columns) {
+  for (auto& pair : merged_columns) {
     // For some row, its last_modified_time > row tombstone_timestamp, but
     // it might have rows whose timestamp is ealier than tombstone, so we
     // ned to filter these rows.
@@ -386,5 +363,5 @@ RowValue RowValue::Merge(std::vector<RowValue>&& values) {
   return RowValue(std::move(columns), last_modified_time);
 }

-} // namepsace cassandrda
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/cassandra/format.h b/utilities/cassandra/format.h
index 2aca9d3f939..1b27147351a 100644
--- a/utilities/cassandra/format.h
+++ b/utilities/cassandra/format.h
@@ -58,6 +58,7 @@
 #include <chrono>
 #include <memory>
 #include <vector>
+
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice.h"
@@ -70,9 +71,8 @@ enum ColumnTypeMask {
   EXPIRATION_MASK = 0x02,
 };

-
 class ColumnBase {
-public:
+ public:
   ColumnBase(int8_t mask, int8_t index);
   virtual ~ColumnBase() = default;
@@ -84,15 +84,15 @@ class ColumnBase {
   static std::shared_ptr<ColumnBase> Deserialize(const char* src,
                                                  std::size_t offset);

-private:
+ private:
   int8_t mask_;
   int8_t index_;
 };

 class Column : public ColumnBase {
-public:
-  Column(int8_t mask, int8_t index, int64_t timestamp,
-         int32_t value_size, const char* value);
+ public:
+  Column(int8_t mask, int8_t index, int64_t timestamp, int32_t value_size,
+         const char* value);

   virtual int64_t Timestamp() const override;
   virtual std::size_t Size() const override;
@@ -100,16 +100,16 @@ class Column : public ColumnBase {
   static std::shared_ptr<Column> Deserialize(const char* src,
                                              std::size_t offset);

-private:
+ private:
   int64_t timestamp_;
   int32_t value_size_;
   const char* value_;
 };

 class Tombstone : public ColumnBase {
-public:
-  Tombstone(int8_t mask, int8_t index,
-            int32_t local_deletion_time, int64_t marked_for_delete_at);
+ public:
+  Tombstone(int8_t mask, int8_t index, int32_t local_deletion_time,
+            int64_t marked_for_delete_at);

   virtual int64_t Timestamp() const override;
   virtual std::size_t Size() const override;
@@ -118,15 +118,15 @@ class Tombstone : public ColumnBase {
   static std::shared_ptr<Tombstone> Deserialize(const char* src,
                                                 std::size_t offset);

-private:
+ private:
   int32_t local_deletion_time_;
   int64_t marked_for_delete_at_;
 };

 class ExpiringColumn : public Column {
-public:
+ public:
   ExpiringColumn(int8_t mask, int8_t index, int64_t timestamp,
-    int32_t value_size, const char* value, int32_t ttl);
+                 int32_t value_size, const char* value, int32_t ttl);

   virtual std::size_t Size() const override;
   virtual void Serialize(std::string* dest) const override;
@@ -136,7 +136,7 @@ class ExpiringColumn : public Column {
   static std::shared_ptr<ExpiringColumn> Deserialize(const char* src,
                                                      std::size_t offset);

-private:
+ private:
   int32_t ttl_;
   std::chrono::time_point<std::chrono::system_clock> TimePoint() const;
   std::chrono::seconds Ttl() const;
@@ -145,12 +145,11 @@ class ExpiringColumn : public Column {
 using Columns = std::vector<std::shared_ptr<ColumnBase>>;

 class RowValue {
-public:
+ public:
   // Create a Row Tombstone.
   RowValue(int32_t local_deletion_time, int64_t marked_for_delete_at);
   // Create a Row containing columns.
-  RowValue(Columns columns,
-           int64_t last_modified_time);
+  RowValue(Columns columns, int64_t last_modified_time);
   RowValue(const RowValue& /*that*/) = delete;
   RowValue(RowValue&& /*that*/) noexcept = default;
   RowValue& operator=(const RowValue& /*that*/) = delete;
@@ -180,5 +179,5 @@ class RowValue {
   int64_t last_modified_time_;
 };

-} // namepsace cassandrda
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/cassandra/merge_operator.cc b/utilities/cassandra/merge_operator.cc
index 9d0cdd38548..bde5dcbaddb 100644
--- a/utilities/cassandra/merge_operator.cc
+++ b/utilities/cassandra/merge_operator.cc
@@ -44,9 +44,8 @@ bool CassandraValueMergeOperator::FullMergeV2(
   merge_out->new_value.clear();
   std::vector<RowValue> row_values;
   if (merge_in.existing_value) {
-    row_values.push_back(
-      RowValue::Deserialize(merge_in.existing_value->data(),
-                            merge_in.existing_value->size()));
+    row_values.push_back(RowValue::Deserialize(
+        merge_in.existing_value->data(), merge_in.existing_value->size()));
   }

   for (auto& operand : merge_in.operand_list) {
@@ -78,6 +77,6 @@ bool CassandraValueMergeOperator::PartialMergeMulti(
   return true;
 }

-} // namespace cassandra
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/cassandra/merge_operator.h b/utilities/cassandra/merge_operator.h
index 4bf9128098a..af8725db7de 100644
--- a/utilities/cassandra/merge_operator.h
+++ b/utilities/cassandra/merge_operator.h
@@ -15,30 +15,30 @@ namespace cassandra {
  * A MergeOperator for rocksdb that implements Cassandra row value merge.
  */
 class CassandraValueMergeOperator : public MergeOperator {
-public:
-  explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds,
-                                       size_t operands_limit = 0);
+ public:
+  explicit CassandraValueMergeOperator(int32_t gc_grace_period_in_seconds,
+                                       size_t operands_limit = 0);

-  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
-                           MergeOperationOutput* merge_out) const override;
+  virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+                           MergeOperationOutput* merge_out) const override;

-  virtual bool PartialMergeMulti(const Slice& key,
-                                 const std::deque<Slice>& operand_list,
-                                 std::string* new_value,
-                                 Logger* logger) const override;
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* logger) const override;

-  const char* Name() const override { return kClassName(); }
-  static const char* kClassName() { return "CassandraValueMergeOperator"; }
+  const char* Name() const override { return kClassName(); }
+  static const char* kClassName() { return "CassandraValueMergeOperator"; }

-  virtual bool AllowSingleOperand() const override { return true; }
+  virtual bool AllowSingleOperand() const override { return true; }

-  virtual bool ShouldMerge(const std::vector<Slice>& operands) const override {
-    return options_.operands_limit > 0 &&
-           operands.size() >= options_.operands_limit;
-  }
+  virtual bool ShouldMerge(const std::vector<Slice>& operands) const override {
+    return options_.operands_limit > 0 &&
+           operands.size() >= options_.operands_limit;
+  }

-private:
-  CassandraOptions options_;
+ private:
+  CassandraOptions options_;
 };
-} // namespace cassandra
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/cassandra/serialize.h b/utilities/cassandra/serialize.h
index 8f50a02dd97..4bd552bfc1f 100644
--- a/utilities/cassandra/serialize.h
+++ b/utilities/cassandra/serialize.h
@@ -20,61 +20,62 @@ namespace cassandra {
 namespace {
 const int64_t kCharMask = 0xFFLL;
 const int32_t kBitsPerByte = 8;
-}
+}  // namespace

-template<typename T>
+template <typename T>
 void Serialize(T val, std::string* dest);

-template<typename T>
-T Deserialize(const char* src, std::size_t offset=0);
+template <typename T>
+T Deserialize(const char* src, std::size_t offset = 0);

 // Specializations
-template<>
+template <>
 inline void Serialize<int8_t>(int8_t t, std::string* dest) {
   dest->append(1, static_cast<char>(t & kCharMask));
 }

-template<>
+template <>
 inline void Serialize<int32_t>(int32_t t, std::string* dest) {
   for (unsigned long i = 0; i < sizeof(int32_t); i++) {
-    dest->append(1, static_cast<char>(
-      (t >> (sizeof(int32_t) - 1 - i) * kBitsPerByte) & kCharMask));
+    dest->append(
+        1, static_cast<char>((t >> (sizeof(int32_t) - 1 - i) * kBitsPerByte) &
+                             kCharMask));
   }
 }

-template<>
+template <>
 inline void Serialize<int64_t>(int64_t t, std::string* dest) {
   for (unsigned long i = 0; i < sizeof(int64_t); i++) {
-    dest->append(
-      1, static_cast<char>(
-        (t >> (sizeof(int64_t) - 1 - i) * kBitsPerByte) & kCharMask));
+    dest->append(
+        1, static_cast<char>((t >> (sizeof(int64_t) - 1 - i) * kBitsPerByte) &
+                             kCharMask));
   }
 }

-template<>
+template <>
 inline int8_t Deserialize<int8_t>(const char* src, std::size_t offset) {
   return static_cast<int8_t>(src[offset]);
 }

-template<>
+template <>
 inline int32_t Deserialize<int32_t>(const char* src, std::size_t offset) {
   int32_t result = 0;
   for (unsigned long i = 0; i < sizeof(int32_t); i++) {
     result |= static_cast<int32_t>(static_cast<unsigned char>(src[offset + i]))
-        << ((sizeof(int32_t) - 1 - i) * kBitsPerByte);
+              << ((sizeof(int32_t) - 1 - i) * kBitsPerByte);
   }
   return result;
 }

-template<>
+template <>
 inline int64_t Deserialize<int64_t>(const char* src, std::size_t offset) {
   int64_t result = 0;
   for (unsigned long i = 0; i < sizeof(int64_t); i++) {
     result |= static_cast<int64_t>(static_cast<unsigned char>(src[offset + i]))
-        << ((sizeof(int64_t) - 1 - i) * kBitsPerByte);
+              << ((sizeof(int64_t) - 1 - i) * kBitsPerByte);
   }
   return result;
 }

-} // namepsace cassandrda
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/cassandra/test_utils.cc b/utilities/cassandra/test_utils.cc
index 679878a7385..ec6e5752d0a 100644
--- a/utilities/cassandra/test_utils.cc
+++ b/utilities/cassandra/test_utils.cc
@@ -14,18 +14,17 @@ const int8_t kColumn = 0;
 const int8_t kTombstone = 1;
 const int8_t kExpiringColumn = 2;

-std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask,
-                                             int8_t index,
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
                                              int64_t timestamp) {
   if ((mask & ColumnTypeMask::DELETION_MASK) != 0) {
     return std::shared_ptr<ColumnBase>(
         new Tombstone(mask, index, ToSeconds(timestamp), timestamp));
   } else if ((mask & ColumnTypeMask::EXPIRATION_MASK) != 0) {
     return std::shared_ptr<ColumnBase>(new ExpiringColumn(
-      mask, index, timestamp, sizeof(kExpiringData), kExpiringData, kTtl));
+        mask, index, timestamp, sizeof(kExpiringData), kExpiringData, kTtl));
   } else {
     return std::shared_ptr<ColumnBase>(
-      new Column(mask, index, timestamp, sizeof(kData), kData));
+        new Column(mask, index, timestamp, sizeof(kData), kData));
   }
 }
@@ -39,10 +38,10 @@ RowValue CreateTestRowValue(
     std::vector<std::tuple<int8_t, int8_t, int64_t>> column_specs) {
   std::vector<std::shared_ptr<ColumnBase>> columns;
   int64_t last_modified_time = 0;
-  for (auto spec: column_specs) {
+  for (auto spec : column_specs) {
     auto c = CreateTestColumn(std::get<0>(spec), std::get<1>(spec),
                               std::get<2>(spec));
-    last_modified_time = std::max(last_modified_time, c -> Timestamp());
+    last_modified_time = std::max(last_modified_time, c->Timestamp());
     columns.push_back(std::move(c));
   }
   return RowValue(std::move(columns), last_modified_time);
@@ -61,12 +60,10 @@ void VerifyRowValueColumns(
   EXPECT_EQ(expected_index, columns[index_of_vector]->Index());
 }

-int64_t ToMicroSeconds(int64_t seconds) {
-  return seconds * (int64_t)1000000;
-}
+int64_t ToMicroSeconds(int64_t seconds) { return seconds * (int64_t)1000000; }

 int32_t ToSeconds(int64_t microseconds) {
   return (int32_t)(microseconds / (int64_t)1000000);
 }

-}
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/cassandra/test_utils.h b/utilities/cassandra/test_utils.h
index 9d657527250..be23f707606 100644
--- a/utilities/cassandra/test_utils.h
+++ b/utilities/cassandra/test_utils.h
@@ -5,6 +5,7 @@
 #pragma once
 #include <memory>
+
 #include "test_util/testharness.h"
 #include "utilities/cassandra/format.h"
 #include "utilities/cassandra/serialize.h"
@@ -18,9 +19,7 @@ extern const int8_t kColumn;
 extern const int8_t kTombstone;
 extern const int8_t kExpiringColumn;

-
-std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask,
-                                             int8_t index,
+std::shared_ptr<ColumnBase> CreateTestColumn(int8_t mask, int8_t index,
                                              int64_t timestamp);

 std::tuple<int8_t, int8_t, int64_t> CreateTestColumnSpec(int8_t mask,
@@ -39,5 +38,5 @@ void VerifyRowValueColumns(
 int64_t ToMicroSeconds(int64_t seconds);
 int32_t ToSeconds(int64_t microseconds);

-}
+}  // namespace cassandra
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index 44ce70b1b6e..cdea325cd71 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -382,6 +382,7 @@ Status CheckpointImpl::ExportColumnFamily(
         live_file_metadata.largestkey = std::move(file_metadata.largestkey);
         live_file_metadata.oldest_blob_file_number =
             file_metadata.oldest_blob_file_number;
+        live_file_metadata.epoch_number = file_metadata.epoch_number;
         live_file_metadata.level = level_metadata.level;
         result_metadata->files.push_back(live_file_metadata);
       }
diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h
index ad9f84a77d3..2947330ccef 100644
--- a/utilities/checkpoint/checkpoint_impl.h
+++ b/utilities/checkpoint/checkpoint_impl.h
@@ -6,11 +6,11 @@
 #pragma once
 #ifndef ROCKSDB_LITE

-#include "rocksdb/utilities/checkpoint.h"
-
 #include <string>
+
 #include "file/filename.h"
 #include "rocksdb/db.h"
+#include "rocksdb/utilities/checkpoint.h"

 namespace ROCKSDB_NAMESPACE {
diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc
index 10aaf16ba5c..3da753d5f3b 100644
--- a/utilities/checkpoint/checkpoint_test.cc
+++ b/utilities/checkpoint/checkpoint_test.cc
@@ -136,9 +136,8 @@ class CheckpointTest : public testing::Test {
     ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
   }

-  Status TryReopenWithColumnFamilies(
-      const std::vector<std::string>& cfs,
-      const std::vector<Options>& options) {
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const std::vector<Options>& options) {
     Close();
     EXPECT_EQ(cfs.size(), options.size());
     std::vector<ColumnFamilyDescriptor> column_families;
@@ -156,9 +155,7 @@ class CheckpointTest : public testing::Test {
     return TryReopenWithColumnFamilies(cfs, v_opts);
   }

-  void Reopen(const Options& options) {
-    ASSERT_OK(TryReopen(options));
-  }
+  void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }

   void CompactAll() {
     for (auto h : handles_) {
@@ -223,9 +220,7 @@ class CheckpointTest : public testing::Test {
     return db_->Put(wo, handles_[cf], k, v);
   }

-  Status Delete(const std::string& k) {
-    return db_->Delete(WriteOptions(), k);
-  }
+  Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }

   Status Delete(int cf, const std::string& k) {
     return db_->Delete(WriteOptions(), handles_[cf], k);
@@ -512,18 +507,18 @@ TEST_F(CheckpointTest, CheckpointCF) {
   std::vector<std::string> cfs;
   cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"};
   std::vector<ColumnFamilyDescriptor> column_families;
-    for (size_t i = 0; i < cfs.size(); ++i) {
-      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
-    }
-  ASSERT_OK(DB::Open(options, snapshot_name_,
-      column_families, &cphandles, &snapshotDB));
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
+  }
+  ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
+                     &snapshotDB));
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
   ASSERT_EQ("Default1", result);
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result));
   ASSERT_EQ("eleven", result);
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result));
   for (auto h : cphandles) {
-      delete h;
+    delete h;
   }
   cphandles.clear();
   delete snapshotDB;
diff --git a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
index f4dbce10043..b788dbf9b06 100644
--- a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
+++ b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
@@ -5,10 +5,11 @@

 #ifndef ROCKSDB_LITE

+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
 #include <string>

 #include "rocksdb/slice.h"
-#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"

 namespace ROCKSDB_NAMESPACE {
diff --git a/utilities/convenience/info_log_finder.cc b/utilities/convenience/info_log_finder.cc
index 37f3bceee9a..fe62fd56168 100644
--- a/utilities/convenience/info_log_finder.cc
+++ b/utilities/convenience/info_log_finder.cc
@@ -8,6 +8,7 @@
 // found in the LICENSE file.

 #include "rocksdb/utilities/info_log_finder.h"
+
 #include "file/filename.h"
 #include "rocksdb/env.h"
diff --git a/utilities/env_mirror_test.cc b/utilities/env_mirror_test.cc
index 13683b8ee64..c372de1da5e 100644
--- a/utilities/env_mirror_test.cc
+++ b/utilities/env_mirror_test.cc
@@ -7,6 +7,7 @@
 #ifndef ROCKSDB_LITE

 #include "rocksdb/utilities/env_mirror.h"
+
 #include "env/mock_env.h"
 #include "test_util/testharness.h"

@@ -15,7 +16,7 @@ namespace ROCKSDB_NAMESPACE {
 class EnvMirrorTest : public testing::Test {
  public:
   Env* default_;
-  MockEnv* a_, *b_;
+  MockEnv *a_, *b_;
   EnvMirror* env_;
   const EnvOptions soptions_;

@@ -97,8 +98,9 @@ TEST_F(EnvMirrorTest, Basics) {
   ASSERT_TRUE(
       !env_->NewSequentialFile("/dir/non_existent", &seq_file, soptions_).ok());
   ASSERT_TRUE(!seq_file);
-  ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file,
-                                         soptions_).ok());
+  ASSERT_TRUE(
+      !env_->NewRandomAccessFile("/dir/non_existent", &rand_file, soptions_)
+           .ok());
   ASSERT_TRUE(!rand_file);

   // Check that deleting works.
diff --git a/utilities/env_timed_test.cc b/utilities/env_timed_test.cc index 29952925c4a..6e392579d2b 100644 --- a/utilities/env_timed_test.cc +++ b/utilities/env_timed_test.cc @@ -11,8 +11,7 @@ namespace ROCKSDB_NAMESPACE { -class TimedEnvTest : public testing::Test { -}; +class TimedEnvTest : public testing::Test {}; TEST_F(TimedEnvTest, BasicTest) { SetPerfLevel(PerfLevel::kEnableTime); diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h index c492f998744..549bfe7168a 100644 --- a/utilities/fault_injection_env.h +++ b/utilities/fault_injection_env.h @@ -85,8 +85,7 @@ class TestWritableFile : public WritableFile { virtual Status Flush() override; virtual Status Sync() override; virtual bool IsSyncThreadSafe() const override { return true; } - virtual Status PositionedAppend(const Slice& data, - uint64_t offset) override { + virtual Status PositionedAppend(const Slice& data, uint64_t offset) override { return target_->PositionedAppend(data, offset); } virtual Status PositionedAppend( @@ -227,8 +226,8 @@ class FaultInjectionTestEnv : public EnvWrapper { MutexLock l(&mutex_); return filesystem_active_; } - void SetFilesystemActiveNoLock(bool active, - Status error = Status::Corruption("Not active")) { + void SetFilesystemActiveNoLock( + bool active, Status error = Status::Corruption("Not active")) { error.PermitUncheckedError(); filesystem_active_ = active; if (!active) { @@ -237,7 +236,7 @@ class FaultInjectionTestEnv : public EnvWrapper { error.PermitUncheckedError(); } void SetFilesystemActive(bool active, - Status error = Status::Corruption("Not active")) { + Status error = Status::Corruption("Not active")) { error.PermitUncheckedError(); MutexLock l(&mutex_); SetFilesystemActiveNoLock(active, error); diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 26d4d2d4ccb..5261d79ea1c 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -26,6 +26,7 @@ #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -386,9 +387,9 @@ IOStatus TestFSRandomRWFile::Sync(const IOOptions& options, return target_->Sync(options, dbg); } -TestFSRandomAccessFile::TestFSRandomAccessFile(const std::string& /*fname*/, - std::unique_ptr&& f, - FaultInjectionTestFS* fs) +TestFSRandomAccessFile::TestFSRandomAccessFile( + const std::string& /*fname*/, std::unique_ptr&& f, + FaultInjectionTestFS* fs) : target_(std::move(f)), fs_(fs) { assert(target_ != nullptr); } @@ -412,6 +413,35 @@ IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n, return s; } +IOStatus TestFSRandomAccessFile::ReadAsync( + FSReadRequest& req, const IOOptions& opts, + std::function cb, void* cb_arg, + void** io_handle, IOHandleDeleter* del_fn, IODebugContext* /*dbg*/) { + IOStatus ret; + IOStatus s; + FSReadRequest res; + if (!fs_->IsFilesystemActive()) { + ret = fs_->GetError(); + } else { + ret = fs_->InjectThreadSpecificReadError( + FaultInjectionTestFS::ErrorOperation::kRead, &res.result, + use_direct_io(), req.scratch, /*need_count_increase=*/true, + /*fault_injected=*/nullptr); + } + if (ret.ok()) { + if (fs_->ShouldInjectRandomReadError()) { + ret = IOStatus::IOError("Injected read error"); + } else { + s = target_->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, nullptr); + } + } + if (!ret.ok()) { + res.status = ret; + cb(res, cb_arg); + } + return s; +} + IOStatus 
TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { @@ -803,6 +833,15 @@ IOStatus FaultInjectionTestFS::LinkFile(const std::string& s, return io_s; } +IOStatus FaultInjectionTestFS::Poll(std::vector& io_handles, + size_t min_completions) { + return target()->Poll(io_handles, min_completions); +} + +IOStatus FaultInjectionTestFS::AbortIO(std::vector& io_handles) { + return target()->AbortIO(io_handles); +} + void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) { MutexLock l(&mutex_); if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) { @@ -912,8 +951,7 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( bool dummy_bool; bool& ret_fault_injected = fault_injected ? *fault_injected : dummy_bool; ret_fault_injected = false; - ErrorContext* ctx = - static_cast(thread_local_error_->Get()); + ErrorContext* ctx = static_cast(thread_local_error_->Get()); if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in) { return IOStatus::OK(); } @@ -1019,8 +1057,7 @@ IOStatus FaultInjectionTestFS::InjectMetadataWriteError() { void FaultInjectionTestFS::PrintFaultBacktrace() { #if defined(OS_LINUX) - ErrorContext* ctx = - static_cast(thread_local_error_->Get()); + ErrorContext* ctx = static_cast(thread_local_error_->Get()); if (ctx == nullptr) { return; } diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index 8cf6c44c833..cab0051bd14 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -135,12 +135,16 @@ class TestFSRandomRWFile : public FSRandomRWFile { class TestFSRandomAccessFile : public FSRandomAccessFile { public: explicit TestFSRandomAccessFile(const std::string& fname, - std::unique_ptr&& f, - FaultInjectionTestFS* fs); + std::unique_ptr&& f, + FaultInjectionTestFS* fs); ~TestFSRandomAccessFile() override {} IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const override; + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, + std::function cb, + void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, + IODebugContext* dbg) override; IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) override; size_t GetRequiredBufferAlignment() const override { @@ -266,6 +270,11 @@ class FaultInjectionTestFS : public FileSystemWrapper { return io_s; } + virtual IOStatus Poll(std::vector& io_handles, + size_t min_completions) override; + + virtual IOStatus AbortIO(std::vector& io_handles) override; + void WritableFileClosed(const FSFileState& state); void WritableFileSynced(const FSFileState& state); @@ -331,8 +340,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { error.PermitUncheckedError(); SetFilesystemActiveNoLock(active, error); } - void SetFilesystemDirectWritable( - bool writable) { + void SetFilesystemDirectWritable(bool writable) { MutexLock l(&mutex_); filesystem_writable_ = writable; } @@ -396,7 +404,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { // 1/one_in probability) void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) { struct ErrorContext* ctx = - static_cast(thread_local_error_->Get()); + static_cast(thread_local_error_->Get()); if (ctx == nullptr) { ctx = new ErrorContext(seed); thread_local_error_->Reset(ctx); @@ -405,7 +413,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { ctx->count = 0; } - static void 
DeleteThreadLocalErrorContext(void *p) { + static void DeleteThreadLocalErrorContext(void* p) { ErrorContext* ctx = static_cast(p); delete ctx; } @@ -466,8 +474,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { // Get the count of how many times we injected since the previous call int GetAndResetErrorCount() { - ErrorContext* ctx = - static_cast(thread_local_error_->Get()); + ErrorContext* ctx = static_cast(thread_local_error_->Get()); int count = 0; if (ctx != nullptr) { count = ctx->count; @@ -477,8 +484,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { } void EnableErrorInjection() { - ErrorContext* ctx = - static_cast(thread_local_error_->Get()); + ErrorContext* ctx = static_cast(thread_local_error_->Get()); if (ctx) { ctx->enable_error_injection = true; } @@ -499,8 +505,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { } void DisableErrorInjection() { - ErrorContext* ctx = - static_cast(thread_local_error_->Get()); + ErrorContext* ctx = static_cast(thread_local_error_->Get()); if (ctx) { ctx->enable_error_injection = false; } @@ -530,7 +535,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { // will be recovered to content accordingly. std::unordered_map> dir_to_new_files_since_last_sync_; - bool filesystem_active_; // Record flushes, syncs, writes + bool filesystem_active_; // Record flushes, syncs, writes bool filesystem_writable_; // Bypass FaultInjectionTestFS and go directly // to underlying FS for writable files IOStatus error_; diff --git a/utilities/fault_injection_secondary_cache.cc b/utilities/fault_injection_secondary_cache.cc index 2758c2a1934..d24e92f06ff 100644 --- a/utilities/fault_injection_secondary_cache.cc +++ b/utilities/fault_injection_secondary_cache.cc @@ -36,7 +36,9 @@ void FaultInjectionSecondaryCache::ResultHandle::Wait() { UpdateHandleValue(this); } -void* FaultInjectionSecondaryCache::ResultHandle::Value() { return value_; } +Cache::ObjectPtr FaultInjectionSecondaryCache::ResultHandle::Value() { + return value_; +} size_t FaultInjectionSecondaryCache::ResultHandle::Size() { return size_; } @@ -75,7 +77,8 @@ FaultInjectionSecondaryCache::GetErrorContext() { } Status FaultInjectionSecondaryCache::Insert( - const Slice& key, void* value, const Cache::CacheItemHelper* helper) { + const Slice& key, Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper) { ErrorContext* ctx = GetErrorContext(); if (ctx->rand.OneIn(prob_)) { return Status::IOError(); @@ -86,7 +89,8 @@ Status FaultInjectionSecondaryCache::Insert( std::unique_ptr FaultInjectionSecondaryCache::Lookup(const Slice& key, - const Cache::CreateCallback& create_cb, + const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, bool& is_in_sec_cache) { ErrorContext* ctx = GetErrorContext(); @@ -94,11 +98,12 @@ FaultInjectionSecondaryCache::Lookup(const Slice& key, if (ctx->rand.OneIn(prob_)) { return nullptr; } else { - return base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache); + return base_->Lookup(key, helper, create_context, wait, advise_erase, + is_in_sec_cache); } } else { - std::unique_ptr hdl = - base_->Lookup(key, create_cb, wait, advise_erase, is_in_sec_cache); + std::unique_ptr hdl = base_->Lookup( + key, helper, create_context, wait, advise_erase, is_in_sec_cache); if (wait && ctx->rand.OneIn(prob_)) { hdl.reset(); } diff --git a/utilities/fault_injection_secondary_cache.h b/utilities/fault_injection_secondary_cache.h index 5321df626ce..47585e30e0a 100644 --- 
a/utilities/fault_injection_secondary_cache.h +++ b/utilities/fault_injection_secondary_cache.h @@ -31,12 +31,13 @@ class FaultInjectionSecondaryCache : public SecondaryCache { const char* Name() const override { return "FaultInjectionSecondaryCache"; } - Status Insert(const Slice& key, void* value, + Status Insert(const Slice& key, Cache::ObjectPtr value, const Cache::CacheItemHelper* helper) override; std::unique_ptr Lookup( - const Slice& key, const Cache::CreateCallback& create_cb, bool wait, - bool advise_erase, bool& is_in_sec_cache) override; + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& is_in_sec_cache) override; bool SupportForceErase() const override { return base_->SupportForceErase(); } @@ -69,7 +70,7 @@ class FaultInjectionSecondaryCache : public SecondaryCache { void Wait() override; - void* Value() override; + Cache::ObjectPtr Value() override; size_t Size() override; @@ -81,7 +82,7 @@ class FaultInjectionSecondaryCache : public SecondaryCache { FaultInjectionSecondaryCache* cache_; std::unique_ptr base_; - void* value_; + Cache::ObjectPtr value_; size_t size_; }; diff --git a/utilities/leveldb_options/leveldb_options.cc b/utilities/leveldb_options/leveldb_options.cc index 5698b21ce23..125c3d9565c 100644 --- a/utilities/leveldb_options/leveldb_options.cc +++ b/utilities/leveldb_options/leveldb_options.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/utilities/leveldb_options.h" + #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" diff --git a/utilities/memory_allocators.h b/utilities/memory_allocators.h index c9e77a5b7d2..bdc2e13a949 100644 --- a/utilities/memory_allocators.h +++ b/utilities/memory_allocators.h @@ -6,7 +6,6 @@ #pragma once #include - #include "rocksdb/memory_allocator.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h index 37535cdc53a..9b90107e398 100644 --- a/utilities/merge_operators.h +++ b/utilities/merge_operators.h @@ -4,13 +4,13 @@ // (found in the LICENSE.Apache file in the root directory). // #pragma once -#include "rocksdb/merge_operator.h" - #include #include #include +#include "rocksdb/merge_operator.h" + namespace ROCKSDB_NAMESPACE { class MergeOperators { @@ -19,7 +19,8 @@ class MergeOperators { static std::shared_ptr CreateDeprecatedPutOperator(); static std::shared_ptr CreateUInt64AddOperator(); static std::shared_ptr CreateStringAppendOperator(); - static std::shared_ptr CreateStringAppendOperator(char delim_char); + static std::shared_ptr CreateStringAppendOperator( + char delim_char); static std::shared_ptr CreateStringAppendOperator( const std::string& delim); static std::shared_ptr CreateStringAppendTESTOperator(); diff --git a/utilities/merge_operators/bytesxor.cc b/utilities/merge_operators/bytesxor.cc index 9324a366571..fa09c18ea99 100644 --- a/utilities/merge_operators/bytesxor.cc +++ b/utilities/merge_operators/bytesxor.cc @@ -3,28 +3,26 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
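
The FaultInjectionSecondaryCache hunks above track RocksDB's wider cache API migration: raw void* values with per-insert deleters and per-lookup create callbacks give way to a typed Cache::ObjectPtr plus a shared CacheItemHelper/CreateContext. A minimal self-contained sketch of that pattern follows; Helper and Entry are illustrative names for this aside, not RocksDB declarations:

#include <cstddef>
#include <string>

// One static helper per value type replaces a deleter function pointer
// captured at every Insert() call.
using ObjectPtr = void*;

struct Helper {
  // Destroys a cached object; shared by all entries of this type.
  void (*del)(ObjectPtr obj);
  // Recreates an object from serialized bytes (what a secondary cache needs).
  ObjectPtr (*create)(const char* data, size_t size);
};

struct Entry {
  std::string value;
};

static const Helper kEntryHelper = {
    /*del=*/[](ObjectPtr obj) { delete static_cast<Entry*>(obj); },
    /*create=*/[](const char* data, size_t size) -> ObjectPtr {
      return new Entry{std::string(data, size)};
    }};

int main() {
  // Insert side: the helper travels with the entry instead of a raw deleter.
  ObjectPtr obj = new Entry{"payload"};
  const Helper* helper = &kEntryHelper;
  // Lookup side: a secondary cache can rebuild the object from bytes.
  ObjectPtr restored = helper->create("payload", 7);
  helper->del(obj);
  helper->del(restored);
  return 0;
}

One shared helper per type is cheaper and safer than per-entry callbacks: entries shrink, and the destroy/recreate logic cannot diverge between call sites.
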
+#include "utilities/merge_operators/bytesxor.h" + #include #include -#include "utilities/merge_operators/bytesxor.h" - namespace ROCKSDB_NAMESPACE { std::shared_ptr MergeOperators::CreateBytesXOROperator() { return std::make_shared(); } -bool BytesXOROperator::Merge(const Slice& /*key*/, - const Slice* existing_value, - const Slice& value, - std::string* new_value, - Logger* /*logger*/) const { +bool BytesXOROperator::Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* /*logger*/) const { XOR(existing_value, value, new_value); return true; } -void BytesXOROperator::XOR(const Slice* existing_value, - const Slice& value, std::string* new_value) const { +void BytesXOROperator::XOR(const Slice* existing_value, const Slice& value, + std::string* new_value) const { if (!existing_value) { new_value->clear(); new_value->assign(value.data(), value.size()); diff --git a/utilities/merge_operators/bytesxor.h b/utilities/merge_operators/bytesxor.h index f05b6ca981f..3c7baaccec4 100644 --- a/utilities/merge_operators/bytesxor.h +++ b/utilities/merge_operators/bytesxor.h @@ -8,6 +8,7 @@ #include #include #include + #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" @@ -22,10 +23,8 @@ class BytesXOROperator : public AssociativeMergeOperator { public: // XORs the two array of bytes one byte at a time and stores the result // in new_value. len is the number of xored bytes, and the length of new_value - virtual bool Merge(const Slice& key, - const Slice* existing_value, - const Slice& value, - std::string* new_value, + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, Logger* logger) const override; static const char* kClassName() { return "BytesXOR"; } @@ -35,7 +34,7 @@ class BytesXOROperator : public AssociativeMergeOperator { const char* Name() const override { return kClassName(); } void XOR(const Slice* existing_value, const Slice& value, - std::string* new_value) const; + std::string* new_value) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc index ba86706efb6..ccf9ff21f19 100644 --- a/utilities/merge_operators/put.cc +++ b/utilities/merge_operators/put.cc @@ -4,11 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). #include -#include "rocksdb/slice.h" + #include "rocksdb/merge_operator.h" +#include "rocksdb/slice.h" #include "utilities/merge_operators.h" -namespace { // anonymous namespace +namespace { // anonymous namespace using ROCKSDB_NAMESPACE::Logger; using ROCKSDB_NAMESPACE::MergeOperator; @@ -77,7 +78,7 @@ class PutOperatorV2 : public PutOperator { const char* NickName() const override { return kNickName(); } }; -} // end of anonymous namespace +} // end of anonymous namespace namespace ROCKSDB_NAMESPACE { diff --git a/utilities/merge_operators/sortlist.cc b/utilities/merge_operators/sortlist.cc index fae33e2fd7b..67bfc7e5eab 100644 --- a/utilities/merge_operators/sortlist.cc +++ b/utilities/merge_operators/sortlist.cc @@ -3,11 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
#include "utilities/merge_operators/sortlist.h" + #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "utilities/merge_operators.h" - namespace ROCKSDB_NAMESPACE { bool SortList::FullMergeV2(const MergeOperationInput& merge_in, diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc index c20d415e76a..5092cabcb45 100644 --- a/utilities/merge_operators/string_append/stringappend.cc +++ b/utilities/merge_operators/string_append/stringappend.cc @@ -48,7 +48,7 @@ bool StringAppendOperator::Merge(const Slice& /*key*/, if (!existing_value) { // No existing_value. Set *new_value = value - new_value->assign(value.data(),value.size()); + new_value->assign(value.data(), value.size()); } else { // Generic append (existing_value != null). // Reserve *new_value to correct size, and apply concatenation. @@ -61,12 +61,12 @@ bool StringAppendOperator::Merge(const Slice& /*key*/, return true; } - std::shared_ptr MergeOperators::CreateStringAppendOperator() { return std::make_shared(','); } -std::shared_ptr MergeOperators::CreateStringAppendOperator(char delim_char) { +std::shared_ptr MergeOperators::CreateStringAppendOperator( + char delim_char) { return std::make_shared(delim_char); } diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h index 3c2bb1907a4..153532382c4 100644 --- a/utilities/merge_operators/string_append/stringappend.h +++ b/utilities/merge_operators/string_append/stringappend.h @@ -16,10 +16,8 @@ class StringAppendOperator : public AssociativeMergeOperator { explicit StringAppendOperator(char delim_char); explicit StringAppendOperator(const std::string& delim); - virtual bool Merge(const Slice& key, - const Slice* existing_value, - const Slice& value, - std::string* new_value, + virtual bool Merge(const Slice& key, const Slice* existing_value, + const Slice& value, std::string* new_value, Logger* logger) const override; static const char* kClassName() { return "StringAppendOperator"; } diff --git a/utilities/merge_operators/string_append/stringappend2.h b/utilities/merge_operators/string_append/stringappend2.h index 339c760bd26..75389e4ae81 100644 --- a/utilities/merge_operators/string_append/stringappend2.h +++ b/utilities/merge_operators/string_append/stringappend2.h @@ -31,8 +31,8 @@ class StringAppendTESTOperator : public MergeOperator { virtual bool PartialMergeMulti(const Slice& key, const std::deque& operand_list, - std::string* new_value, Logger* logger) const - override; + std::string* new_value, + Logger* logger) const override; static const char* kClassName() { return "StringAppendTESTOperator"; } static const char* kNickName() { return "stringappendtest"; } diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index e7963364e13..22b6144af65 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -10,7 +10,7 @@ * * @author Deon Nicholas (dnicholas@fb.com) * Copyright 2013 Facebook, Inc. 
-*/ + */ #include "utilities/merge_operators/string_append/stringappend.h" @@ -27,7 +27,6 @@ #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend2.h" - namespace ROCKSDB_NAMESPACE { // Path to the database on file system @@ -73,18 +72,15 @@ std::shared_ptr OpenTtlDb(const std::string& delim) { /// Supports Append(list, string) and Get(list) class StringLists { public: - - //Constructor: specifies the rocksdb db + // Constructor: specifies the rocksdb db /* implicit */ StringLists(std::shared_ptr db) - : db_(db), - merge_option_(), - get_option_() { + : db_(db), merge_option_(), get_option_() { assert(db); } // Append string val onto the list defined by key; return true on success - bool Append(const std::string& key, const std::string& val){ + bool Append(const std::string& key, const std::string& val) { Slice valSlice(val.data(), val.size()); auto s = db_->Merge(merge_option_, key, valSlice); @@ -97,8 +93,8 @@ class StringLists { } // Returns the list of strings associated with key (or "" if does not exist) - bool Get(const std::string& key, std::string* const result){ - assert(result != nullptr); // we should have a place to store the result + bool Get(const std::string& key, std::string* const result) { + assert(result != nullptr); // we should have a place to store the result auto s = db_->Get(get_option_, key, result); if (s.ok()) { @@ -106,10 +102,10 @@ class StringLists { } // Either key does not exist, or there is some error. - *result = ""; // Always return empty string (just for convention) + *result = ""; // Always return empty string (just for convention) - //NotFound is okay; just return empty (similar to std::map) - //But network or db errors, etc, should fail the test (or at least yell) + // NotFound is okay; just return empty (similar to std::map) + // But network or db errors, etc, should fail the test (or at least yell) if (!s.IsNotFound()) { std::cerr << "ERROR " << s.ToString() << std::endl; } @@ -118,15 +114,12 @@ class StringLists { return false; } - private: std::shared_ptr db_; WriteOptions merge_option_; ReadOptions get_option_; - }; - // The class for unit-testing class StringAppendOperatorTest : public testing::Test, public ::testing::WithParamInterface { @@ -153,14 +146,13 @@ class StringAppendOperatorTest : public testing::Test, // Allows user to open databases with different configurations. // e.g.: Can open a DB or a TtlDB, etc. 
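  // Illustration (hypothetical usage, not quoted from the test file): once an
  // open function has been registered, exercising the StringLists helper from
  // above looks like
  //
  //   auto db = OpenDb(",");        // DB with string-append merge operator
  //   StringLists slists(db);
  //   slists.Append("letters", "a");
  //   slists.Append("letters", "b");
  //   std::string out;
  //   slists.Get("letters", &out);  // out == "a,b"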
- static void SetOpenDbFunction(OpenFuncPtr func) { - OpenDb = func; - } + static void SetOpenDbFunction(OpenFuncPtr func) { OpenDb = func; } protected: static OpenFuncPtr OpenDb; }; -StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = nullptr; +StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb = + nullptr; // THE TEST CASES BEGIN HERE @@ -206,7 +198,6 @@ TEST_P(StringAppendOperatorTest, IteratorTest) { } } - // Should release the snapshot and be aware of the new stuff now it.reset(db_->NewIterator(ReadOptions())); first = true; @@ -236,7 +227,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) { it.reset(db_->NewIterator(ReadOptions())); first = true; std::string k3("k3"); - for(it->Seek(k2); it->Valid(); it->Next()) { + for (it->Seek(k2); it->Valid(); it->Next()) { res = it->value().ToString(); if (first) { ASSERT_EQ(res, "a1,a2,a3,a4"); @@ -245,7 +236,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) { ASSERT_EQ(res, "g1"); } } - for(it->Seek(k3); it->Valid(); it->Next()) { + for (it->Seek(k3); it->Valid(); it->Next()) { res = it->value().ToString(); if (first) { // should not be hit @@ -353,7 +344,7 @@ TEST_P(StringAppendOperatorTest, VariousKeys) { sb = slists.Get("b", &b); sc = slists.Get("c", &c); - ASSERT_TRUE(sa && sb && sc); // All three keys should have been found + ASSERT_TRUE(sa && sb && sc); // All three keys should have been found ASSERT_EQ(a, "x\nt\nr"); ASSERT_EQ(b, "y\n2"); @@ -367,22 +358,23 @@ TEST_P(StringAppendOperatorTest, RandomMixGetAppend) { // Generate a list of random keys and values const int kWordCount = 15; - std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839", - "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89", - "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", + "342839", "dsuha", "mabuais", "sadajsid", + "jf9834hf", "2d9j89", "dj9823jd", "a", + "dk02ed2dh", "$(jd4h984$(*", "mabz"}; const int kKeyCount = 6; - std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki", - "shzassdianmd"}; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", + "keykey", "muki", "shzassdianmd"}; // Will store a local copy of all data in order to verify correctness std::map parallel_copy; // Generate a bunch of random queries (Append and Get)! - enum query_t { APPEND_OP, GET_OP, NUM_OPS }; - Random randomGen(1337); //deterministic seed; always get same results! + enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(1337); // deterministic seed; always get same results! 
const int kNumQueries = 30; - for (int q=0; q 0) { @@ -407,7 +398,6 @@ TEST_P(StringAppendOperatorTest, RandomMixGetAppend) { slists.Get(key, &res); ASSERT_EQ(res, parallel_copy[key]); } - } } @@ -417,32 +407,32 @@ TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) { // Generate a list of random keys and values const int kWordCount = 15; - std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", "342839", - "dsuha", "mabuais", "sadajsid", "jf9834hf", "2d9j89", - "dj9823jd", "a", "dk02ed2dh", "$(jd4h984$(*", "mabz"}; + std::string words[] = {"sdasd", "triejf", "fnjsdfn", "dfjisdfsf", + "342839", "dsuha", "mabuais", "sadajsid", + "jf9834hf", "2d9j89", "dj9823jd", "a", + "dk02ed2dh", "$(jd4h984$(*", "mabz"}; const int kKeyCount = 6; - std::string keys[] = {"dhaiusdhu", "denidw", "daisda", "keykey", "muki", - "shzassdianmd"}; + std::string keys[] = {"dhaiusdhu", "denidw", "daisda", + "keykey", "muki", "shzassdianmd"}; // Will store a local copy of all data in order to verify correctness std::map parallel_copy; // Generate a bunch of random queries (Append and Get)! - enum query_t { APPEND_OP, GET_OP, NUM_OPS }; - Random randomGen(9138204); // deterministic seed + enum query_t { APPEND_OP, GET_OP, NUM_OPS }; + Random randomGen(9138204); // deterministic seed const int kNumQueries = 1000; - for (int q=0; q 0) { @@ -457,7 +447,6 @@ TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) { slists.Get(key, &res); ASSERT_EQ(res, parallel_copy[key]); } - } } @@ -578,7 +567,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) { ASSERT_TRUE(slists.Get("a", &a)); ASSERT_EQ(a, "x\nt\nr"); - //Append, Compact, Get + // Append, Compact, Get slists.Append("c", "bbnagnagsx"); slists.Append("a", "sa"); slists.Append("b", "df"); @@ -629,8 +618,8 @@ TEST_P(StringAppendOperatorTest, SimpleTestNullDelimiter) { ASSERT_TRUE(slists.Get("k1", &res)); // Construct the desired string. Default constructor doesn't like '\0' chars. - std::string checker("v1,v2,v3"); // Verify that the string is right size. - checker[2] = '\0'; // Use null delimiter instead of comma. + std::string checker("v1,v2,v3"); // Verify that the string is right size. + checker[2] = '\0'; // Use null delimiter instead of comma. 
checker[5] = '\0'; ASSERT_EQ(checker.size(), 8); // Verify it is still the correct size diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index 91d6af1f44b..5be2f56411a 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -12,7 +12,7 @@ #include "util/coding.h" #include "utilities/merge_operators.h" -namespace { // anonymous namespace +namespace { // anonymous namespace using ROCKSDB_NAMESPACE::AssociativeMergeOperator; using ROCKSDB_NAMESPACE::InfoLogLevel; @@ -27,7 +27,7 @@ class UInt64AddOperator : public AssociativeMergeOperator { const Slice& value, std::string* new_value, Logger* logger) const override { uint64_t orig_value = 0; - if (existing_value){ + if (existing_value) { orig_value = DecodeInteger(*existing_value, logger); } uint64_t operand = DecodeInteger(value, logger); diff --git a/utilities/options/options_util.cc b/utilities/options/options_util.cc index 9efbf76f02b..00c4b981a65 100644 --- a/utilities/options/options_util.cc +++ b/utilities/options/options_util.cc @@ -57,8 +57,8 @@ Status LoadOptionsFromFile(const ConfigOptions& config_options, return Status::OK(); } -Status GetLatestOptionsFileName(const std::string& dbpath, - Env* env, std::string* options_file_name) { +Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, + std::string* options_file_name) { Status s; std::string latest_file_name; uint64_t latest_time_stamp = 0; diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 1ab420251ae..8ad9bb1b166 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -132,7 +132,7 @@ Status BlockCacheTier::Close() { return Status::OK(); } -template +template void Add(std::map* stats, const std::string& key, const T& t) { stats->insert({key, static_cast(t)}); @@ -148,8 +148,7 @@ PersistentCache::StatsType BlockCacheTier::Stats() { stats_.bytes_read_.Average()); Add(&stats, "persistentcache.blockcachetier.insert_dropped", stats_.insert_dropped_); - Add(&stats, "persistentcache.blockcachetier.cache_hits", - stats_.cache_hits_); + Add(&stats, "persistentcache.blockcachetier.cache_hits", stats_.cache_hits_); Add(&stats, "persistentcache.blockcachetier.cache_misses", stats_.cache_misses_); Add(&stats, "persistentcache.blockcachetier.cache_errors", @@ -326,10 +325,9 @@ Status BlockCacheTier::NewCacheFile() { TEST_SYNC_POINT_CALLBACK("BlockCacheTier::NewCacheFile:DeleteDir", (void*)(GetCachePath().c_str())); - std::unique_ptr f( - new WriteableCacheFile(opt_.env, &buffer_allocator_, &writer_, - GetCachePath(), writer_cache_id_, - opt_.cache_file_size, opt_.log)); + std::unique_ptr f(new WriteableCacheFile( + opt_.env, &buffer_allocator_, &writer_, GetCachePath(), writer_cache_id_, + opt_.cache_file_size, opt_.log)); bool status = f->Create(opt_.enable_direct_writes, opt_.enable_direct_reads); if (!status) { diff --git a/utilities/persistent_cache/block_cache_tier.h b/utilities/persistent_cache/block_cache_tier.h index 74c3095e7b3..1aac287cc0e 100644 --- a/utilities/persistent_cache/block_cache_tier.h +++ b/utilities/persistent_cache/block_cache_tier.h @@ -6,9 +6,9 @@ #ifndef ROCKSDB_LITE -#ifndef OS_WIN +#ifndef OS_WIN #include -#endif // ! OS_WIN +#endif // ! 
OS_WIN #include #include @@ -45,7 +45,8 @@ class BlockCacheTier : public PersistentCacheTier { : opt_(opt), insert_ops_(static_cast(opt_.max_write_pipeline_backlog_size)), buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()), - writer_(this, opt_.writer_qdepth, static_cast(opt_.writer_dispatch_size)) { + writer_(this, opt_.writer_qdepth, + static_cast(opt_.writer_dispatch_size)) { Info(opt_.log, "Initializing allocator. size=%d B count=%" ROCKSDB_PRIszt, opt_.write_buffer_size, opt_.write_buffer_count()); } @@ -147,7 +148,7 @@ class BlockCacheTier : public PersistentCacheTier { ThreadedWriter writer_; // Writer threads BlockCacheTierMetadata metadata_; // Cache meta data manager std::atomic size_{0}; // Size of the cache - Statistics stats_; // Statistics + Statistics stats_; // Statistics }; } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index dbddc823240..f4f8517abe3 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -68,8 +68,7 @@ Status BlockCacheFile::Delete(uint64_t* size) { // <-- 4 --><-- 4 --><-- 4 --><-- 4 --><-- key size --><-- v-size --> // struct CacheRecordHeader { - CacheRecordHeader() - : magic_(0), crc_(0), key_size_(0), val_size_(0) {} + CacheRecordHeader() : magic_(0), crc_(0), key_size_(0), val_size_(0) {} CacheRecordHeader(const uint32_t magic, const uint32_t key_size, const uint32_t val_size) : magic_(magic), crc_(0), key_size_(key_size), val_size_(val_size) {} diff --git a/utilities/persistent_cache/block_cache_tier_file.h b/utilities/persistent_cache/block_cache_tier_file.h index 123f91e6c21..1d265ab74fb 100644 --- a/utilities/persistent_cache/block_cache_tier_file.h +++ b/utilities/persistent_cache/block_cache_tier_file.h @@ -12,19 +12,16 @@ #include #include "file/random_access_file_reader.h" - +#include "port/port.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" - +#include "util/crc32c.h" +#include "util/mutexlock.h" #include "utilities/persistent_cache/block_cache_tier_file_buffer.h" #include "utilities/persistent_cache/lrulist.h" #include "utilities/persistent_cache/persistent_cache_tier.h" #include "utilities/persistent_cache/persistent_cache_util.h" -#include "port/port.h" -#include "util/crc32c.h" -#include "util/mutexlock.h" - // The io code path of persistent cache uses pipelined architecture // // client -> In Queue <-- BlockCacheTier --> Out Queue <-- Writer <--> Kernel diff --git a/utilities/persistent_cache/block_cache_tier_file_buffer.h b/utilities/persistent_cache/block_cache_tier_file_buffer.h index 23013d72011..d4f02455a3a 100644 --- a/utilities/persistent_cache/block_cache_tier_file_buffer.h +++ b/utilities/persistent_cache/block_cache_tier_file_buffer.h @@ -8,8 +8,8 @@ #include #include -#include "rocksdb/comparator.h" #include "memory/arena.h" +#include "rocksdb/comparator.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/persistent_cache/block_cache_tier_metadata.cc b/utilities/persistent_cache/block_cache_tier_metadata.cc index c99322e6b63..d73b5d0b48a 100644 --- a/utilities/persistent_cache/block_cache_tier_metadata.cc +++ b/utilities/persistent_cache/block_cache_tier_metadata.cc @@ -32,8 +32,8 @@ BlockCacheFile* BlockCacheTierMetadata::Evict() { } void BlockCacheTierMetadata::Clear() { - cache_file_index_.Clear([](BlockCacheFile* arg){ delete arg; }); - block_index_.Clear([](BlockInfo* arg){ delete arg; 
}); + cache_file_index_.Clear([](BlockCacheFile* arg) { delete arg; }); + block_index_.Clear([](BlockInfo* arg) { delete arg; }); } BlockInfo* BlockCacheTierMetadata::Insert(const Slice& key, const LBA& lba) { diff --git a/utilities/persistent_cache/block_cache_tier_metadata.h b/utilities/persistent_cache/block_cache_tier_metadata.h index 6735ce290a9..2fcd501056d 100644 --- a/utilities/persistent_cache/block_cache_tier_metadata.h +++ b/utilities/persistent_cache/block_cache_tier_metadata.h @@ -11,7 +11,6 @@ #include #include "rocksdb/slice.h" - #include "utilities/persistent_cache/block_cache_tier_file.h" #include "utilities/persistent_cache/hash_table.h" #include "utilities/persistent_cache/hash_table_evictable.h" diff --git a/utilities/persistent_cache/hash_table.h b/utilities/persistent_cache/hash_table.h index 3d0a1f99394..b00b294ce9d 100644 --- a/utilities/persistent_cache/hash_table.h +++ b/utilities/persistent_cache/hash_table.h @@ -8,6 +8,7 @@ #ifndef ROCKSDB_LITE #include + #include #include diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index a56813afa76..2f6387f5fb3 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -3,7 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // +#include "utilities/persistent_cache/hash_table.h" + #include + #include #include #include @@ -12,7 +15,6 @@ #include "memory/arena.h" #include "test_util/testharness.h" #include "util/random.h" -#include "utilities/persistent_cache/hash_table.h" #include "utilities/persistent_cache/hash_table_evictable.h" #ifndef ROCKSDB_LITE diff --git a/utilities/persistent_cache/persistent_cache_bench.cc b/utilities/persistent_cache/persistent_cache_bench.cc index ea41a83b857..9d6e15d6b68 100644 --- a/utilities/persistent_cache/persistent_cache_bench.cc +++ b/utilities/persistent_cache/persistent_cache_bench.cc @@ -234,7 +234,7 @@ class CacheTierBenchmark { fprintf(stderr, "%s\n", status.ToString().c_str()); } assert(status.ok()); - assert(size == (size_t) FLAGS_iosize); + assert(size == (size_t)FLAGS_iosize); // adjust stats const size_t elapsed_micro = timer.ElapsedNanos() / 1000; diff --git a/utilities/persistent_cache/persistent_cache_test.cc b/utilities/persistent_cache/persistent_cache_test.cc index bacb9f82287..d1b18b68aa2 100644 --- a/utilities/persistent_cache/persistent_cache_test.cc +++ b/utilities/persistent_cache/persistent_cache_test.cc @@ -84,7 +84,8 @@ std::unique_ptr NewBlockCache( Env* env, const std::string& path, const uint64_t max_size = std::numeric_limits::max(), const bool enable_direct_writes = false) { - const uint32_t max_file_size = static_cast(12 * 1024 * 1024 * kStressFactor); + const uint32_t max_file_size = + static_cast(12 * 1024 * 1024 * kStressFactor); auto log = std::make_shared(); PersistentCacheConfig opt(env, path, max_size, log); opt.cache_file_size = max_file_size; @@ -101,7 +102,8 @@ std::unique_ptr NewTieredCache( Env* env, const std::string& path, const uint64_t max_volatile_cache_size, const uint64_t max_block_cache_size = std::numeric_limits::max()) { - const uint32_t max_file_size = static_cast(12 * 1024 * 1024 * kStressFactor); + const uint32_t max_file_size = + static_cast(12 * 1024 * 1024 * kStressFactor); auto log = std::make_shared(); auto opt = PersistentCacheConfig(env, path, max_block_cache_size, log); opt.cache_file_size = max_file_size; @@ -126,13 +128,13 @@ 
PersistentCacheTierTest::PersistentCacheTierTest() TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithFileCreateError) { cache_ = NewBlockCache(Env::Default(), path_, /*size=*/std::numeric_limits::max(), - /*direct_writes=*/ false); + /*direct_writes=*/false); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BlockCacheTier::NewCacheFile:DeleteDir", OnDeleteDir); - RunNegativeInsertTest(/*nthreads=*/ 1, + RunNegativeInsertTest(/*nthreads=*/1, /*max_keys*/ - static_cast(10 * 1024 * kStressFactor)); + static_cast(10 * 1024 * kStressFactor)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } @@ -171,7 +173,8 @@ TEST_F(PersistentCacheTierTest, DISABLED_VolatileCacheInsertWithEviction) { for (auto nthreads : {1, 5}) { for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) { cache_ = std::make_shared( - /*compressed=*/true, /*size=*/static_cast(1 * 1024 * 1024 * kStressFactor)); + /*compressed=*/true, + /*size=*/static_cast(1 * 1024 * 1024 * kStressFactor)); RunInsertTestWithEviction(nthreads, static_cast(max_keys)); } } @@ -197,8 +200,9 @@ TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsert) { TEST_F(PersistentCacheTierTest, DISABLED_BlockCacheInsertWithEviction) { for (auto nthreads : {1, 5}) { for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) { - cache_ = NewBlockCache(Env::Default(), path_, - /*max_size=*/static_cast(200 * 1024 * 1024 * kStressFactor)); + cache_ = NewBlockCache( + Env::Default(), path_, + /*max_size=*/static_cast(200 * 1024 * 1024 * kStressFactor)); RunInsertTestWithEviction(nthreads, static_cast(max_keys)); } } @@ -210,8 +214,9 @@ TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsert) { for (auto nthreads : {1, 5}) { for (auto max_keys : {10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) { - cache_ = NewTieredCache(Env::Default(), path_, - /*memory_size=*/static_cast(1 * 1024 * 1024 * kStressFactor)); + cache_ = NewTieredCache( + Env::Default(), path_, + /*memory_size=*/static_cast(1 * 1024 * 1024 * kStressFactor)); RunInsertTest(nthreads, static_cast(max_keys)); } } @@ -226,7 +231,8 @@ TEST_F(PersistentCacheTierTest, DISABLED_TieredCacheInsertWithEviction) { cache_ = NewTieredCache( Env::Default(), path_, /*memory_size=*/static_cast(1 * 1024 * 1024 * kStressFactor), - /*block_cache_size*/ static_cast(200 * 1024 * 1024 * kStressFactor)); + /*block_cache_size*/ + static_cast(200 * 1024 * 1024 * kStressFactor)); RunInsertTestWithEviction(nthreads, static_cast(max_keys)); } } @@ -291,14 +297,13 @@ PersistentCacheDBTest::PersistentCacheDBTest() void PersistentCacheDBTest::RunTest( const std::function(bool)>& new_pcache, const size_t max_keys = 100 * 1024, const size_t max_usecase = 5) { - // number of insertion interations int num_iter = static_cast(max_keys * kStressFactor); for (size_t iter = 0; iter < max_usecase; iter++) { Options options; options.write_buffer_size = - static_cast(64 * 1024 * kStressFactor); // small write buffer + static_cast(64 * 1024 * kStressFactor); // small write buffer options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options = CurrentOptions(options); diff --git a/utilities/simulator_cache/cache_simulator.cc b/utilities/simulator_cache/cache_simulator.cc index dc419e51ab3..edb75d545c4 100644 --- a/utilities/simulator_cache/cache_simulator.cc +++ b/utilities/simulator_cache/cache_simulator.cc @@ -26,8 +26,8 @@ bool GhostCache::Admit(const Slice& lookup_key) { return true; } // TODO: Should we check for errors here? 
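// (Aside on this hunk's logic.) GhostCache::Admit implements classic
// ghost-cache admission: the first access to a key misses, records the key
// with a nullptr value (only the charge matters) and returns false; a later
// access that still finds the key returns true, i.e. "admit". Under the new
// cache API the shared no-op helper kNoopCacheItemHelper stands in for the
// old nullptr deleter. Hypothetical call sequence:
//
//   ghost_cache.Admit("block-1");  // first sight: recorded, not admitted
//   ghost_cache.Admit("block-1");  // seen before: admitted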
- auto s = sim_cache_->Insert(lookup_key, /*value=*/nullptr, lookup_key.size(), - /*deleter=*/nullptr); + auto s = sim_cache_->Insert(lookup_key, /*obj=*/nullptr, + &kNoopCacheItemHelper, lookup_key.size()); s.PermitUncheckedError(); return false; } @@ -51,9 +51,8 @@ void CacheSimulator::Access(const BlockCacheTraceRecord& access) { } else { if (!access.no_insert && admit && access.block_size > 0) { // Ignore errors on insert - auto s = sim_cache_->Insert(access.block_key, /*value=*/nullptr, - access.block_size, - /*deleter=*/nullptr); + auto s = sim_cache_->Insert(access.block_key, /*obj=*/nullptr, + &kNoopCacheItemHelper, access.block_size); s.PermitUncheckedError(); } } @@ -109,8 +108,8 @@ void PrioritizedCacheSimulator::AccessKVPair( *is_cache_miss = false; } else if (!no_insert && *admitted && value_size > 0) { // TODO: Should we check for an error here? - auto s = sim_cache_->Insert(key, /*value=*/nullptr, value_size, - /*deleter=*/nullptr, + auto s = sim_cache_->Insert(key, /*obj=*/nullptr, &kNoopCacheItemHelper, + value_size, /*handle=*/nullptr, priority); s.PermitUncheckedError(); } @@ -188,10 +187,10 @@ void HybridRowBlockCacheSimulator::Access(const BlockCacheTraceRecord& access) { /*update_metrics=*/true); if (access.referenced_data_size > 0 && inserted == InsertResult::ADMITTED) { // TODO: Should we check for an error here? - auto s = sim_cache_->Insert(row_key, /*value=*/nullptr, - access.referenced_data_size, - /*deleter=*/nullptr, - /*handle=*/nullptr, Cache::Priority::HIGH); + auto s = + sim_cache_->Insert(row_key, /*obj=*/nullptr, &kNoopCacheItemHelper, + access.referenced_data_size, + /*handle=*/nullptr, Cache::Priority::HIGH); s.PermitUncheckedError(); status.row_key_status[row_key] = InsertResult::INSERTED; } diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index a883b52e788..0f0c098710b 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -165,8 +165,8 @@ class SimCacheImpl : public SimCache { } using Cache::Insert; - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), Handle** handle, + Status Insert(const Slice& key, Cache::ObjectPtr value, + const CacheItemHelper* helper, size_t charge, Handle** handle, Priority priority) override { // The handle and value passed in are for real cache, so we pass nullptr // to key_only_cache_ for both instead. Also, the deleter function pointer @@ -176,9 +176,8 @@ class SimCacheImpl : public SimCache { Handle* h = key_only_cache_->Lookup(key); if (h == nullptr) { // TODO: Check for error here? 
- auto s = key_only_cache_->Insert( - key, nullptr, charge, [](const Slice& /*k*/, void* /*v*/) {}, nullptr, - priority); + auto s = key_only_cache_->Insert(key, nullptr, &kNoopCacheItemHelper, + charge, nullptr, priority); s.PermitUncheckedError(); } else { key_only_cache_->Release(h); @@ -188,26 +187,18 @@ class SimCacheImpl : public SimCache { if (!cache_) { return Status::OK(); } - return cache_->Insert(key, value, charge, deleter, handle, priority); + return cache_->Insert(key, value, helper, charge, handle, priority); } - using Cache::Lookup; - Handle* Lookup(const Slice& key, Statistics* stats) override { - Handle* h = key_only_cache_->Lookup(key); - if (h != nullptr) { - key_only_cache_->Release(h); - inc_hit_counter(); - RecordTick(stats, SIM_BLOCK_CACHE_HIT); - } else { - inc_miss_counter(); - RecordTick(stats, SIM_BLOCK_CACHE_MISS); - } - - cache_activity_logger_.ReportLookup(key); + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, bool wait = true, + Statistics* stats = nullptr) override { + HandleLookup(key, stats); if (!cache_) { return nullptr; } - return cache_->Lookup(key, stats); + return cache_->Lookup(key, helper, create_context, priority, wait, stats); } bool Ref(Handle* handle) override { return cache_->Ref(handle); } @@ -222,7 +213,9 @@ class SimCacheImpl : public SimCache { key_only_cache_->Erase(key); } - void* Value(Handle* handle) override { return cache_->Value(handle); } + Cache::ObjectPtr Value(Handle* handle) override { + return cache_->Value(handle); + } uint64_t NewId() override { return cache_->NewId(); } @@ -242,8 +235,8 @@ class SimCacheImpl : public SimCache { return cache_->GetCharge(handle); } - DeleterFn GetDeleter(Handle* handle) const override { - return cache_->GetDeleter(handle); + const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { + return cache_->GetCacheItemHelper(handle); } size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } @@ -253,15 +246,9 @@ class SimCacheImpl : public SimCache { key_only_cache_->DisownData(); } - void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override { - // only apply to _cache since key_only_cache doesn't hold value - cache_->ApplyToAllCacheEntries(callback, thread_safe); - } - void ApplyToAllEntries( - const std::function& callback, + const std::function& callback, const ApplyToAllEntriesOptions& opts) override { cache_->ApplyToAllEntries(callback, opts); } @@ -338,6 +325,19 @@ class SimCacheImpl : public SimCache { miss_times_.fetch_add(1, std::memory_order_relaxed); } void inc_hit_counter() { hit_times_.fetch_add(1, std::memory_order_relaxed); } + + void HandleLookup(const Slice& key, Statistics* stats) { + Handle* h = key_only_cache_->Lookup(key); + if (h != nullptr) { + key_only_cache_->Release(h); + inc_hit_counter(); + RecordTick(stats, SIM_BLOCK_CACHE_HIT); + } else { + inc_miss_counter(); + RecordTick(stats, SIM_BLOCK_CACHE_MISS); + } + cache_activity_logger_.ReportLookup(key); + } }; } // end anonymous namespace diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h index 63ebd3d60cc..2f7dc4f1b63 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.h +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h @@ -41,9 +41,7 @@ class CompactOnDeletionCollector : public TablePropertiesCollector { } // 
EXPERIMENTAL Return whether the output file should be further compacted - virtual bool NeedCompact() const override { - return need_compaction_; - } + virtual bool NeedCompact() const override { return need_compaction_; } static const int kNumBuckets = 128; diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index 97784efe43a..88aeb8d5c92 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -80,10 +80,10 @@ TEST(CompactOnDeletionCollector, DeletionRatio) { } TEST(CompactOnDeletionCollector, SlidingWindow) { - const int kWindowSizes[] = - {1000, 10000, 10000, 127, 128, 129, 255, 256, 257, 2, 10000}; - const int kDeletionTriggers[] = - {500, 9500, 4323, 47, 61, 128, 250, 250, 250, 2, 2}; + const int kWindowSizes[] = {1000, 10000, 10000, 127, 128, 129, + 255, 256, 257, 2, 10000}; + const int kDeletionTriggers[] = {500, 9500, 4323, 47, 61, 128, + 250, 250, 250, 2, 2}; TablePropertiesCollectorFactory::Context context; context.column_family_id = TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; @@ -134,13 +134,13 @@ TEST(CompactOnDeletionCollector, SlidingWindow) { collector->AddUserKey("hello", "rocksdb", kEntryPut, 0, 0)); } } - if (collector->NeedCompact() != - (deletions >= kNumDeletionTrigger) && + if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) && std::abs(deletions - kNumDeletionTrigger) > kBias) { - fprintf(stderr, "[Error] collector->NeedCompact() != (%d >= %d)" + fprintf(stderr, + "[Error] collector->NeedCompact() != (%d >= %d)" " with kWindowSize = %d and kNumDeletionTrigger = %d\n", - deletions, kNumDeletionTrigger, - kWindowSize, kNumDeletionTrigger); + deletions, kNumDeletionTrigger, kWindowSize, + kNumDeletionTrigger); ASSERT_TRUE(false); } ASSERT_OK(collector->Finish(nullptr)); @@ -182,11 +182,11 @@ TEST(CompactOnDeletionCollector, SlidingWindow) { } if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) && std::abs(deletions - kNumDeletionTrigger) > kBias) { - fprintf(stderr, "[Error] collector->NeedCompact() %d != (%d >= %d)" + fprintf(stderr, + "[Error] collector->NeedCompact() %d != (%d >= %d)" " with kWindowSize = %d, kNumDeletionTrigger = %d\n", - collector->NeedCompact(), - deletions, kNumDeletionTrigger, kWindowSize, - kNumDeletionTrigger); + collector->NeedCompact(), deletions, kNumDeletionTrigger, + kWindowSize, kNumDeletionTrigger); ASSERT_TRUE(false); } ASSERT_OK(collector->Finish(nullptr)); @@ -218,7 +218,8 @@ TEST(CompactOnDeletionCollector, SlidingWindow) { } if (collector->NeedCompact() && std::abs(kDeletionsPerSection - kNumDeletionTrigger) > kBias) { - fprintf(stderr, "[Error] collector->NeedCompact() != false" + fprintf(stderr, + "[Error] collector->NeedCompact() != false" " with kWindowSize = %d and kNumDeletionTrigger = %d\n", kWindowSize, kNumDeletionTrigger); ASSERT_TRUE(false); diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc index 1948c81c124..b362a164ddd 100644 --- a/utilities/transactions/lock/point/point_lock_manager.cc +++ b/utilities/transactions/lock/point/point_lock_manager.cc @@ -247,14 +247,14 @@ Status PointLockManager::TryLock(PessimisticTransaction* txn, int64_t timeout = txn->GetLockTimeout(); return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env, - timeout, 
std::move(lock_info)); + timeout, lock_info); } // Helper function for TryLock(). Status PointLockManager::AcquireWithTimeout( PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, ColumnFamilyId column_family_id, const std::string& key, Env* env, - int64_t timeout, LockInfo&& lock_info) { + int64_t timeout, const LockInfo& lock_info) { Status result; uint64_t end_time = 0; @@ -278,7 +278,7 @@ Status PointLockManager::AcquireWithTimeout( // Acquire lock if we are able to uint64_t expire_time_hint = 0; autovector wait_ids; - result = AcquireLocked(lock_map, stripe, key, env, std::move(lock_info), + result = AcquireLocked(lock_map, stripe, key, env, lock_info, &expire_time_hint, &wait_ids); if (!result.ok() && timeout != 0) { @@ -334,14 +334,14 @@ Status PointLockManager::AcquireWithTimeout( } if (result.IsTimedOut()) { - timed_out = true; - // Even though we timed out, we will still make one more attempt to - // acquire lock below (it is possible the lock expired and we - // were never signaled). + timed_out = true; + // Even though we timed out, we will still make one more attempt to + // acquire lock below (it is possible the lock expired and we + // were never signaled). } if (result.ok() || result.IsTimedOut()) { - result = AcquireLocked(lock_map, stripe, key, env, std::move(lock_info), + result = AcquireLocked(lock_map, stripe, key, env, lock_info, &expire_time_hint, &wait_ids); } } while (!result.ok() && !timed_out); @@ -379,8 +379,10 @@ bool PointLockManager::IncrementWaiters( const autovector& wait_ids, const std::string& key, const uint32_t& cf_id, const bool& exclusive, Env* const env) { auto id = txn->GetID(); - std::vector queue_parents(static_cast(txn->GetDeadlockDetectDepth())); - std::vector queue_values(static_cast(txn->GetDeadlockDetectDepth())); + std::vector queue_parents( + static_cast(txn->GetDeadlockDetectDepth())); + std::vector queue_values( + static_cast(txn->GetDeadlockDetectDepth())); std::lock_guard lock(wait_txn_map_mutex_); assert(!wait_txn_map_.Contains(id)); @@ -473,7 +475,7 @@ bool PointLockManager::IncrementWaiters( // REQUIRED: Stripe mutex must be held. 
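// (Aside on the signature change in these hunks.) Taking LockInfo by const
// reference instead of LockInfo&& appears to be a correctness fix, not just
// style: AcquireWithTimeout calls AcquireLocked again from its retry loop
// after a timeout, so the old code's std::move(lock_info) on the first
// attempt left every retry operating on a moved-from LockInfo. Copying once
// at the single stripe->keys.emplace(key, txn_lock_info) site keeps each
// attempt working from intact data:
//
//   // before (buggy pattern): the second call sees moved-from state
//   AcquireLocked(map, stripe, key, env, std::move(info), &t, &ids);
//   AcquireLocked(map, stripe, key, env, std::move(info), &t, &ids);
//   // after: both attempts read the same, still-valid LockInfo
//   AcquireLocked(map, stripe, key, env, info, &t, &ids);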
Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, const std::string& key, Env* env, - LockInfo&& txn_lock_info, + const LockInfo& txn_lock_info, uint64_t* expire_time, autovector* txn_ids) { assert(txn_lock_info.txn_ids.size() == 1); @@ -525,7 +527,7 @@ Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, result = Status::Busy(Status::SubCode::kLockLimit); } else { // acquire lock - stripe->keys.emplace(key, std::move(txn_lock_info)); + stripe->keys.emplace(key, txn_lock_info); // Maintain lock count if there is a limit on the number of locks if (max_num_locks_) { diff --git a/utilities/transactions/lock/point/point_lock_manager.h b/utilities/transactions/lock/point/point_lock_manager.h index 3c6f80dcdf6..eeb34f3bec4 100644 --- a/utilities/transactions/lock/point/point_lock_manager.h +++ b/utilities/transactions/lock/point/point_lock_manager.h @@ -200,11 +200,11 @@ class PointLockManager : public LockManager { Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe, uint32_t column_family_id, const std::string& key, Env* env, int64_t timeout, - LockInfo&& lock_info); + const LockInfo& lock_info); Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe, const std::string& key, Env* env, - LockInfo&& lock_info, uint64_t* wait_time, + const LockInfo& lock_info, uint64_t* wait_time, autovector* txn_ids); void UnLockKey(PessimisticTransaction* txn, const std::string& key, diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc index 45e13832673..bce66c1f360 100644 --- a/utilities/transactions/lock/range/range_locking_test.cc +++ b/utilities/transactions/lock/range/range_locking_test.cc @@ -142,20 +142,18 @@ TEST_F(RangeLockingTest, UpgradeLockAndGetConflict) { auto cf = db->DefaultColumnFamily(); Status s; std::string value; - txn_options.lock_timeout= 10; + txn_options.lock_timeout = 10; Transaction* txn0 = db->BeginTransaction(write_options, txn_options); Transaction* txn1 = db->BeginTransaction(write_options, txn_options); // Get the shared lock in txn0 - s = txn0->GetForUpdate(ReadOptions(), cf, - Slice("a"), &value, - false /*exclusive*/); + s = txn0->GetForUpdate(ReadOptions(), cf, Slice("a"), &value, + false /*exclusive*/); ASSERT_TRUE(s.IsNotFound()); // Get the shared lock on the same key in txn1 - s = txn1->GetForUpdate(ReadOptions(), cf, - Slice("a"), &value, + s = txn1->GetForUpdate(ReadOptions(), cf, Slice("a"), &value, false /*exclusive*/); ASSERT_TRUE(s.IsNotFound()); @@ -170,7 +168,6 @@ TEST_F(RangeLockingTest, UpgradeLockAndGetConflict) { delete txn1; } - TEST_F(RangeLockingTest, SnapshotValidation) { Status s; Slice key_slice = Slice("k"); diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc index 0d99130ae7c..3d6a590c792 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -199,8 +199,7 @@ static bool determine_conflicting_txnids( if (other_txnid == TXNID_SHARED) { // Add all shared lock owners, except this transaction. 
for (TXNID shared_id : *lock.owners) { - if (shared_id != txnid) - conflicts->add(shared_id); + if (shared_id != txnid) conflicts->add(shared_id); } } else { conflicts->add(other_txnid); diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h index 37fde032356..9b83c53511c 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h @@ -129,7 +129,7 @@ static inline tokutime_t toku_time_now(void) { return (uint64_t)hi << 32 | lo; #elif defined(__aarch64__) uint64_t result; - __asm __volatile__("mrs %[rt], cntvct_el0" : [ rt ] "=r"(result)); + __asm __volatile__("mrs %[rt], cntvct_el0" : [rt] "=r"(result)); return result; #elif defined(__powerpc__) return __ppc_get_timebase(); @@ -154,6 +154,10 @@ static inline tokutime_t toku_time_now(void) { uint64_t cycles; asm volatile("rdcycle %0" : "=r"(cycles)); return cycles; +#elif defined(__loongarch64) + unsigned long result; + asm volatile ("rdtime.d\t%0,$r0" : "=r" (result)); + return result; #else #error No timer implementation for this platform #endif diff --git a/utilities/transactions/optimistic_transaction.h b/utilities/transactions/optimistic_transaction.h index c337de2af3c..de23233d573 100644 --- a/utilities/transactions/optimistic_transaction.h +++ b/utilities/transactions/optimistic_transaction.h @@ -18,8 +18,8 @@ #include "rocksdb/snapshot.h" #include "rocksdb/status.h" #include "rocksdb/types.h" -#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "utilities/transactions/transaction_base.h" #include "utilities/transactions/transaction_util.h" diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index 49651be012c..88e86ea4a67 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -6,9 +6,9 @@ #pragma once #ifndef ROCKSDB_LITE +#include #include #include -#include #include "rocksdb/db.h" #include "rocksdb/options.h" diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 1bd9fc9be8d..aa8192c325b 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -53,7 +53,7 @@ class OptimisticTransactionTest Open(); } -private: + private: void Open() { ColumnFamilyOptions cf_options(options); OptimisticTransactionDBOptions occ_opts; @@ -1426,7 +1426,8 @@ TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) { WriteOptions write_options; OptimisticTransactionOptions transaction_options; - Transaction* transaction(txn_db->BeginTransaction(write_options, transaction_options)); + Transaction* transaction( + txn_db->BeginTransaction(write_options, transaction_options)); Status s = transaction->Put("foo", "val"); ASSERT_OK(s); s = transaction->Put("foo2", "val"); diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index c98cfcbf270..83fd94ac855 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -209,8 +209,7 @@ Status TransactionBaseImpl::RollbackToSavePoint() { } Status 
TransactionBaseImpl::PopSavePoint() { - if (save_points_ == nullptr || - save_points_->empty()) { + if (save_points_ == nullptr || save_points_->empty()) { // No SavePoint yet. assert(write_batch_.PopSavePoint().IsNotFound()); return Status::NotFound(); diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 504d692bfb3..1bcb20ca90b 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -93,8 +93,9 @@ class TransactionBaseImpl : public Transaction { std::vector MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values) override { - return MultiGet(options, std::vector( - keys.size(), db_->DefaultColumnFamily()), + return MultiGet(options, + std::vector( + keys.size(), db_->DefaultColumnFamily()), keys, values); } diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 632826def10..d74a4b8b116 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -2490,7 +2490,7 @@ TEST_P(TransactionTest, FlushTest2) { ASSERT_OK(s); ASSERT_EQ("z", value); - delete txn; + delete txn; } } @@ -2967,9 +2967,9 @@ TEST_P(TransactionTest, MultiGetLargeBatchedTest) { std::vector values(keys.size()); std::vector statuses(keys.size()); - wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(), keys.data(), - values.data(), statuses.data(), false); - for (size_t i =0; i < keys.size(); ++i) { + wb.MultiGetFromBatchAndDB(db, snapshot_read_options, handles[1], keys.size(), + keys.data(), values.data(), statuses.data(), false); + for (size_t i = 0; i < keys.size(); ++i) { if (i == 1) { ASSERT_TRUE(statuses[1].IsNotFound()); } else if (i == 2) { @@ -4674,7 +4674,7 @@ TEST_P(TransactionTest, TimeoutTest) { ASSERT_OK(s); TransactionOptions txn_options0; - txn_options0.expiration = 100; // 100ms + txn_options0.expiration = 100; // 100ms txn_options0.lock_timeout = 50; // txn timeout no longer infinite Transaction* txn1 = db->BeginTransaction(write_options, txn_options0); @@ -5619,7 +5619,7 @@ TEST_P(TransactionStressTest, SeqAdvanceTest) { size_t branch = 0; auto seq = db_impl->GetLatestSequenceNumber(); exp_seq = seq; - txn_t0(0); + TestTxn0(0); seq = db_impl->TEST_GetLastVisibleSequence(); ASSERT_EQ(exp_seq, seq); @@ -5637,11 +5637,11 @@ TEST_P(TransactionStressTest, SeqAdvanceTest) { } // Doing it twice might detect some bugs - txn_t0(1); + TestTxn0(1); seq = db_impl->TEST_GetLastVisibleSequence(); ASSERT_EQ(exp_seq, seq); - txn_t1(0); + TestTxn1(0); seq = db_impl->TEST_GetLastVisibleSequence(); ASSERT_EQ(exp_seq, seq); @@ -5658,7 +5658,7 @@ TEST_P(TransactionStressTest, SeqAdvanceTest) { ASSERT_EQ(exp_seq, seq); } - txn_t3(0); + TestTxn3(0); seq = db_impl->TEST_GetLastVisibleSequence(); ASSERT_EQ(exp_seq, seq); @@ -5675,7 +5675,7 @@ TEST_P(TransactionStressTest, SeqAdvanceTest) { ASSERT_EQ(exp_seq, seq); } - txn_t4(0); + TestTxn4(0); seq = db_impl->TEST_GetLastVisibleSequence(); ASSERT_EQ(exp_seq, seq); @@ -5693,7 +5693,7 @@ TEST_P(TransactionStressTest, SeqAdvanceTest) { ASSERT_EQ(exp_seq, seq); } - txn_t2(0); + TestTxn2(0); seq = db_impl->TEST_GetLastVisibleSequence(); ASSERT_EQ(exp_seq, seq); @@ -6530,6 +6530,117 @@ TEST_P(TransactionTest, WriteWithBulkCreatedColumnFamilies) { cf_handles.clear(); } +TEST_P(TransactionTest, LockWal) { + const TxnDBWritePolicy write_policy = std::get<2>(GetParam()); + if (TxnDBWritePolicy::WRITE_COMMITTED != write_policy) { + 
    ROCKSDB_GTEST_BYPASS("Test only write-committed for now");
+    return;
+  }
+  ASSERT_OK(ReOpen());
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"TransactionTest::LockWal:AfterLockWal",
+        "TransactionTest::LockWal:BeforePrepareTxn2"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::unique_ptr<Transaction> txn0;
+  WriteOptions wopts;
+  wopts.no_slowdown = true;
+  txn0.reset(db->BeginTransaction(wopts, TransactionOptions()));
+  ASSERT_OK(txn0->SetName("txn0"));
+  ASSERT_OK(txn0->Put("foo", "v0"));
+
+  std::unique_ptr<Transaction> txn1;
+  txn1.reset(db->BeginTransaction(wopts, TransactionOptions()));
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn1->Put("dummy", "v0"));
+  ASSERT_OK(txn1->Prepare());
+
+  std::unique_ptr<Transaction> txn2;
+  port::Thread worker([&]() {
+    txn2.reset(db->BeginTransaction(WriteOptions(), TransactionOptions()));
+    ASSERT_OK(txn2->SetName("txn2"));
+    ASSERT_OK(txn2->Put("bar", "v0"));
+    TEST_SYNC_POINT("TransactionTest::LockWal:BeforePrepareTxn2");
+    ASSERT_OK(txn2->Prepare());
+    ASSERT_OK(txn2->Commit());
+  });
+  ASSERT_OK(db->LockWAL());
+  // txn0 cannot prepare
+  Status s = txn0->Prepare();
+  ASSERT_TRUE(s.IsIncomplete());
+  // txn1 cannot commit
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsIncomplete());
+
+  TEST_SYNC_POINT("TransactionTest::LockWal:AfterLockWal");
+
+  ASSERT_OK(db->UnlockWAL());
+  txn0.reset();
+
+  txn0.reset(db->BeginTransaction(wopts, TransactionOptions()));
+  ASSERT_OK(txn0->SetName("txn0_1"));
+  ASSERT_OK(txn0->Put("foo", "v1"));
+  ASSERT_OK(txn0->Prepare());
+  ASSERT_OK(txn0->Commit());
+  worker.join();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(TransactionTest, StallTwoWriteQueues) {
+  // There was a two_write_queues bug in which both write thread leaders (for
+  // each queue) would attempt to own the stopping of writes in the primary
+  // write queue. This nearly worked but could lead to some broken assertions
+  // and a kind of deadlock in the test below. (Would resume if someone
+  // eventually signalled bg_cv_ again.)
+  if (!options.two_write_queues) {
+    ROCKSDB_GTEST_BYPASS("Test only needed with two_write_queues");
+    return;
+  }
+
+  // Stop writes
+  ASSERT_OK(db->LockWAL());
+
+  WriteOptions wopts;
+  wopts.sync = true;
+  wopts.disableWAL = false;
+
+  // Create one write thread that blocks in the primary write queue and one
+  // that blocks in the nonmem queue.
+  bool t1_completed = false;
+  bool t2_completed = false;
+  port::Thread t1{[&]() {
+    ASSERT_OK(db->Put(wopts, "x", "y"));
+    t1_completed = true;
+  }};
+  port::Thread t2{[&]() {
+    std::unique_ptr<Transaction> txn0{db->BeginTransaction(wopts, {})};
+    ASSERT_OK(txn0->SetName("xid"));
+    ASSERT_OK(txn0->Prepare());  // nonmem
+    ASSERT_OK(txn0->Commit());
+    t2_completed = true;
+  }};
+
+  // Sleep long enough so that the above threads can usually reach a waiting
+  // point, which usually reveals the deadlock if the bug is present.
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  // Ensure proper test setup
+  ASSERT_FALSE(t1_completed);
+  ASSERT_FALSE(t2_completed);
+
+  // Resume writes
+  ASSERT_OK(db->UnlockWAL());
+
+  // Wait for writes to finish
+  t1.join();
+  t2.join();
+  // Ensure the writes actually completed
+  ASSERT_TRUE(t1_completed);
+  ASSERT_TRUE(t2_completed);
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index 52e8acd444d..0b86453a409 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -223,12 +223,10 @@ class TransactionTestBase : public ::testing::Test {
   std::atomic<size_t> expected_commits = {0};
   // Without Prepare, the commit does not write to WAL
   std::atomic<size_t> with_empty_commits = {0};
-  std::function<void(size_t, Status)> txn_t0_with_status = [&](size_t index,
-                                                               Status exp_s) {
+  void TestTxn0(size_t index) {
     // Test DB's internal txn. It involves no prepare phase nor a commit marker.
-    WriteOptions wopts;
-    auto s = db->Put(wopts, "key" + std::to_string(index), "value");
-    ASSERT_EQ(exp_s, s);
+    auto s = db->Put(WriteOptions(), "key" + std::to_string(index), "value");
+    ASSERT_OK(s);
     if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
       // Consume one seq per key
       exp_seq++;
@@ -241,11 +239,9 @@ class TransactionTestBase : public ::testing::Test {
       }
     }
     with_empty_commits++;
-  };
-  std::function<void(size_t)> txn_t0 = [&](size_t index) {
-    return txn_t0_with_status(index, Status::OK());
-  };
-  std::function<void(size_t)> txn_t1 = [&](size_t index) {
+  }
+
+  void TestTxn1(size_t index) {
     // Testing directly writing a write batch. Functionality-wise it is
     // equivalent to commit without prepare.
     WriteBatch wb;
     auto istr = std::to_string(index);
     ASSERT_OK(wb.Put("k1" + istr, "v1"));
     ASSERT_OK(wb.Put("k2" + istr, "v2"));
     ASSERT_OK(wb.Put("k3" + istr, "v3"));
-    WriteOptions wopts;
-    auto s = db->Write(wopts, &wb);
+    auto s = db->Write(WriteOptions(), &wb);
     if (txn_db_options.write_policy == TxnDBWritePolicy::WRITE_COMMITTED) {
       // Consume one seq per key
       exp_seq += 3;
@@ -268,12 +263,12 @@ class TransactionTestBase : public ::testing::Test {
     }
     ASSERT_OK(s);
     with_empty_commits++;
-  };
-  std::function<void(size_t)> txn_t2 = [&](size_t index) {
+  }
+
+  void TestTxn2(size_t index) {
     // Commit without prepare. It should write to DB without a commit marker.
-    TransactionOptions txn_options;
-    WriteOptions write_options;
-    Transaction* txn = db->BeginTransaction(write_options, txn_options);
+    Transaction* txn =
+        db->BeginTransaction(WriteOptions(), TransactionOptions());
     auto istr = std::to_string(index);
     ASSERT_OK(txn->SetName("xid" + istr));
     ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar")));
@@ -301,12 +296,12 @@ class TransactionTestBase : public ::testing::Test {
     }
     delete txn;
     with_empty_commits++;
-  };
-  std::function<void(size_t)> txn_t3 = [&](size_t index) {
+  }
+
+  void TestTxn3(size_t index) {
     // A full 2pc txn that also involves a commit marker.
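[Note: turning these std::function members into ordinary member functions also changes how tests run them on threads. Instead of copying a std::function object, the SeqAdvanceConcurrent test further below passes a pointer to member, e.g. threads.emplace_back(&TransactionTestBase::TestTxn0, this, bi).]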
- TransactionOptions txn_options; - WriteOptions write_options; - Transaction* txn = db->BeginTransaction(write_options, txn_options); + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); auto istr = std::to_string(index); ASSERT_OK(txn->SetName("xid" + istr)); ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); @@ -334,12 +329,12 @@ class TransactionTestBase : public ::testing::Test { exp_seq++; } delete txn; - }; - std::function txn_t4 = [&](size_t index) { + } + + void TestTxn4(size_t index) { // A full 2pc txn that also involves a commit marker. - TransactionOptions txn_options; - WriteOptions write_options; - Transaction* txn = db->BeginTransaction(write_options, txn_options); + Transaction* txn = + db->BeginTransaction(WriteOptions(), TransactionOptions()); auto istr = std::to_string(index); ASSERT_OK(txn->SetName("xid" + istr)); ASSERT_OK(txn->Put(Slice("foo" + istr), Slice("bar"))); @@ -375,7 +370,7 @@ class TransactionTestBase : public ::testing::Test { } } delete txn; - }; + } // Test that we can change write policy after a clean shutdown (which would // empty the WAL) diff --git a/utilities/transactions/write_committed_transaction_ts_test.cc b/utilities/transactions/write_committed_transaction_ts_test.cc index 2bae5db12c1..94b8201f7ae 100644 --- a/utilities/transactions/write_committed_transaction_ts_test.cc +++ b/utilities/transactions/write_committed_transaction_ts_test.cc @@ -320,6 +320,7 @@ TEST_P(WriteCommittedTxnWithTsTest, Merge) { ColumnFamilyOptions cf_options; cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); const std::string test_cf_name = "test_cf"; ColumnFamilyHandle* cfh = nullptr; assert(db); @@ -338,8 +339,17 @@ TEST_P(WriteCommittedTxnWithTsTest, Merge) { NewTxn(WriteOptions(), TransactionOptions())); assert(txn); ASSERT_OK(txn->Put(handles_[1], "foo", "bar")); - ASSERT_TRUE(txn->Merge(handles_[1], "foo", "1").IsInvalidArgument()); + ASSERT_OK(txn->Merge(handles_[1], "foo", "1")); + ASSERT_OK(txn->SetCommitTimestamp(24)); + ASSERT_OK(txn->Commit()); txn.reset(); + { + std::string value; + const Status s = + GetFromDb(ReadOptions(), handles_[1], "foo", /*ts=*/24, &value); + ASSERT_OK(s); + ASSERT_EQ("bar,1", value); + } } TEST_P(WriteCommittedTxnWithTsTest, GetForUpdate) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 6854360dbb6..6cbb26e9dab 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1684,22 +1684,23 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) { expected_commits = 0; std::vector threads; - linked = 0; + linked.store(0, std::memory_order_release); std::atomic batch_formed(false); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::EnterAsBatchGroupLeader:End", [&](void* /*arg*/) { batch_formed = true; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Wait", [&](void* /*arg*/) { - linked++; - if (linked == 1) { + size_t orig_linked = linked.fetch_add(1, std::memory_order_acq_rel); + if (orig_linked == 0) { // Wait until the others are linked too. 
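[Note: fetch_add returns the counter value from before the increment, so exactly one thread observes 0 here and waits for the rest of the first group. The replaced pattern, linked++ followed by a separate read checking linked == 1, was racy: other threads could increment in between, so zero or several threads might take this branch.]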
- while (linked < first_group_size) { + while (linked.load(std::memory_order_acquire) < first_group_size) { } - } else if (linked == 1 + first_group_size) { + } else if (orig_linked == first_group_size) { // Make the 2nd batch of the rest of writes plus any followup // commits from the first batch - while (linked < txn_cnt + commit_writes) { + while (linked.load(std::memory_order_acquire) < + txn_cnt + commit_writes) { } } // Then we will have one or more batches consisting of follow-up @@ -1713,32 +1714,33 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrent) { size_t d = (n % base[bi + 1]) / base[bi]; switch (d) { case 0: - threads.emplace_back(txn_t0, bi); + threads.emplace_back(&TransactionTestBase::TestTxn0, this, bi); break; case 1: - threads.emplace_back(txn_t1, bi); + threads.emplace_back(&TransactionTestBase::TestTxn1, this, bi); break; case 2: - threads.emplace_back(txn_t2, bi); + threads.emplace_back(&TransactionTestBase::TestTxn2, this, bi); break; case 3: - threads.emplace_back(txn_t3, bi); + threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi); break; case 4: - threads.emplace_back(txn_t3, bi); + threads.emplace_back(&TransactionTestBase::TestTxn3, this, bi); break; default: FAIL(); } // wait to be linked - while (linked.load() <= bi) { + while (linked.load(std::memory_order_acquire) <= bi) { } // after a queue of size first_group_size if (bi + 1 == first_group_size) { while (!batch_formed) { } // to make it more deterministic, wait until the commits are linked - while (linked.load() <= bi + expected_commits) { + while (linked.load(std::memory_order_acquire) <= + bi + expected_commits) { } } } @@ -1792,7 +1794,7 @@ TEST_P(WritePreparedTransactionTest, BasicRecovery) { ASSERT_OK(ReOpen()); WritePreparedTxnDB* wp_db = dynamic_cast(db); - txn_t0(0); + TestTxn0(0); TransactionOptions txn_options; WriteOptions write_options; @@ -1807,7 +1809,7 @@ TEST_P(WritePreparedTransactionTest, BasicRecovery) { ASSERT_OK(s); auto prep_seq_0 = txn0->GetId(); - txn_t1(0); + TestTxn1(0); index++; Transaction* txn1 = db->BeginTransaction(write_options, txn_options); @@ -1820,7 +1822,7 @@ TEST_P(WritePreparedTransactionTest, BasicRecovery) { ASSERT_OK(s); auto prep_seq_1 = txn1->GetId(); - txn_t2(0); + TestTxn2(0); ReadOptions ropt; PinnableSlice pinnable_val; @@ -1856,7 +1858,7 @@ TEST_P(WritePreparedTransactionTest, BasicRecovery) { ASSERT_TRUE(s.IsNotFound()); pinnable_val.Reset(); - txn_t3(0); + TestTxn3(0); // Test that a recovered txns will be properly marked committed for the next // recovery @@ -2120,7 +2122,7 @@ TEST_P(WritePreparedTransactionTest, IsInSnapshot) { seq++; cur_txn = seq; wp_db->AddPrepared(cur_txn); - } else { // else commit it + } else { // else commit it seq++; wp_db->AddCommitted(cur_txn, seq); wp_db->RemovePrepared(cur_txn); @@ -3427,9 +3429,8 @@ TEST_P(WritePreparedTransactionTest, Iterate) { auto* txn = db->BeginTransaction(WriteOptions()); for (int i = 0; i < 2; i++) { - Iterator* iter = (i == 0) - ? db->NewIterator(ReadOptions()) - : txn->GetIterator(ReadOptions()); + Iterator* iter = (i == 0) ? 
db->NewIterator(ReadOptions()) + : txn->GetIterator(ReadOptions()); // Seek iter->Seek("foo"); verify_state(iter, "foo", expected_val); diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 1133f903af0..16b5cc1cbce 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -115,8 +115,8 @@ Status WritePreparedTxn::PrepareInternal() { // For each duplicate key we account for a new sub-batch prepare_batch_cnt_ = GetWriteBatch()->SubBatchCnt(); // Having AddPrepared in the PreReleaseCallback allows in-order addition of - // prepared entries to PreparedHeap and hence enables an optimization. Refer to - // SmallestUnCommittedSeq for more details. + // prepared entries to PreparedHeap and hence enables an optimization. Refer + // to SmallestUnCommittedSeq for more details. AddPreparedCallback add_prepared_callback( wpt_db_, db_impl_, prepare_batch_cnt_, db_impl_->immutable_db_options().two_write_queues, kFirstPrepareBatch); diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index c6661479a93..595c3df8f52 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -309,7 +309,6 @@ void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) { handle_map_.reset(handle_map); } - std::vector WritePreparedTxnDB::MultiGet( const ReadOptions& options, const std::vector& column_family, @@ -608,7 +607,8 @@ void WritePreparedTxnDB::RemovePrepared(const uint64_t prepare_seq, bool WritePreparedTxnDB::GetCommitEntry(const uint64_t indexed_seq, CommitEntry64b* entry_64b, CommitEntry* entry) const { - *entry_64b = commit_cache_[static_cast(indexed_seq)].load(std::memory_order_acquire); + *entry_64b = commit_cache_[static_cast(indexed_seq)].load( + std::memory_order_acquire); bool valid = entry_64b->Parse(indexed_seq, entry, FORMAT); return valid; } @@ -617,8 +617,9 @@ bool WritePreparedTxnDB::AddCommitEntry(const uint64_t indexed_seq, const CommitEntry& new_entry, CommitEntry* evicted_entry) { CommitEntry64b new_entry_64b(new_entry, FORMAT); - CommitEntry64b evicted_entry_64b = commit_cache_[static_cast(indexed_seq)].exchange( - new_entry_64b, std::memory_order_acq_rel); + CommitEntry64b evicted_entry_64b = + commit_cache_[static_cast(indexed_seq)].exchange( + new_entry_64b, std::memory_order_acq_rel); bool valid = evicted_entry_64b.Parse(indexed_seq, evicted_entry, FORMAT); return valid; } diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index 771ff555367..6c8c62e0e04 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -28,7 +28,7 @@ class WriteUnpreparedTransactionTest WriteUnpreparedTransactionTest() : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam())){} + std::get<2>(GetParam())) {} }; INSTANTIATE_TEST_CASE_P( diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 72a21755a94..2ed2d5c59bb 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include "utilities/transactions/write_unprepared_txn_db.h" + #include "db/arena_wrapped_db_iter.h" #include 
"rocksdb/utilities/transaction_db.h" #include "util/cast_util.h" diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 1c2c6daa1fd..3bfc66649a7 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -68,6 +68,7 @@ bool TtlMergeOperator::FullMergeV2(const MergeOperationInput& merge_in, merge_in.logger), &user_merge_out); } + merge_out->op_failure_scope = user_merge_out.op_failure_scope; // Return false if the user merge operator returned false if (!good) { @@ -595,14 +596,13 @@ Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts, return new TtlIterator(db_->NewIterator(opts, column_family)); } -void DBWithTTLImpl::SetTtl(ColumnFamilyHandle *h, int32_t ttl) { +void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) { std::shared_ptr filter; Options opts; opts = GetOptions(h); filter = std::static_pointer_cast( - opts.compaction_filter_factory); - if (!filter) - return; + opts.compaction_filter_factory); + if (!filter) return; filter->SetTtl(ttl); } diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 7c43501a43f..dd67a6ddc36 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -103,7 +103,7 @@ class DBWithTTLImpl : public DBWithTTL { void SetTtl(int32_t ttl) override { SetTtl(DefaultColumnFamily(), ttl); } - void SetTtl(ColumnFamilyHandle *h, int32_t ttl) override; + void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override; private: // remember whether the Close completes or not @@ -111,7 +111,6 @@ class DBWithTTLImpl : public DBWithTTL { }; class TtlIterator : public Iterator { - public: explicit TtlIterator(Iterator* iter) : iter_(iter) { assert(iter_); } @@ -189,9 +188,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory { std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override; - void SetTtl(int32_t ttl) { - ttl_ = ttl; - } + void SetTtl(int32_t ttl) { ttl_ = ttl; } const char* Name() const override { return kClassName(); } static const char* kClassName() { return "TtlCompactionFilterFactory"; } @@ -209,7 +206,6 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory { }; class TtlMergeOperator : public MergeOperator { - public: explicit TtlMergeOperator(const std::shared_ptr& merge_op, SystemClock* clock); diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index e32c82a9d9d..a42e0acb4ea 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -28,7 +28,7 @@ namespace { using KVMap = std::map; enum BatchOperation { OP_PUT = 0, OP_DELETE = 1 }; -} +} // namespace class SpecialTimeEnv : public EnvWrapper { public: @@ -81,8 +81,8 @@ class TtlTest : public testing::Test { // Open with TestFilter compaction filter void OpenTtlWithTestCompaction(int32_t ttl) { options_.compaction_filter_factory = - std::shared_ptr( - new TestFilterFactory(kSampleSize_, kNewValue_)); + std::shared_ptr( + new TestFilterFactory(kSampleSize_, kNewValue_)); OpenTtl(ttl); } @@ -121,7 +121,7 @@ class TtlTest : public testing::Test { if (i % 10 == 0) { digits_in_i++; } - for(int j = digits_in_i; j < digits; j++) { + for (int j = digits_in_i; j < digits; j++) { key.append("0"); value.append("0"); } @@ -210,16 +210,19 @@ class TtlTest : public testing::Test { static ReadOptions ropts; bool value_found; std::string val; - for(auto &kv : kvmap_) { + for (auto& kv : kvmap_) { bool ret = db_ttl_->KeyMayExist(ropts, kv.first, &val, &value_found); if (ret == false || value_found == false) { - fprintf(stderr, 
"KeyMayExist could not find key=%s in the database but" - " should have\n", kv.first.c_str()); + fprintf(stderr, + "KeyMayExist could not find key=%s in the database but" + " should have\n", + kv.first.c_str()); FAIL(); } else if (val.compare(kv.second) != 0) { - fprintf(stderr, " value for key=%s present in database is %s but" - " should be %s\n", kv.first.c_str(), val.c_str(), - kv.second.c_str()); + fprintf(stderr, + " value for key=%s present in database is %s but" + " should be %s\n", + kv.first.c_str(), val.c_str(), kv.second.c_str()); FAIL(); } } @@ -263,17 +266,19 @@ class TtlTest : public testing::Test { } FAIL(); } else if (s.ok()) { - if (test_compaction_change && v.compare(kNewValue_) != 0) { - fprintf(stderr, " value for key=%s present in database is %s but " - " should be %s\n", kv_it_->first.c_str(), v.c_str(), - kNewValue_.c_str()); - FAIL(); - } else if (!test_compaction_change && v.compare(kv_it_->second) !=0) { - fprintf(stderr, " value for key=%s present in database is %s but " - " should be %s\n", kv_it_->first.c_str(), v.c_str(), - kv_it_->second.c_str()); - FAIL(); - } + if (test_compaction_change && v.compare(kNewValue_) != 0) { + fprintf(stderr, + " value for key=%s present in database is %s but " + " should be %s\n", + kv_it_->first.c_str(), v.c_str(), kNewValue_.c_str()); + FAIL(); + } else if (!test_compaction_change && v.compare(kv_it_->second) != 0) { + fprintf(stderr, + " value for key=%s present in database is %s but " + " should be %s\n", + kv_it_->first.c_str(), v.c_str(), kv_it_->second.c_str()); + FAIL(); + } } } } @@ -299,7 +304,7 @@ class TtlTest : public testing::Test { env_->Sleep(slp); ASSERT_OK(ManualCompact()); static ReadOptions ropts; - Iterator *dbiter = db_ttl_->NewIterator(ropts); + Iterator* dbiter = db_ttl_->NewIterator(ropts); kv_it_ = kvmap_.begin(); advance(kv_it_, st_pos); @@ -329,9 +334,7 @@ class TtlTest : public testing::Test { class TestFilter : public CompactionFilter { public: TestFilter(const int64_t kSampleSize, const std::string& kNewValue) - : kSampleSize_(kSampleSize), - kNewValue_(kNewValue) { - } + : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {} // Works on keys of the form "key" // Drops key if number at the end of key is in [0, kSampleSize_/3), @@ -355,7 +358,7 @@ class TtlTest : public testing::Test { #endif } else { - return false; // Keep keys not matching the format "key" + return false; // Keep keys not matching the format "key" } int64_t partition = kSampleSize_ / 3; @@ -378,26 +381,23 @@ class TtlTest : public testing::Test { }; class TestFilterFactory : public CompactionFilterFactory { - public: - TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue) - : kSampleSize_(kSampleSize), - kNewValue_(kNewValue) { - } + public: + TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue) + : kSampleSize_(kSampleSize), kNewValue_(kNewValue) {} - std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& /*context*/) override { - return std::unique_ptr( - new TestFilter(kSampleSize_, kNewValue_)); - } + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { + return std::unique_ptr( + new TestFilter(kSampleSize_, kNewValue_)); + } - const char* Name() const override { return "TestFilterFactory"; } + const char* Name() const override { return "TestFilterFactory"; } - private: - const int64_t kSampleSize_; - const std::string kNewValue_; + private: + const int64_t kSampleSize_; + const std::string kNewValue_; }; - // Choose 
carefully so that Put, Gets & Compaction complete in 1 second buffer static const int64_t kSampleSize_ = 100; std::string dbname_; @@ -410,7 +410,7 @@ class TtlTest : public testing::Test { KVMap::iterator kv_it_; const std::string kNewValue_ = "new_value"; std::unique_ptr test_comp_filter_; -}; // class TtlTest +}; // class TtlTest // If TTL is non positive or not provided, the behaviour is TTL = infinity // This test opens the db 3 times with such default behavior and inserts a @@ -422,18 +422,18 @@ TEST_F(TtlTest, NoEffect) { int64_t boundary2 = 2 * boundary1; OpenTtl(); - PutValues(0, boundary1); //T=0: Set1 never deleted - SleepCompactCheck(1, 0, boundary1); //T=1: Set1 still there + PutValues(0, boundary1); // T=0: Set1 never deleted + SleepCompactCheck(1, 0, boundary1); // T=1: Set1 still there CloseTtl(); OpenTtl(0); - PutValues(boundary1, boundary2 - boundary1); //T=1: Set2 never deleted - SleepCompactCheck(1, 0, boundary2); //T=2: Sets1 & 2 still there + PutValues(boundary1, boundary2 - boundary1); // T=1: Set2 never deleted + SleepCompactCheck(1, 0, boundary2); // T=2: Sets1 & 2 still there CloseTtl(); OpenTtl(-1); - PutValues(boundary2, kSampleSize_ - boundary2); //T=3: Set3 never deleted - SleepCompactCheck(1, 0, kSampleSize_, true); //T=4: Sets 1,2,3 still there + PutValues(boundary2, kSampleSize_ - boundary2); // T=3: Set3 never deleted + SleepCompactCheck(1, 0, kSampleSize_, true); // T=4: Sets 1,2,3 still there CloseTtl(); } @@ -464,9 +464,10 @@ TEST_F(TtlTest, DestructWithoutClose) { TEST_F(TtlTest, PresentDuringTTL) { MakeKVMap(kSampleSize_); - OpenTtl(2); // T=0:Open the db with ttl = 2 - PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 - SleepCompactCheck(1, 0, kSampleSize_, true); // T=1:Set1 should still be there + OpenTtl(2); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_, + true); // T=1:Set1 should still be there CloseTtl(); } @@ -474,9 +475,9 @@ TEST_F(TtlTest, PresentDuringTTL) { TEST_F(TtlTest, AbsentAfterTTL) { MakeKVMap(kSampleSize_); - OpenTtl(1); // T=0:Open the db with ttl = 2 - PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 - SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there + OpenTtl(1); // T=0:Open the db with ttl = 2 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2:Set1 should not be there CloseTtl(); } @@ -486,10 +487,10 @@ TEST_F(TtlTest, ResetTimestamp) { MakeKVMap(kSampleSize_); OpenTtl(3); - PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3 - env_->Sleep(2); // T=2 - PutValues(0, kSampleSize_); // T=2: Insert Set1. Delete at t=5 - SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there + PutValues(0, kSampleSize_); // T=0: Insert Set1. Delete at t=3 + env_->Sleep(2); // T=2 + PutValues(0, kSampleSize_); // T=2: Insert Set1. Delete at t=5 + SleepCompactCheck(2, 0, kSampleSize_); // T=4: Set1 should still be there CloseTtl(); } @@ -508,8 +509,8 @@ TEST_F(TtlTest, IterAbsentAfterTTL) { MakeKVMap(kSampleSize_); OpenTtl(1); - PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 - SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there + PutValues(0, kSampleSize_); // T=0: Insert. 
Delete at t=1 + SleepCompactCheckIter(2, 0, kSampleSize_, false); // T=2: Should not be there CloseTtl(); } @@ -519,11 +520,11 @@ TEST_F(TtlTest, MultiOpenSamePresent) { MakeKVMap(kSampleSize_); OpenTtl(2); - PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=2 CloseTtl(); - OpenTtl(2); // T=0. Delete at t=2 - SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there + OpenTtl(2); // T=0. Delete at t=2 + SleepCompactCheck(1, 0, kSampleSize_); // T=1: Set should be there CloseTtl(); } @@ -533,11 +534,11 @@ TEST_F(TtlTest, MultiOpenSameAbsent) { MakeKVMap(kSampleSize_); OpenTtl(1); - PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 CloseTtl(); - OpenTtl(1); // T=0.Delete at t=1 - SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there + OpenTtl(1); // T=0.Delete at t=1 + SleepCompactCheck(2, 0, kSampleSize_, false); // T=2: Set should not be there CloseTtl(); } @@ -546,11 +547,11 @@ TEST_F(TtlTest, MultiOpenDifferent) { MakeKVMap(kSampleSize_); OpenTtl(1); - PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 + PutValues(0, kSampleSize_); // T=0: Insert. Delete at t=1 CloseTtl(); - OpenTtl(3); // T=0: Set deleted at t=3 - SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there + OpenTtl(3); // T=0: Set deleted at t=3 + SleepCompactCheck(2, 0, kSampleSize_); // T=2: Set should be there CloseTtl(); } @@ -558,8 +559,8 @@ TEST_F(TtlTest, MultiOpenDifferent) { TEST_F(TtlTest, ReadOnlyPresentForever) { MakeKVMap(kSampleSize_); - OpenTtl(1); // T=0:Open the db normally - PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + OpenTtl(1); // T=0:Open the db normally + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 CloseTtl(); OpenReadOnlyTtl(1); @@ -597,17 +598,17 @@ TEST_F(TtlTest, CompactionFilter) { MakeKVMap(kSampleSize_); OpenTtlWithTestCompaction(1); - PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 + PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=1 // T=2: TTL logic takes precedence over TestFilter:-Set1 should not be there SleepCompactCheck(2, 0, kSampleSize_, false); CloseTtl(); OpenTtlWithTestCompaction(3); - PutValues(0, kSampleSize_); // T=0:Insert Set1. + PutValues(0, kSampleSize_); // T=0:Insert Set1. int64_t partition = kSampleSize_ / 3; - SleepCompactCheck(1, 0, partition, false); // Part dropped - SleepCompactCheck(0, partition, partition); // Part kept - SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept + SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed CloseTtl(); } @@ -696,10 +697,10 @@ TEST_F(TtlTest, ColumnFamiliesTest) { TEST_F(TtlTest, ChangeTtlOnOpenDb) { MakeKVMap(kSampleSize_); - OpenTtl(1); // T=0:Open the db with ttl = 2 + OpenTtl(1); // T=0:Open the db with ttl = 2 SetTtl(3); - PutValues(0, kSampleSize_); // T=0:Insert Set1. Delete at t=2 - SleepCompactCheck(2, 0, kSampleSize_, true); // T=2:Set1 should be there + PutValues(0, kSampleSize_); // T=0:Insert Set1. 
Delete at t=2 + SleepCompactCheck(2, 0, kSampleSize_, true); // T=2:Set1 should be there CloseTtl(); } diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index f43630a48eb..408243b3fff 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -163,7 +163,7 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) { auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); auto* index_entry = new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id, - key.data() - wb_data.data(), key.size()); + key.data() - wb_data.data(), key.size()); skip_list.Insert(index_entry); } @@ -207,8 +207,8 @@ Status WriteBatchWithIndex::Rep::ReBuildIndex() { // set offset of current entry for call to AddNewEntry() last_entry_offset = input.data() - write_batch.Data().data(); - s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, - &value, &blob, &xid); + s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key, &value, + &blob, &xid); if (!s.ok()) { break; } diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/utilities/write_batch_with_index/write_batch_with_index_internal.cc index 7ff6fbfafc2..5ae4df7dd05 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -664,22 +664,34 @@ Status WriteBatchWithIndexInternal::MergeKey(const Slice& key, Statistics* statistics = immutable_db_options.statistics.get(); Logger* logger = immutable_db_options.info_log.get(); SystemClock* clock = immutable_db_options.clock; - return MergeHelper::TimedFullMerge(merge_operator, key, value, - context.GetOperands(), result, logger, - statistics, clock); + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + return MergeHelper::TimedFullMerge( + merge_operator, key, value, context.GetOperands(), result, logger, + statistics, clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ false, + /* op_failure_scope */ nullptr); } else if (db_options_ != nullptr) { Statistics* statistics = db_options_->statistics.get(); Env* env = db_options_->env; Logger* logger = db_options_->info_log.get(); SystemClock* clock = env->GetSystemClock().get(); - return MergeHelper::TimedFullMerge(merge_operator, key, value, - context.GetOperands(), result, logger, - statistics, clock); + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + return MergeHelper::TimedFullMerge( + merge_operator, key, value, context.GetOperands(), result, logger, + statistics, clock, /* result_operand */ nullptr, + /* update_num_ops_stats */ false, + /* op_failure_scope */ nullptr); } else { const auto cf_opts = cfh->cfd()->ioptions(); + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. 
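[Note: passing nullptr here discards only the scope detail; the merge failure itself still reaches the caller through the returned Status, per the comment above. Contrast this with the TtlMergeOperator hunk earlier in this diff, which forwards merge_out->op_failure_scope so that the wrapped operator's failure scope survives the TTL wrapper.]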
return MergeHelper::TimedFullMerge( merge_operator, key, value, context.GetOperands(), result, - cf_opts->logger, cf_opts->stats, cf_opts->clock); + cf_opts->logger, cf_opts->stats, cf_opts->clock, + /* result_operand */ nullptr, /* update_num_ops_stats */ false, + /* op_failure_scope */ nullptr); } } else { return Status::InvalidArgument("Must provide a column_family"); diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 50d64963a0a..350dcc881e0 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -237,7 +237,7 @@ void AssertIterEqual(WBWIIteratorImpl* wbwii, } ASSERT_FALSE(wbwii->Valid()); } -} // namespace anonymous +} // namespace class WBWIBaseTest : public testing::Test { public: @@ -512,14 +512,10 @@ void TestValueAsSecondaryIndexHelper(std::vector entries, TEST_F(WBWIKeepTest, TestValueAsSecondaryIndex) { Entry entries[] = { - {"aaa", "0005", kPutRecord}, - {"b", "0002", kPutRecord}, - {"cdd", "0002", kMergeRecord}, - {"aab", "00001", kPutRecord}, - {"cc", "00005", kPutRecord}, - {"cdd", "0002", kPutRecord}, - {"aab", "0003", kPutRecord}, - {"cc", "00005", kDeleteRecord}, + {"aaa", "0005", kPutRecord}, {"b", "0002", kPutRecord}, + {"cdd", "0002", kMergeRecord}, {"aab", "00001", kPutRecord}, + {"cc", "00005", kPutRecord}, {"cdd", "0002", kPutRecord}, + {"aab", "0003", kPutRecord}, {"cc", "00005", kDeleteRecord}, }; std::vector entries_list(entries, entries + 8); @@ -531,14 +527,10 @@ TEST_F(WBWIKeepTest, TestValueAsSecondaryIndex) { batch_->Clear(); Entry new_entries[] = { - {"aaa", "0005", kPutRecord}, - {"e", "0002", kPutRecord}, - {"add", "0002", kMergeRecord}, - {"aab", "00001", kPutRecord}, - {"zz", "00005", kPutRecord}, - {"add", "0002", kPutRecord}, - {"aab", "0003", kPutRecord}, - {"zz", "00005", kDeleteRecord}, + {"aaa", "0005", kPutRecord}, {"e", "0002", kPutRecord}, + {"add", "0002", kMergeRecord}, {"aab", "00001", kPutRecord}, + {"zz", "00005", kPutRecord}, {"add", "0002", kPutRecord}, + {"aab", "0003", kPutRecord}, {"zz", "00005", kDeleteRecord}, }; entries_list = std::vector(new_entries, new_entries + 8);
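[Note: for readers new to the class these last hunks touch, here is a minimal sketch of the read-your-own-writes flow that WriteBatchWithIndex provides and that these tests exercise. It is not part of the patch; the open db handle is assumed and error handling is reduced to asserts.]

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/utilities/write_batch_with_index.h"

using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::ReadOptions;
using ROCKSDB_NAMESPACE::Status;
using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
using ROCKSDB_NAMESPACE::WriteOptions;

void ReadYourOwnWritesSketch(DB* db) {
  WriteBatchWithIndex wbwi;
  // Buffered and indexed, but not yet applied to the DB:
  assert(wbwi.Put("aaa", "0005").ok());
  assert(wbwi.Delete("cc").ok());

  std::string value;
  // Reads overlay the pending batch on top of the current DB state.
  Status s = wbwi.GetFromBatchAndDB(db, ReadOptions(), "aaa", &value);
  assert(s.ok() && value == "0005");
  s = wbwi.GetFromBatchAndDB(db, ReadOptions(), "cc", &value);
  assert(s.IsNotFound());  // the buffered Delete hides any committed value

  // Atomically apply everything that was buffered.
  assert(db->Write(WriteOptions(), wbwi.GetWriteBatch()).ok());
}
```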