From 64b497f7d91587136f38092af13ba73a37f0d1c7 Mon Sep 17 00:00:00 2001 From: Vladimir Petrzhikovskii Date: Thu, 21 Nov 2024 16:51:36 +0100 Subject: [PATCH] chore(storage): rocksdb update --- Cargo.lock | 27 +++++------ Cargo.toml | 7 ++- scripts/gen-dashboard.py | 3 ++ storage/src/db/kv_db/mod.rs | 26 +++++++++- storage/src/db/kv_db/tables.rs | 47 ++++++++++++++----- storage/src/lib.rs | 2 - storage/src/store/shard_state/cell_storage.rs | 1 + 7 files changed, 80 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6eb57781a..e380c7a92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -306,7 +306,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "lazy_static", "lazycell", "log", @@ -1507,7 +1507,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.5", ] [[package]] @@ -1522,9 +1522,8 @@ dependencies = [ [[package]] name = "librocksdb-sys" -version = "0.16.0+8.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce3d60bc059831dc1c83903fb45c103f75db65c5a7bf22272764d9cc683e348c" +version = "0.17.1+9.7.4" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git?rev=95d01b183cdb45f80a470cadfc030a545ab66156#95d01b183cdb45f80a470cadfc030a545ab66156" dependencies = [ "bindgen", "bzip2-sys", @@ -2491,8 +2490,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bd13e55d6d7b8cd0ea569161127567cd587676c99f4472f779a0279aa60a7a7" +source = "git+https://github.com/rust-rocksdb/rust-rocksdb.git?rev=95d01b183cdb45f80a470cadfc030a545ab66156#95d01b183cdb45f80a470cadfc030a545ab66156" dependencies = [ "libc", "librocksdb-sys", @@ -3002,9 +3000,9 @@ dependencies = [ [[package]] name = "tikv-jemalloc-ctl" -version = 
"0.5.4" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c" +checksum = "f21f216790c8df74ce3ab25b534e0718da5a1916719771d3fec23315c99e468b" dependencies = [ "libc", "paste", @@ -3013,9 +3011,9 @@ dependencies = [ [[package]] name = "tikv-jemalloc-sys" -version = "0.5.4+5.3.0-patched" +version = "0.6.0+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +checksum = "cd3c60906412afa9c2b5b5a48ca6a5abe5736aec9eb48ad05037a677e52e4e2d" dependencies = [ "cc", "libc", @@ -3023,9 +3021,9 @@ dependencies = [ [[package]] name = "tikv-jemallocator" -version = "0.5.4" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" +checksum = "4cec5ff18518d81584f477e9bfdf957f5bb0979b0bac3af4ca30b5b3ae2d2865" dependencies = [ "libc", "tikv-jemalloc-sys", @@ -4028,8 +4026,7 @@ dependencies = [ [[package]] name = "weedb" version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c65a1543619bd05013dfe6092797d2c3b16aa7b41862d588735ff7def453ce04" +source = "git+https://github.com/broxus/weedb.git?branch=next-rocksdb#be76187ed31348144bdab3e113ad7de114d99ac6" dependencies = [ "librocksdb-sys", "metrics", diff --git a/Cargo.toml b/Cargo.toml index 398602dac..6f4d6aa62 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,11 +97,11 @@ tarpc = { version = "0.34", features = [ ] } tempfile = "3.10" thiserror = "1.0" -tikv-jemallocator = { version = "0.5", features = [ +tikv-jemallocator = { version = "0.6.0", features = [ "unprefixed_malloc_on_supported_platforms", "background_threads", ] } -tikv-jemalloc-ctl = { version = "0.5" } +tikv-jemalloc-ctl = { version = "0.6.0", features = ["stats"] } tl-proto = "0.4" 
tokio = { version = "1", default-features = false } tokio-stream = "0.1.15" @@ -134,6 +134,9 @@ tycho-rpc = { path = "./rpc", version = "0.1.4" } tycho-storage = { path = "./storage", version = "0.1.4" } tycho-util = { path = "./util", version = "0.1.4" } +[patch.crates-io] +weedb = { version = "0.3.8", git = "https://github.com/broxus/weedb.git", branch = "next-rocksdb" } + [workspace.lints.rust] future_incompatible = "warn" nonstandard_style = "warn" diff --git a/scripts/gen-dashboard.py b/scripts/gen-dashboard.py index af5109f1a..1224c2ac9 100644 --- a/scripts/gen-dashboard.py +++ b/scripts/gen-dashboard.py @@ -779,6 +779,9 @@ def storage() -> RowPanel: create_heatmap_panel( "tycho_storage_cell_in_mem_store_time", "Time to store cell without write" ), + create_heatmap_panel( + "tycho_storage_batch_write_time", "Time to write merge in write batch" + ), create_heatmap_panel( "tycho_storage_state_store_time", "Time to store state with cell traversal" ), diff --git a/storage/src/db/kv_db/mod.rs b/storage/src/db/kv_db/mod.rs index a8d032abe..095ca3320 100644 --- a/storage/src/db/kv_db/mod.rs +++ b/storage/src/db/kv_db/mod.rs @@ -103,7 +103,7 @@ impl BaseDbExt for BaseDb { impl WithMigrations for BaseDb { const NAME: &'static str = "base"; - const VERSION: Semver = [0, 0, 2]; + const VERSION: Semver = [0, 0, 3]; fn register_migrations( migrations: &mut Migrations, @@ -111,7 +111,10 @@ impl WithMigrations for BaseDb { ) -> Result<(), MigrationError> { migrations.register([0, 0, 1], [0, 0, 2], move |db| { base_migrations::v0_0_1_to_0_0_2(db, cancelled.clone()) - }) + })?; + migrations.register([0, 0, 2], [0, 0, 3], base_migrations::v_0_0_2_to_v_0_0_3)?; + + Ok(()) } } @@ -139,6 +142,7 @@ mod base_migrations { use everscale_types::boc::Boc; use tycho_block_util::archive::ArchiveEntryType; + use weedb::rocksdb::CompactOptions; use super::*; use crate::util::StoredValue; @@ -192,6 +196,24 @@ mod base_migrations { ); Ok(()) } + + pub fn v_0_0_2_to_v_0_0_3(db: &BaseDb) -> 
Result<(), MigrationError> { + let mut opts = CompactOptions::default(); + opts.set_exclusive_manual_compaction(true); + let null = Option::<&[u8]>::None; + + let started_at = Instant::now(); + tracing::info!("started cells compaction"); + db.cells + .db() + .compact_range_cf_opt(&db.cells.cf(), null, null, &opts); + tracing::info!( + elapsed = %humantime::format_duration(started_at.elapsed()), + "finished cells compaction" + ); + + Ok(()) + } } // === RPC DB === diff --git a/storage/src/db/kv_db/tables.rs b/storage/src/db/kv_db/tables.rs index 4c48a51bb..d426f0f40 100644 --- a/storage/src/db/kv_db/tables.rs +++ b/storage/src/db/kv_db/tables.rs @@ -1,7 +1,7 @@ use bytesize::ByteSize; use weedb::rocksdb::{ - BlockBasedIndexType, BlockBasedOptions, DBCompressionType, DataBlockIndexType, MergeOperands, - Options, ReadOptions, SliceTransform, + BlockBasedIndexType, BlockBasedOptions, CompactionPri, DBCompressionType, DataBlockIndexType, + MemtableFactory, MergeOperands, Options, ReadOptions, SliceTransform, }; use weedb::{rocksdb, Caches, ColumnFamily, ColumnFamilyOptions}; @@ -232,12 +232,25 @@ impl ColumnFamilyOptions for Cells { opts.set_compaction_filter("cell_compaction", refcount::compaction_filter); // optimize for bulk inserts and single writer - opts.set_max_write_buffer_number(6); // 6 * 512MB = 3GB for write buffers + opts.set_max_write_buffer_number(8); // 8 * 512MB = 4GB opts.set_min_write_buffer_number_to_merge(2); // allow early flush opts.set_write_buffer_size(512 * 1024 * 1024); // 512 per memtable - // try to do more merges in memory - opts.set_max_successive_merges(100); + opts.set_max_successive_merges(0); // it will eat cpu, we are doing first merge in hashmap anyway. + + // - Write batch size: 500K entries + // - Entry size: ~244 bytes (32 SHA + 8 seq + 192 value + 12 overhead) + // - Memtable size: 512MB + + // 1. Entries per memtable = 512MB / 244B ≈ 2.2M entries + // 2. 
Target bucket load factor = 10-12 entries per bucket (RocksDB recommendation) + // 3. Bucket count = entries / target_load = 2.2M / 11 ≈ 200K + opts.set_memtable_factory(MemtableFactory::HashLinkList { + bucket_count: 200_000, + }); + + opts.set_memtable_prefix_bloom_ratio(0.1); // hash-based memtable makes the bloom filter less useful; NOTE(review): overridden by the 0.25 ratio set after the block-based table factory below + opts.set_bloom_locality(1); // Optimize bloom filter locality let mut block_factory = BlockBasedOptions::default(); @@ -252,7 +265,7 @@ impl ColumnFamilyOptions for Cells { // to match fs block size block_factory.set_block_size(4096); - block_factory.set_format_version(5); + block_factory.set_format_version(6); // we have 4096 / 256 = 16 keys per block, so binary search is enough block_factory.set_data_block_index_type(DataBlockIndexType::BinarySearch); @@ -261,16 +274,17 @@ impl ColumnFamilyOptions for Cells { block_factory.set_pin_l0_filter_and_index_blocks_in_cache(true); opts.set_block_based_table_factory(&block_factory); + opts.set_prefix_extractor(SliceTransform::create_noop()); - opts.set_prefix_extractor(SliceTransform::create_fixed_prefix(32)); - opts.set_memtable_prefix_bloom_ratio(0.1); + opts.set_memtable_whole_key_filtering(true); + opts.set_memtable_prefix_bloom_ratio(0.25); opts.set_compression_type(DBCompressionType::None); - opts.set_level_zero_file_num_compaction_trigger(8); // flush L0 as soon as possible + opts.set_compaction_pri(CompactionPri::OldestSmallestSeqFirst); + opts.set_level_zero_file_num_compaction_trigger(8); + opts.set_target_file_size_base(512 * 1024 * 1024); // smaller files for more efficient GC - opts.set_max_background_jobs(16); - opts.set_max_subcompactions(4); opts.set_max_bytes_for_level_base(4 * 1024 * 1024 * 1024); // 4GB per level opts.set_max_bytes_for_level_multiplier(8.0); @@ -305,8 +319,17 @@ impl ColumnFamilyOptions for Cells { opts.set_enable_write_thread_adaptive_yield(false); opts.set_allow_concurrent_memtable_write(false); opts.set_enable_pipelined_write(true); -
opts.set_inplace_update_support(true); + opts.set_inplace_update_support(false); opts.set_unordered_write(true); // we don't use snapshots + opts.set_avoid_unnecessary_blocking_io(true); // defer long-latency cleanup IO (e.g. deleting obsolete files) to background jobs + + opts.set_auto_tuned_ratelimiter( + 256 * 1024 * 1024, // 256MB/s upper bound (auto-tuner adjusts the rate below it) + 100_000, // refill period in microseconds = 100ms (standard value) + 10, // fairness (standard value) + ); + + opts.set_periodic_compaction_seconds(3600 * 24); // force compaction once a day } } diff --git a/storage/src/lib.rs b/storage/src/lib.rs index 01943f377..bc8c24ce8 100644 --- a/storage/src/lib.rs +++ b/storage/src/lib.rs @@ -61,9 +61,7 @@ impl StorageBuilder { let update_options = |opts: &mut rocksdb::Options, threads: usize, fdlimit: u64| { opts.set_paranoid_checks(false); - // bigger base level size - less compactions // parallel compactions finishes faster - less write stalls - opts.set_max_subcompactions(threads as u32 / 2); // io diff --git a/storage/src/store/shard_state/cell_storage.rs b/storage/src/store/shard_state/cell_storage.rs index cc42984d2..b90fe7587 100644 --- a/storage/src/store/shard_state/cell_storage.rs +++ b/storage/src/store/shard_state/cell_storage.rs @@ -491,6 +491,7 @@ impl CellStorage { drop(stack); // Write transaction to the `WriteBatch` + let _hist = HistogramGuard::begin("tycho_storage_batch_write_time"); let total = transaction.len(); for (key, CellState { removes, .. }) in transaction { self.raw_cells_cache.remove_refs(key, removes);