Skip to content

Commit

Permalink
chore(storage): rocksdb update
Browse the repository at this point in the history
  • Loading branch information
0xdeafbeef authored and Rexagon committed Dec 6, 2024
1 parent 60d992b commit 64b497f
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 33 deletions.
27 changes: 12 additions & 15 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ tarpc = { version = "0.34", features = [
] }
tempfile = "3.10"
thiserror = "1.0"
tikv-jemallocator = { version = "0.5", features = [
tikv-jemallocator = { version = "0.6.0", features = [
"unprefixed_malloc_on_supported_platforms",
"background_threads",
] }
tikv-jemalloc-ctl = { version = "0.5" }
tikv-jemalloc-ctl = { version = "0.6.0", features = ["stats"] }
tl-proto = "0.4"
tokio = { version = "1", default-features = false }
tokio-stream = "0.1.15"
Expand Down Expand Up @@ -134,6 +134,9 @@ tycho-rpc = { path = "./rpc", version = "0.1.4" }
tycho-storage = { path = "./storage", version = "0.1.4" }
tycho-util = { path = "./util", version = "0.1.4" }

[patch.crates-io]
weedb = { version = "0.3.8", git = "https://github.com/broxus/weedb.git", branch = "next-rocksdb" }

[workspace.lints.rust]
future_incompatible = "warn"
nonstandard_style = "warn"
Expand Down
3 changes: 3 additions & 0 deletions scripts/gen-dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,9 @@ def storage() -> RowPanel:
create_heatmap_panel(
"tycho_storage_cell_in_mem_store_time", "Time to store cell without write"
),
create_heatmap_panel(
"tycho_storage_batch_write_time", "Time to write merge in write batch"
),
create_heatmap_panel(
"tycho_storage_state_store_time", "Time to store state with cell traversal"
),
Expand Down
26 changes: 24 additions & 2 deletions storage/src/db/kv_db/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,15 +103,18 @@ impl BaseDbExt for BaseDb {

impl WithMigrations for BaseDb {
const NAME: &'static str = "base";
const VERSION: Semver = [0, 0, 2];
const VERSION: Semver = [0, 0, 3];

fn register_migrations(
migrations: &mut Migrations<Self>,
cancelled: CancellationFlag,
) -> Result<(), MigrationError> {
migrations.register([0, 0, 1], [0, 0, 2], move |db| {
base_migrations::v0_0_1_to_0_0_2(db, cancelled.clone())
})
})?;
migrations.register([0, 0, 2], [0, 0, 3], base_migrations::v_0_0_2_to_v_0_0_3)?;

Ok(())
}
}

Expand Down Expand Up @@ -139,6 +142,7 @@ mod base_migrations {

use everscale_types::boc::Boc;
use tycho_block_util::archive::ArchiveEntryType;
use weedb::rocksdb::CompactOptions;

use super::*;
use crate::util::StoredValue;
Expand Down Expand Up @@ -192,6 +196,24 @@ mod base_migrations {
);
Ok(())
}

/// Migration `0.0.2` -> `0.0.3`: manually compacts the whole `cells`
/// column family once and logs how long the compaction took.
pub fn v_0_0_2_to_v_0_0_3(db: &BaseDb) -> Result<(), MigrationError> {
    // Exclusive mode: this manual compaction won't run concurrently
    // with other manual compactions.
    let mut compact_opts = CompactOptions::default();
    compact_opts.set_exclusive_manual_compaction(true);

    // `None` for both bounds means "the entire key range".
    let unbounded = Option::<&[u8]>::None;

    let compaction_started = Instant::now();
    tracing::info!("started cells compaction");
    let cells_db = db.cells.db();
    cells_db.compact_range_cf_opt(&db.cells.cf(), unbounded, unbounded, &compact_opts);
    tracing::info!(
        elapsed = %humantime::format_duration(compaction_started.elapsed()),
        "finished cells compaction"
    );

    Ok(())
}
}

// === RPC DB ===
Expand Down
47 changes: 35 additions & 12 deletions storage/src/db/kv_db/tables.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use bytesize::ByteSize;
use weedb::rocksdb::{
BlockBasedIndexType, BlockBasedOptions, DBCompressionType, DataBlockIndexType, MergeOperands,
Options, ReadOptions, SliceTransform,
BlockBasedIndexType, BlockBasedOptions, CompactionPri, DBCompressionType, DataBlockIndexType,
MemtableFactory, MergeOperands, Options, ReadOptions, SliceTransform,
};
use weedb::{rocksdb, Caches, ColumnFamily, ColumnFamilyOptions};

Expand Down Expand Up @@ -232,12 +232,25 @@ impl ColumnFamilyOptions<Caches> for Cells {
opts.set_compaction_filter("cell_compaction", refcount::compaction_filter);

// optimize for bulk inserts and single writer
opts.set_max_write_buffer_number(6); // 6 * 512MB = 3GB for write buffers
opts.set_max_write_buffer_number(8); // 8 * 512MB = 4GB
opts.set_min_write_buffer_number_to_merge(2); // allow early flush
opts.set_write_buffer_size(512 * 1024 * 1024); // 512 per memtable

// try to do more merges in memory
opts.set_max_successive_merges(100);
opts.set_max_successive_merges(0); // it will eat cpu, we are doing first merge in hashmap anyway.

// - Write batch size: 500K entries
// - Entry size: ~244 bytes (32 SHA + 8 seq + 192 value + 12 overhead)
// - Memtable size: 512MB

// 1. Entries per memtable = 512MB / 244B ≈ 2.2M entries
// 2. Target bucket load factor = 10-12 entries per bucket (RocksDB recommendation)
// 3. Bucket count = entries / target_load = 2.2M / 11 ≈ 200K
opts.set_memtable_factory(MemtableFactory::HashLinkList {
bucket_count: 200_000,
});

opts.set_memtable_prefix_bloom_ratio(0.1); // we use hash-based memtable so bloom filter is not that useful
opts.set_bloom_locality(1); // Optimize bloom filter locality

let mut block_factory = BlockBasedOptions::default();

Expand All @@ -252,7 +265,7 @@ impl ColumnFamilyOptions<Caches> for Cells {

// to match fs block size
block_factory.set_block_size(4096);
block_factory.set_format_version(5);
block_factory.set_format_version(6);

// we have 4096 / 256 = 16 keys per block, so binary search is enough
block_factory.set_data_block_index_type(DataBlockIndexType::BinarySearch);
Expand All @@ -261,16 +274,17 @@ impl ColumnFamilyOptions<Caches> for Cells {
block_factory.set_pin_l0_filter_and_index_blocks_in_cache(true);

opts.set_block_based_table_factory(&block_factory);
opts.set_prefix_extractor(SliceTransform::create_noop());

opts.set_prefix_extractor(SliceTransform::create_fixed_prefix(32));
opts.set_memtable_prefix_bloom_ratio(0.1);
opts.set_memtable_whole_key_filtering(true);
opts.set_memtable_prefix_bloom_ratio(0.25);

opts.set_compression_type(DBCompressionType::None);

opts.set_level_zero_file_num_compaction_trigger(8); // flush L0 as soon as possible
opts.set_compaction_pri(CompactionPri::OldestSmallestSeqFirst);
opts.set_level_zero_file_num_compaction_trigger(8);

opts.set_target_file_size_base(512 * 1024 * 1024); // smaller files for more efficient GC
opts.set_max_background_jobs(16);
opts.set_max_subcompactions(4);

opts.set_max_bytes_for_level_base(4 * 1024 * 1024 * 1024); // 4GB per level
opts.set_max_bytes_for_level_multiplier(8.0);
Expand Down Expand Up @@ -305,8 +319,17 @@ impl ColumnFamilyOptions<Caches> for Cells {
opts.set_enable_write_thread_adaptive_yield(false);
opts.set_allow_concurrent_memtable_write(false);
opts.set_enable_pipelined_write(true);
opts.set_inplace_update_support(true);
opts.set_inplace_update_support(false);
opts.set_unordered_write(true); // we don't use snapshots
opts.set_avoid_unnecessary_blocking_io(true); // schedule unnecessary IO in background;

opts.set_auto_tuned_ratelimiter(
256 * 1024 * 1024, // 256MB/s base rate
100_000, // 100ms refill (standard value)
10, // fairness (standard value)
);

opts.set_periodic_compaction_seconds(3600 * 24); // force compaction once a day
}
}

Expand Down
2 changes: 0 additions & 2 deletions storage/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ impl StorageBuilder {
let update_options = |opts: &mut rocksdb::Options, threads: usize, fdlimit: u64| {
opts.set_paranoid_checks(false);

// bigger base level size - less compactions
// parallel compactions finishes faster - less write stalls

opts.set_max_subcompactions(threads as u32 / 2);

// io
Expand Down
1 change: 1 addition & 0 deletions storage/src/store/shard_state/cell_storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,7 @@ impl CellStorage {
drop(stack);

// Write transaction to the `WriteBatch`
let _hist = HistogramGuard::begin("tycho_storage_batch_write_time");
let total = transaction.len();
for (key, CellState { removes, .. }) in transaction {
self.raw_cells_cache.remove_refs(key, removes);
Expand Down

0 comments on commit 64b497f

Please sign in to comment.