Skip to content

Commit

Permalink
begin combine-small-shards experiment
Browse files Browse the repository at this point in the history
  • Loading branch information
dholth committed Jun 3, 2024
1 parent 9584bb5 commit 3cac14a
Showing 1 changed file with 21 additions and 1 deletion.
22 changes: 21 additions & 1 deletion conda_index/index/shards.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,27 @@ def index_subdir(self, subdir, verbose=False, progress=False):

(self.output_root / subdir).mkdir(parents=True, exist_ok=True)

# yield shards and combine tiny ones?

SMALL_SHARD = 1024 # if a shard is this small, it is a candidate for merge
MERGE_SHARD = 4096 # if the merged shards are bigger than this then spit them out
def merged_shards():
    """
    If a shard would be tiny, combine it with a few neighboring shards.

    Yields dicts mapping package name -> shard. Shards already larger
    than SMALL_SHARD are yielded alone; smaller shards are batched in
    order until the batch's packed size exceeds MERGE_SHARD bytes.
    A final partial batch is yielded at the end so no shard is lost.
    """
    collected = {}  # pending small shards awaiting merge
    collected_size = 0  # running packed-size total of `collected`
    for name, shard in cache.index_shards():
        shard_size = len(packb_typed(shard))
        if shard_size > SMALL_SHARD:
            # Flush pending small shards first so output order matches input.
            if collected:
                yield collected
                collected = {}
                collected_size = 0
            # Large shards stand alone; do NOT also add them to `collected`.
            yield {name: shard}
        else:
            collected[name] = shard
            collected_size += shard_size
            if collected_size > MERGE_SHARD:
                yield collected
                collected = {}
                collected_size = 0
    # Don't drop the trailing partial batch.
    if collected:
        yield collected


for name, shard in cache.index_shards():
shard_data = bytes(packb_typed(shard))
shard_data = packb_typed(shard)
reference_hash = hashlib.sha256(shard_data).hexdigest()
output_path = self.output_root / subdir / f"{reference_hash}.msgpack.zst"
if not output_path.exists():
Expand All @@ -216,6 +235,7 @@ def _patch_repodata_shards(
else:

def per_shard_instructions():
# more difficult if some shards are duplicated...
for pkg, reference in repodata_shards["shards"].items():
# XXX keep it all in RAM? only patch changed shards or, if patches change, all shards?
shard_path = (
Expand Down

0 comments on commit 3cac14a

Please sign in to comment.