Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions bergson/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import math
import os
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, Sequence
Expand Down Expand Up @@ -114,7 +115,7 @@ def ceildiv(a: int, b: int) -> int:
return -(-a // b) # Equivalent to math.ceil(a / b) but faster for integers


def allocate_batches(doc_lengths: list[int], N: int) -> list[list[int]]:
def allocate_batches(doc_lengths: list[int], N: int, seed: int = 42) -> list[list[int]]:
Copy link
Collaborator Author

@luciaquirke luciaquirke Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we leave setting the seed to the end user? Can't think of any reason why someone would want non-deterministic shuffling here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's just always seed it.

"""
Allocate documents into batches that are then distributed evenly across
a fixed number of workers.
Expand Down Expand Up @@ -230,8 +231,13 @@ def allocate_batches(doc_lengths: list[int], N: int) -> list[list[int]]:
for b_idx, batch in enumerate(batches):
allocation[b_idx % world_size].append(batch)

# sanity: equal # of batches per worker
# Sanity: equal # of batches per worker
assert len({len(b) for b in allocation}) == 1

# Break any systematic ordering of batches
random.seed(seed)
random.shuffle(allocation[rank])

return allocation[rank]


Expand Down