Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Lots of 7b-1b anneals on jupiter #33

Merged
merged 8 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions src/regmixer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,20 +77,26 @@ def launch(config: Path, mixture_file: Optional[Path], dry_run: bool, no_cache:
logger.info("Launch cancelled!")
return

launch_group = None

if mixture_file:
with open(mixture_file, "r") as f:
predefined_mixes = json.load(f)

launch_group = LaunchGroup(
instances=mk_launch_configs(
group=mk_experiment_group(
config=experiment_config,
mixes=predefined_mixes["mixes"],
group_uuid=group_uuid,
),
beaker_user=beaker_user,
)
)
logger.info(predefined_mixes)
if click.confirm("Launch experiment with this set of mixtures?", default=False):
with yaspin(text="Building experiment group...", color="yellow") as spinner:
launch_group = LaunchGroup(
instances=mk_launch_configs(
group=mk_experiment_group(
config=experiment_config,
mixes=predefined_mixes["mixes"],
group_uuid=group_uuid,
),
beaker_user=beaker_user,
)
)
spinner.ok("✔")
else:
mixes = mk_mixes(config, use_cache=(no_cache == False))

Expand All @@ -109,6 +115,10 @@ def launch(config: Path, mixture_file: Optional[Path], dry_run: bool, no_cache:
logger.info("Launch cancelled!")
return

if not launch_group:
logger.info("Launch cancelled!")
return

with yaspin(text="Launching experiment group...", color="yellow") as spinner:
try:
if dry_run:
Expand Down
130 changes: 130 additions & 0 deletions src/regmixer/config/olmo2-7b-anneal-1b-binary-search.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
name: "olmo2-7b-anneal-1b-binary-search"
description: "Simulated anneal of olmo2-7b to 1B tokens"
budget: "ai2/oe-data"
workspace: "ai2/dolma2"
nodes: 1
gpus: 8
variants: 4
preemptible: true
max_tokens: 10_000_000_000
allow_repetition: false
sequence_length: 2048
seed: 42
minimum_weight: 0.05
train_type: "anneal"
weka: true
checkpoint_path: "/weka/oe-training-default/ai2-llm/checkpoints/ai2-tylerm/peteish7/olmo-core-7b-1124/step928646"
proxy_model_id: "olmo_7b"
tokenizer: "dolma2"
dtype: "uint32"
priority: high
cluster: ai2/jupiter-cirrascale-2
sources:
- name: dolmino-math
paths:
- s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-2-00000.npy
- s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-0-00000.npy
- s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-3-00000.npy
- s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-4-00000.npy
- s3://ai2-llm/preprocessed/tulu-3-sft-personas-math-grade/dolma2-tokenizer/part-1-00000.npy
- s3://ai2-llm/preprocessed/mathcoder2-synthmath/mathcoder2-synthmath/filtered-math/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/mathcoder2-synthmath/ajibawa-2023/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/basic_math_mj/multiadd/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/basic_math_mj/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/owm-filtered-math/metamath/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/gsm8k/v0_main_train/allenai/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/gsm8k-synth/resample_v1_6x/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/gsm_MIND/clean_stop/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy
- s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy
- s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy

- name: dclm-baseline-ft7pct-fw2
paths:
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-26-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0001/part-27-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-09-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-19-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-2-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-22-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-51-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0002/part-61-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-28-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0003/part-48-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-09-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-39-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-54-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0004/part-60-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0005/part-13-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-11-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-27-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0006/part-34-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-07-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-18-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0007/part-48-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-09-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-10-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-27-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0008/part-51-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-17-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-53-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0009/part-55-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-04-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0010/part-41-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-15-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-32-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-40-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-46-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-48-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0013/part-55-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-16-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-44-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0014/part-59-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-19-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0015/part-63-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0016/part-51-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-07-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0017/part-61-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-05-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-36-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0018/part-56-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0020/part-10-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0022/part-06-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-14-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0024/part-22-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-32-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0026/part-57-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-08-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-24-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0027/part-44-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-04-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-14-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-28-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-35-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0028/part-42-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0029/part-45-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-19-00000.npy
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0030/part-30-00000.npy
40 changes: 16 additions & 24 deletions src/regmixer/config/olmo2-7b-anneal-1b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,22 @@ budget: "ai2/oe-data"
workspace: "ai2/dolma2"
nodes: 1
gpus: 8
variants: 1
variants: 32
preemptible: true
max_tokens: 1_000_000_000
allow_repetition: false
sequence_length: 2048
seed: 1337
minimum_weight: 8e-3
seed: 42
minimum_weight: 0.05
train_type: "anneal"
weka: true
# checkpoint_path: "/weka/oe-training-default/ai2-llm/checkpoints/ai2-tylerm/peteish7/step928646-olmo-core-1124"
checkpoint_path: "/weka/oe-training-default/ai2-llm/checkpoints/ai2-tylerm/peteish7/olmo-core-7b-1124/step928646"
proxy_model_id: "olmo_7b"
tokenizer: "dolma2"
dtype: "uint32"
priority: high
cluster: ai2/saturn-cirrascale
cluster: ai2/jupiter-cirrascale-2
sources:
# X Tokens
- name: dolmino-math
paths:
- s3://ai2-llm/preprocessed/tulu_v3.9_personahub_math_interm_algebra_20k/dolma2-tokenizer/*.npy
Expand All @@ -37,7 +35,6 @@ sources:
- s3://ai2-llm/preprocessed/owm-filtered-math/metamath/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/personahub_math_v5_regen_149960/dolma2-tokenizer/*.npy

# X Tokens
- name: gsm-math
paths:
- s3://ai2-llm/preprocessed/gsm8k/v0_socratic_train/allenai/dolma2-tokenizer/*.npy
Expand All @@ -47,43 +44,38 @@ sources:
- s3://ai2-llm/preprocessed/tinyGSM/mind-2students/dolma2-tokenizer/*.npy
- s3://ai2-llm/preprocessed/tinyGSM/mind/dolma2-tokenizer/*.npy

# X Tokens
- name: tulu-flan
paths:
- s3://ai2-llm/preprocessed/tulu_flan/v1-FULLDECON-HARD-TRAIN-60M-shots_all-upweight_1-dialog_false-sep_rulebased/allenai/dolma2-tokenizer/*.npy

# X Tokens
- name: pes2o
paths:
- s3://ai2-llm/preprocessed/pes2o/allenai/dolma2-tokenizer/part-10-00000.npy

# X Tokens
- name: wiki
paths:
- s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-1-00000.npy
- s3://ai2-llm/preprocessed/olmo-mix/danyh-compiled-v1_7/documents/wiki/allenai/dolma2-tokenizer/part-0-00000.npy

# X Tokens
- name: stack-exchange
paths:
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-00-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-02-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-03-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-04-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-01-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-05-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-06-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-07-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-08-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-09-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-10-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-11-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-12-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-13-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-14-00000.npy
- s3://ai2-llm/preprocessed/stackexchange/v1_dedupe/allenai/dolma2-tokenizer/part-15-00000.npy

# X Tokens
- name: dclm-baseline-ft7pct-fw2
paths:
- s3://ai2-llm/preprocessed/dclm/v0_rep32_ft7percentile_fw2/documents/allenai/dolma2-tokenizer/0000/part-17-00000.npy
Expand Down
44 changes: 44 additions & 0 deletions src/regmixer/config/premix/dolmino-dclm-hq-v1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"mixes": [
{
"dclm-baseline-ft7pct-fw2": [
1.0,
1.0
],
"dolmino-math": [
0.0,
1.0
]
},
{
"dclm-baseline-ft7pct-fw2": [
0.0,
1.0
],
"dolmino-math": [
1.0,
1.0
]
},
{
"dclm-baseline-ft7pct-fw2": [
0.25,
1.0
],
"dolmino-math": [
0.75,
1.0
]
},
{
"dclm-baseline-ft7pct-fw2": [
0.75,
1.0
],
"dolmino-math": [
0.25,
1.0
]
}
]
}
4 changes: 3 additions & 1 deletion src/regmixer/model/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,9 @@ def __init__(

self.checkpoint_save_dir = f"{self.root_dir if weka else self.data_dir}/checkpoints/{self.beaker_user.lower().strip()}/{self.run_name}"

self.dataset_cache = f"{self.root_dir}/{self.beaker_user.lower()}/dataset-cache"
self.dataset_cache = (
f"{self.root_dir}/{self.beaker_user.lower()}/{self.run_name}/dataset-cache"
)

def get_tokenizer_config(self, tokenizer) -> TokenizerConfig:
try:
Expand Down