
Commit e2682e1

felipemello1 (Felipe Mello) authored and ebsmothers committed

update configs (#1954)

Co-authored-by: Felipe Mello <felipemello@fb.com>
Co-authored-by: ebsmothers <ebs@meta.com>
1 parent befdcc7 · commit e2682e1

File tree: 116 files changed, +2208 / -732 lines

Some content is hidden: large commits hide part of the diff by default, so only a subset of the 116 changed files is shown below.

recipes/configs/code_llama2/7B_full_low_memory.yaml

Lines changed: 30 additions & 5 deletions

@@ -45,8 +45,8 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
+  packed: False # True increases speed
 
 seed: null
 shuffle: True
@@ -55,20 +55,20 @@ shuffle: True
 epochs: 1
 max_steps_per_epoch: null
 batch_size: 2
-gradient_accumulation_steps: 1
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
 optimizer:
   _component_: bitsandbytes.optim.PagedAdamW
   lr: 2e-5
-optimizer_in_bwd: True
+optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False
+compile: False # pytorch compile, set to true for better perf/memory
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: True # True reduces memory
 dtype: bf16
 
@@ -79,3 +79,28 @@ metric_logger:
   log_dir: /tmp/CodeLlama-7b-hf/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
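
Note on the new profiler block: the YAML keys map almost one-to-one onto the standard torch.profiler API. Below is a minimal sketch of roughly what torchtune.training.setup_torch_profiler wires up when enabled: True is set; the wrapper's internals are an assumption here, only the torch.profiler calls themselves are standard PyTorch.

import torch
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

model = torch.nn.Linear(16, 16)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)

activities = [ProfilerActivity.CPU]                      # cpu: True
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)             # cuda: True

prof = profile(
    activities=activities,
    # wait_steps/warmup_steps/active_steps/num_cycles map to wait/warmup/active/repeat
    schedule=schedule(wait=5, warmup=3, active=2, repeat=1),
    on_trace_ready=tensorboard_trace_handler("./profiling_outputs"),  # output_dir
    profile_memory=False,
    with_stack=False,
    record_shapes=True,
    with_flops=False,
)

with prof:
    for _ in range(12):                 # enough iterations to cover wait + warmup + active
        loss = model(torch.randn(4, 16)).sum()
        loss.backward()
        opt.step()
        opt.zero_grad()
        prof.step()                     # advance the profiler schedule every training step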

recipes/configs/code_llama2/7B_lora_single_device.yaml

Lines changed: 8 additions & 8 deletions

@@ -18,11 +18,11 @@
 # Model Arguments
 model:
   _component_: torchtune.models.code_llama2.lora_code_llama2_7b
-  lora_attn_modules: ['q_proj', 'v_proj']
-  apply_lora_to_mlp: False
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
   apply_lora_to_output: False
-  lora_rank: 8
-  lora_alpha: 16
+  lora_rank: 8 # higher increases accuracy and memory
+  lora_alpha: 16 # usually alpha=2*rank
   lora_dropout: 0.0
 
 # Tokenizer
@@ -49,8 +49,8 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
 
 seed: null
 shuffle: True
@@ -59,7 +59,7 @@ shuffle: True
 epochs: 1
 max_steps_per_epoch: null
 batch_size: 2
-gradient_accumulation_steps: 16
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
@@ -70,13 +70,13 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False
+compile: False # pytorch compile, set to true for better perf/memory
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 dtype: bf16
 
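
Note on `gradient_accumulation_steps: 8 # Use to increase virtual batch size`: with batch_size: 2 this gives an effective batch of 16 per optimizer update. A generic sketch of the accumulation pattern the comment refers to (illustrative only, not the torchtune recipe code):

import torch
from torch import nn

model = nn.Linear(32, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

batch_size = 2
gradient_accumulation_steps = 8   # virtual batch size = 2 * 8 = 16

optimizer.zero_grad()
for step in range(64):
    x = torch.randn(batch_size, 32)
    y = torch.randint(0, 4, (batch_size,))
    loss = loss_fn(model(x), y)
    # Scale so the accumulated gradients match the average over the virtual batch.
    (loss / gradient_accumulation_steps).backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()          # one optimizer update per virtual batch
        optimizer.zero_grad()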

recipes/configs/code_llama2/7B_qlora_single_device.yaml

Lines changed: 7 additions & 7 deletions

@@ -18,11 +18,11 @@
 # Model Arguments
 model:
   _component_: torchtune.models.code_llama2.qlora_code_llama2_7b
-  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
   apply_lora_to_mlp: True
   apply_lora_to_output: False
-  lora_rank: 8
-  lora_alpha: 16
+  lora_rank: 8 # higher increases accuracy and memory
+  lora_alpha: 16 # usually alpha=2*rank
   lora_dropout: 0.0
 
 # Tokenizer
@@ -49,16 +49,16 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
+  packed: False # True increases speed
 seed: null
 shuffle: True
 
 # Fine-tuning arguments
 epochs: 1
 max_steps_per_epoch: null
 batch_size: 2
-gradient_accumulation_steps: 16
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
@@ -69,13 +69,13 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-compile: False
+compile: False # pytorch compile, set to true for better perf/memory
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 dtype: bf16
 
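
Note on `lora_alpha: 16 # usually alpha=2*rank`: LoRA scales its low-rank update by alpha / rank, so keeping alpha at twice the rank holds the adapter's contribution at a fixed 2x multiplier as the rank changes. A bare-bones illustration of that scaling (this toy module is not torchtune's LoRALinear):

import torch
from torch import nn

class TinyLoRALinear(nn.Module):
    """Frozen base weight plus a low-rank update scaled by alpha / rank."""

    def __init__(self, in_dim: int, out_dim: int, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = nn.Linear(in_dim, out_dim, bias=False)
        self.base.weight.requires_grad_(False)      # base weights stay frozen
        self.lora_a = nn.Linear(in_dim, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_dim, bias=False)
        nn.init.zeros_(self.lora_b.weight)          # adapter starts as a no-op
        self.scaling = alpha / rank                 # alpha = 2 * rank -> scaling of 2.0

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))

layer = TinyLoRALinear(64, 64, rank=8, alpha=16.0)
print(layer(torch.randn(2, 64)).shape)  # torch.Size([2, 64])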

recipes/configs/dev/8B_full_experimental.yaml

Lines changed: 29 additions & 4 deletions

@@ -26,8 +26,8 @@ tokenizer:
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
+  packed: False # True increases speed
 seed: null
 shuffle: True
 
@@ -57,14 +57,14 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-compile: False
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: False
+enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory
 ac_mode: 'selective' # ['selective', 'full']
 ac_option: 2 # [int] = ac every positive int layer
@@ -81,3 +81,28 @@ metric_logger:
 output_dir: /tmp/alpaca-llama3-finetune
 log_every_n_steps: null
 log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
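
Note on `enable_activation_checkpointing` and `ac_mode: 'selective'` / `ac_option: 2`: checkpointing drops intermediate activations and recomputes them during the backward pass, and the selective mode applies this only to every Nth layer. A generic sketch with torch.utils.checkpoint (the wrapping policy below is illustrative, not torchtune's own wrapper):

import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

class Block(nn.Module):
    def __init__(self, dim: int = 64):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x):
        return x + self.ff(x)

class Net(nn.Module):
    def __init__(self, n_layers: int = 8, ac_every: int = 2):
        super().__init__()
        self.layers = nn.ModuleList(Block() for _ in range(n_layers))
        self.ac_every = ac_every  # checkpoint every Nth layer ("selective" flavor)

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            if self.training and i % self.ac_every == 0:
                # This layer's activations are recomputed in backward instead of being stored.
                x = checkpoint(layer, x, use_reentrant=False)
            else:
                x = layer(x)
        return x

net = Net()
loss = net(torch.randn(4, 64)).sum()
loss.backward()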

recipes/configs/gemma/2B_full.yaml

Lines changed: 30 additions & 4 deletions

@@ -23,8 +23,8 @@ tokenizer:
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
+  packed: False # True increases speed
 seed: null
 shuffle: True
 
@@ -54,14 +54,15 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-compile: False
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
+optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 
 # Reduced precision
@@ -74,3 +75,28 @@ metric_logger:
 output_dir: /tmp/alpaca-gemma-finetune
 log_every_n_steps: 1
 log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
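
Note on the new `optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1` line: fusing the optimizer step into the backward pass lets each gradient be applied and freed as soon as it is ready, which is also why it cannot be combined with gradient accumulation. A sketch of the general pattern using per-parameter optimizers and post-accumulate-grad hooks, in the spirit of the PyTorch memory-saving tutorial rather than torchtune's internal implementation:

import torch
from torch import nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 4))

# One tiny optimizer per parameter so each can step as soon as its grad is ready.
optim_per_param = {p: torch.optim.AdamW([p], lr=2e-5) for p in model.parameters()}

def step_in_backward(param: torch.Tensor) -> None:
    optim_per_param[param].step()
    optim_per_param[param].zero_grad()  # gradient memory is released right away

for p in model.parameters():
    p.register_post_accumulate_grad_hook(step_in_backward)

# Training step: backward() now also applies the updates; no separate optimizer.step().
loss = nn.functional.cross_entropy(model(torch.randn(8, 64)), torch.randint(0, 4, (8,)))
loss.backward()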

recipes/configs/gemma/2B_lora.yaml

Lines changed: 32 additions & 7 deletions

@@ -22,18 +22,18 @@ tokenizer:
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
+  packed: False # True increases speed
 seed: null
 shuffle: True
 
 # Model Arguments
 model:
   _component_: torchtune.models.gemma.lora_gemma_2b
-  lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
   apply_lora_to_mlp: True
-  lora_rank: 64
-  lora_alpha: 128
+  lora_rank: 64 # higher increases accuracy and memory
+  lora_alpha: 128 # usually alpha=2*rank
   lora_dropout: 0.0
 
 checkpointer:
@@ -66,14 +66,14 @@ loss:
 batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-compile: False
+gradient_accumulation_steps: 1 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 
 # Reduced precision
@@ -86,3 +86,28 @@ metric_logger:
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
 log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
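
Note on `compile: False # pytorch compile, set to true for better perf/memory`: the flag gates torch.compile. Conceptually, enabling it amounts to something like the sketch below; torchtune's recipes decide exactly what gets compiled (e.g. the model and the chunked loss), so this wrapping is only illustrative.

import torch
from torch import nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 4))

compile_enabled = True  # corresponds to `compile: True` in the config
if compile_enabled:
    # First call triggers graph capture/compilation; later calls reuse the compiled graph.
    model = torch.compile(model)

out = model(torch.randn(8, 64))
print(out.shape)  # torch.Size([8, 4])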

recipes/configs/gemma/2B_lora_single_device.yaml

Lines changed: 7 additions & 7 deletions

@@ -22,18 +22,18 @@ tokenizer:
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
+  packed: False # True increases speed
 seed: null
 shuffle: True
 
 # Model Arguments
 model:
   _component_: torchtune.models.gemma.lora_gemma_2b
-  lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
   apply_lora_to_mlp: True
-  lora_rank: 64
-  lora_alpha: 128
+  lora_rank: 64 # higher increases accuracy and memory
+  lora_alpha: 128 # usually alpha=2*rank
   lora_dropout: 0.0
 
 checkpointer:
@@ -65,14 +65,14 @@ loss:
 batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
-gradient_accumulation_steps: 4
-compile: False
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
+compile: False # pytorch compile, set to true for better perf/memory
 
 # Training env
 device: cuda
 
 # Memory management
-enable_activation_checkpointing: True
+enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 
 # Reduced precision

0 commit comments
