-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrainvar.py
252 lines (203 loc) · 11.6 KB
/
trainvar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import os
import random
import torch
from data import CachedVarDataset, TransfusionVarDataset, TransfusionVarDataset_collate_fn, create_text_image_pairs, load_pairs_from_disk, patchify, unpatchify, resume_checkpoint, save_checkpoint, save_pairs_to_disk
from inference import debug_image, visualize_latents
from multiscale import MultiscaleAutoregressiveModel
from ogen import OGen
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader
from diffusers import AutoencoderKL
from vae import vae_decode, vae_encode_batch
import argparse
from tqdm import tqdm
from accelerate import Accelerator
from PIL import Image
def train():
    """Train the multiscale autoregressive text-to-image model from the CLI.

    The function parses command-line options, loads the tokenizer, the VAE and
    the causal-LM backbone, builds (or reuses) an on-disk cache of VAE-encoded
    batches under ``dataset_cache/``, and then runs the optimisation loop.
    Every ``--debug_steps`` steps a latent visualisation is written to
    ``debug_results/``; every ``--save_steps`` steps a checkpoint is saved via
    ``save_checkpoint``.

    Raises:
        ValueError: if ``--optimizer`` is not one of ``adamw8bit``, ``adamw``
            or ``adamw_schedulefree``.
    """
    # Function-scope import, in line with the other lazy imports in this file.
    from collections import deque

    # ------------------------------------------------------------------ CLI
    parser = argparse.ArgumentParser(description='Train the Transfusion model')
    parser.add_argument('--resume', action='store_true', help='Resume training from checkpoint')
    parser.add_argument('--model_name', type=str, default='HuggingFaceTB/SmolLM-1.7B', help='Name of the model to use')
    parser.add_argument('--learning_rate', type=float, default=5e-5, help='Learning rate for training')
    parser.add_argument('--batch_size', type=int, default=4, help='Batch size for training')
    parser.add_argument('--image_size', type=int, default=256, help='Image size for training')
    parser.add_argument('--patch_size', type=int, default=2, help='Patch size for training')
    parser.add_argument('--gradient_checkpointing', action='store_true', help='Use gradient checkpointing')
    parser.add_argument('--diffusion_loss_weight', type=float, default=5, help='Weight for the diffusion loss')
    parser.add_argument('--max_length', type=int, default=128, help='Max length for the input text')
    parser.add_argument('--warmup_steps', type=int, default=0, help='Warmup steps for the scheduler')
    # NOTE: --hidden_dim, --depth and --heads are accepted but not read below;
    # they are kept so existing command lines continue to parse.
    parser.add_argument('--hidden_dim', type=int, default=1024, help='Transformer hidden dimension')
    parser.add_argument('--depth', type=int, default=20, help='Transformer depth')
    parser.add_argument('--heads', type=int, default=8, help='Number of attention heads')
    parser.add_argument('--debug_steps', type=int, default=50, help='Number of steps to debug')
    parser.add_argument('--inference_steps', type=int, default=100, help='Number of steps to inference')
    parser.add_argument('--save_steps', type=int, default=200, help='Number of steps to save')
    parser.add_argument('--cache', action='store_true', help='Recache the dataset')
    parser.add_argument('--cache_batch_size', type=int, default=1, help='Batch size for cache loading')
    parser.add_argument('--optimizer', type=str, default="adamw_schedulefree", help='Optimizer')
    # Previously a hard-coded constant; the default keeps the old behaviour.
    parser.add_argument('--num_epochs', type=int, default=10, help='Number of training epochs')
    args = parser.parse_args()

    model_name = args.model_name
    # bf16 mixed precision is only requested when CUDA is present.
    accelerator = Accelerator(mixed_precision='bf16' if torch.cuda.is_available() else None)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    vae_name = "madebyollin/sdxl-vae-fp16-fix"
    vae = AutoencoderKL.from_pretrained(vae_name).to(accelerator.device)
    # Freeze the VAE. Assigning `vae.requires_grad = False` merely creates an
    # attribute on the module; `requires_grad_` is what updates the parameters.
    vae.requires_grad_(False)

    max_length = args.max_length
    image_size = args.image_size
    batch_size = args.batch_size
    patch_size = args.patch_size
    learning_rate = args.learning_rate
    diffusion_loss_weight = args.diffusion_loss_weight
    warmup_steps = args.warmup_steps
    N_debug = args.debug_steps
    N_inference = args.inference_steps
    N_save = args.save_steps
    num_epochs = args.num_epochs
    N_loss_window = 100  # Number of steps for moving average
    cache_batch_size = args.cache_batch_size
    gradient_checkpointing = args.gradient_checkpointing

    print(f"Model name: {model_name}")
    print(f"Learning rate: {learning_rate}")
    print(f"Gradient checkpointing: {gradient_checkpointing}")
    print(f"Batch size: {batch_size}")
    print(f"Image size: {image_size}")
    print(f"Patch size: {patch_size}")
    print(f"Diffusion loss weight: {diffusion_loss_weight}")
    print(f"Debug: {N_debug} Inference: {N_inference} Save: {N_save}")
    print(f"Max length: {max_length}")
    print(f"VAE: {vae_name}")
    print(f"Cache batch size: {cache_batch_size}")

    # Side length of the square latent map at each scale (see the reshape to
    # (bsz, 4, scale, scale) in the training loop).
    scale_sizes = [1, 2, 3, 4, 5, 6, 8, 10, 13, 16]

    # ---------------------------------------------------------------- model
    if torch.cuda.is_available():
        # Imported lazily so the script still starts on machines without CUDA.
        from liger_kernel.transformers import AutoLigerKernelForCausalLM
        transformer = AutoLigerKernelForCausalLM.from_pretrained(model_name)
    else:
        transformer = AutoModelForCausalLM.from_pretrained(model_name)

    model = MultiscaleAutoregressiveModel(transformer_decoder=transformer, scale_sizes=scale_sizes)
    model.train()
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total model parameters: {total_params}")

    # ----------------------------------------------------------------- data
    # Build the text/image pair index once and reuse it on later runs.
    if not os.path.isfile("pairs.pkl"):
        pairs = create_text_image_pairs(["source"])
        save_pairs_to_disk(pairs, "pairs.pkl")
    text_image_pairs = load_pairs_from_disk('pairs.pkl')
    print(f"Loaded {len(text_image_pairs)} text-image pairs")

    dataset = TransfusionVarDataset(text_image_pairs, tokenizer, model, text_seq_len=max_length, scale_sizes=scale_sizes)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Cache the dataset: each DataLoader batch is VAE-encoded once and stored
    # as its own .pt file so training never has to touch pixel data again.
    cache_dir = 'dataset_cache'
    os.makedirs(cache_dir, exist_ok=True)
    # The directory always exists at this point, so only emptiness (or an
    # explicit --cache request) can trigger a rebuild.
    if args.cache or not os.listdir(cache_dir):
        temp_dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=8,
            multiprocessing_context='fork' if torch.backends.mps.is_available() else None,
        )
        for batch_idx, batch in enumerate(tqdm(temp_dataloader, desc="Caching dataset")):
            pixel_values = batch["pixel_values"]
            # One encode call per entry of pixel_values (presumably one entry
            # per scale — verify against TransfusionVarDataset).
            batch["image_latents"] = [
                vae_encode_batch(pixel_values[scale_idx], vae, vae_batch_size=batch_size, accelerator=accelerator)
                for scale_idx in range(len(pixel_values))
            ]
            del batch["pixel_values"]
            cache_file = os.path.join(cache_dir, f'batch_var_{batch_size}_{batch_idx}.pt')
            torch.save(batch, cache_file)
    else:
        print("Using existing cached dataset.")

    cached_dataset = CachedVarDataset(cache_dir, batch_size, accelerator)
    dataloader = DataLoader(
        cached_dataset,
        batch_size=cache_batch_size,
        shuffle=False,
        num_workers=cache_batch_size,
        multiprocessing_context='fork' if torch.backends.mps.is_available() else None,
    )

    # ------------------------------------------------------------ optimizer
    optim = args.optimizer
    if optim == "adamw8bit":
        from bitsandbytes.optim import AdamW8bit
        optimizer = AdamW8bit(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))
    elif optim == "adamw":
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999))
    elif optim == "adamw_schedulefree":
        from schedulefree import AdamWScheduleFree
        optimizer = AdamWScheduleFree(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), foreach=torch.cuda.is_available(), warmup_steps=warmup_steps)
        # Schedule-free optimizers must be switched to train mode explicitly.
        optimizer.train()
    else:
        raise ValueError(f"Optimizer {optim} not found")

    # Linear warmup/decay LR schedule spanning the whole run.
    from transformers import get_linear_schedule_with_warmup
    total_steps = len(dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    # Epoch and global-step counters; overwritten when resuming.
    start_epoch = 0
    step_counter = 0

    print(f"Optimizer: {optimizer.__class__.__name__}")

    # Load checkpoint if resume flag is set
    if args.resume:
        model, optimizer, scheduler, start_epoch, step_counter = resume_checkpoint(model, optimizer, scheduler)

    # Prepare everything with accelerator
    model, vae, optimizer, dataloader, scheduler = accelerator.prepare(model, vae, optimizer, dataloader, scheduler)
    model = model.to(accelerator.device)
    print("Model device: ", accelerator.device)

    # NOTE: a fixed sample (first cache file, decoded with the VAE and saved
    # under inference_results/) used to be prepared here for periodic
    # inference; that path is currently disabled.

    # Fix the RNG seed so runs are repeatable.
    torch.manual_seed(42)

    # Bounded windows for the moving-average losses shown in the progress
    # bar; deque(maxlen=...) drops the oldest entry automatically.
    text_loss_window = deque(maxlen=N_loss_window)
    diffusion_loss_window = deque(maxlen=N_loss_window)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    if gradient_checkpointing:
        model.transformer_decoder.gradient_checkpointing_enable()

    # Make sure the debug output directory exists before anything is written.
    os.makedirs('debug_results', exist_ok=True)

    # ---------------------------------------------------------- train loop
    for epoch in range(start_epoch, num_epochs):
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        for batch in progress_bar:
            step_counter += 1

            # Each cached item is itself a saved batch, so the DataLoader adds
            # a leading cache_batch_size dimension that is folded away here
            # (assumes input_ids is (cache_batch, batch, seq) — TODO confirm).
            bsz = batch['input_ids'].shape[1] * cache_batch_size
            # Drops the first token position and keeps up to max_length tokens.
            text = batch['input_ids'].reshape(bsz, -1)[:, 1:max_length + 1].clone().to(accelerator.device)
            latents = [
                latent.reshape(bsz, 4, scale, scale).clone().to(accelerator.device)
                for latent, scale in zip(batch['image_latents'], scale_sizes)
            ]

            # NOTE(review): the original indentation was lost; the loss is
            # assumed to be computed inside the autocast region — confirm.
            with accelerator.autocast():
                generated_latents, text_predictions = model.forward(text, latents)
                total_loss, text_loss, image_loss = model.compute_loss(generated_latents, latents, text_predictions, text, diffusion_loss_weight)

            accelerator.backward(total_loss)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Update loss windows and the moving averages derived from them.
            text_loss_window.append(text_loss.item())
            diffusion_loss_window.append(image_loss.item())
            avg_text_loss = sum(text_loss_window) / len(text_loss_window)
            avg_diffusion_loss = sum(diffusion_loss_window) / len(diffusion_loss_window)

            progress_bar.set_postfix({
                'dloss': f"{avg_diffusion_loss:.4f}",
                'tloss': f"{avg_text_loss:.4f}",
                'lr': f"{scheduler.get_last_lr()[0]:.6f}"
            })

            # Periodically dump generated vs. target latents for inspection.
            if step_counter % N_debug == 0:
                accelerator.wait_for_everyone()
                visualize_latents(vae, generated_latents, latents, f'debug_results/debug_e{epoch+1}_s{step_counter}.png')

            # NOTE: periodic inference every N_inference steps is disabled.

            # Save the model every N steps
            if step_counter % N_save == 0:
                accelerator.wait_for_everyone()
                unwrapped_model = accelerator.unwrap_model(model)
                save_checkpoint(unwrapped_model, optimizer, scheduler, epoch, step_counter)
# Script entry point: start training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    train()