# Prediction interface for Cog ⚙️
# https://cog.run/python
# Forked from lucataco/cog-hunyuanvideo-lora-trainer
from cog import BasePredictor, Input, Path, Secret
import os
import time
import shutil
import subprocess
from zipfile import ZipFile, is_zipfile

GPU_IDS = "0"
DATA_ROOT = "dataset"
OUTPUT_DIR = "output"
MODEL_CACHE = "HunyuanVideo"
MODEL_URL = "https://weights.replicate.delivery/default/hunyuanvideo-community/HunyuanVideo/model.tar"


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    # pget is Replicate's parallel downloader; -x extracts the tar archive into dest
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        # Set environment variables
        os.environ["WANDB_MODE"] = "offline"
        os.environ["NCCL_P2P_DISABLE"] = "1"
        os.environ["TORCH_NCCL_ENABLE_MONITORING"] = "0"
        os.environ["FINETRAINERS_LOG_LEVEL"] = "INFO"
        # Download weights on first run; later runs reuse the local cache
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)
    def predict(
        self,
        input_videos: Path = Input(description="ZIP file containing video dataset"),
        trigger_word: str = Input(description="Trigger word", default="afkx"),
        train_steps: int = Input(description="Number of training steps", default=500, ge=10, le=4000),
        rank: int = Input(description="LoRA rank", default=128, ge=16, le=128),
        batch_size: int = Input(description="Batch size", default=1, ge=1, le=4),
        gradient_accumulation_steps: int = Input(description="Gradient accumulation steps", default=1),
        seed: int = Input(description="Random seed (0 or negative picks a random seed)", default=0),
        hub_model_id: str = Input(description="Hugging Face model path to upload trained LoRA", default=None),
        hf_token: Secret = Input(description="Hugging Face token for model upload", default=None),
    ) -> Path:
"""Run training pipeline"""
if seed <=0:
seed = int.from_bytes(os.urandom(2), "big")
print(f"Using seed: {seed}")
# Cleanup past runs
print("Cleaning up past runs")
if os.path.exists(DATA_ROOT):
shutil.rmtree(DATA_ROOT)
if os.path.exists(OUTPUT_DIR):
shutil.rmtree(OUTPUT_DIR)
# Check if input_videos is a zip file
if not is_zipfile(input_videos):
raise ValueError("input_images must be a zip file")
# Extract files from the zip file
os.makedirs(DATA_ROOT, exist_ok=True)
file_count = 0
with ZipFile(input_videos, "r") as zip_ref:
for file_info in zip_ref.infolist():
if not file_info.filename.startswith(
"__MACOSX/"
) and not file_info.filename.startswith("._"):
zip_ref.extract(file_info, DATA_ROOT)
file_count += 1
print(f"Extracted {file_count} files from zip to folder: {DATA_ROOT}")
        # Set training arguments
        training_args = [
            "accelerate",
            "launch",
            "--config_file", "accelerate_configs/uncompiled_1.yaml",
            "--gpu_ids", GPU_IDS,
            "train.py",
            "--model_name", "hunyuan_video",
            "--pretrained_model_name_or_path", MODEL_CACHE,
            "--enable_tiling",
            "--enable_slicing",
            "--data_root", DATA_ROOT,
            "--caption_column", "prompts.txt",
            "--video_column", "videos.txt",
            "--seed", str(seed),
            "--rank", str(rank),
            "--lora_alpha", str(rank),
            "--mixed_precision", "bf16",
            "--output_dir", OUTPUT_DIR,
            "--batch_size", str(batch_size),
            "--id_token", trigger_word,
            "--caption_dropout_p", str(0.05),
            "--training_type", "lora",
            "--train_steps", str(train_steps),
            "--gradient_accumulation_steps", str(gradient_accumulation_steps),
            "--gradient_checkpointing",
            "--optimizer", "adamw",
            "--lr", str(2e-5),
            "--lr_scheduler", "constant_with_warmup",
            "--lr_warmup_steps", str(100),
            "--lr_num_cycles", str(1),
            "--beta1", str(0.9),
            "--beta2", str(0.95),
            "--max_grad_norm", str(1.0),
            "--weight_decay", str(1e-4),
            "--epsilon", str(1e-8),
            "--tracker_name", "replicate-hunyuanvideo",
        ]
        # Run the trainer
        print(f"Using args: {training_args}")
        subprocess.run(training_args, check=True, close_fds=False)

        # Optionally upload the trained LoRA to Hugging Face
        if hf_token is not None and hub_model_id is not None:
            token = hf_token.get_secret_value()
            # Pass the token as argv rather than through a shell string so it
            # is not exposed to shell interpolation
            subprocess.run(["huggingface-cli", "login", "--token", token], check=True)
            subprocess.run(["huggingface-cli", "upload", hub_model_id, OUTPUT_DIR], check=True)

        # Create output tar of the trained model
        output_path = "/tmp/trained_model.tar"
        subprocess.run(["tar", "-cvf", output_path, "-C", OUTPUT_DIR, "."], check=True)
        return Path(output_path)
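

# Example local invocation (a hedged sketch: the -i names match the Inputs
# declared above, the `cog` CLI is assumed to be installed, and my_videos.zip
# is a placeholder file name):
#
#   cog predict \
#       -i input_videos=@my_videos.zip \
#       -i trigger_word=afkx \
#       -i train_steps=500 \
#       -i rank=64
#
# The call returns trained_model.tar, a tarball of everything the trainer
# wrote to OUTPUT_DIR (the LoRA weights and any logs).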