Wan MoE: Automatic expert routing based on timestep boundary

stduhpf · leejet · commit 5bf5c1b52e57 · 2025-09-07T01:13:14.000+08:00
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -113,10 +113,12 @@ struct SDParams {
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask  = false;
     int chroma_t5_mask_pad   = 1;
+    float boundary           = 0.875f;  // timestep boundary for Wan2.2 MoE expert switching (--moe-boundary)
 
     SDParams() {
         sd_sample_params_init(&sample_params);
         sd_sample_params_init(&high_noise_sample_params);
+        high_noise_sample_params.sample_steps = -1;  // -1 = auto: the high-noise step count is derived from `boundary`
     }
 };
 
@@ -243,7 +245,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)\n");
     printf("  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
     printf("                                     (high noise) sampling method (default: \"euler_a\")\n");
-    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: 20)\n");
+    printf("  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
     printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
@@ -274,6 +276,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma\n");
     printf("  --video-frames                     video frames (default: 1)\n");
     printf("  --fps                              fps (default: 24)\n");
+    printf("  --moe-boundary BOUNDARY            Timestep boundary for Wan2.2 MoE model. (default: 0.875)\n");
+    printf("                                     Only enabled if `--high-noise-steps` is set to -1\n");
     printf("  -v, --verbose                      print extra info\n");
 }
 
@@ -507,6 +511,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--strength", "", &params.strength},
         {"", "--style-ratio", "", &params.style_ratio},
         {"", "--control-strength", "", &params.control_strength},
+        {"", "--moe-boundary", "", &params.boundary},
     };
 
     options.bool_options = {
@@ -767,8 +772,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     }
 
     if (params.high_noise_sample_params.sample_steps <= 0) {
-        fprintf(stderr, "error: the high_noise_sample_steps must be greater than 0\n");
-        exit(1);
+        params.high_noise_sample_params.sample_steps = -1;  // any non-positive value is normalized to the -1 "auto" sentinel instead of being rejected
     }
 
     if (params.strength < 0.f || params.strength > 1.f) {
@@ -1225,6 +1229,7 @@ int main(int argc, const char* argv[]) {
             params.strength,
             params.seed,
             params.video_frames,
+            params.boundary
         };
 
         results = generate_video(sd_ctx, &vid_gen_params, &num_results);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -1727,11 +1727,13 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
     sd_sample_params_init(&sd_vid_gen_params->sample_params);
     sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
-    sd_vid_gen_params->width        = 512;
-    sd_vid_gen_params->height       = 512;
-    sd_vid_gen_params->strength     = 0.75f;
-    sd_vid_gen_params->seed         = -1;
-    sd_vid_gen_params->video_frames = 6;
+    sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;  // -1 = auto: generate_video derives the count from `boundary`
+    sd_vid_gen_params->width                                 = 512;
+    sd_vid_gen_params->height                                = 512;
+    sd_vid_gen_params->strength                              = 0.75f;
+    sd_vid_gen_params->seed                                  = -1;
+    sd_vid_gen_params->video_frames                          = 6;
+    sd_vid_gen_params->boundary                              = 0.875f;  // default sigma threshold used for the automatic high-noise split
 }
 
 struct sd_ctx_t {
@@ -2381,7 +2383,29 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
         high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
     }
 
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps + high_noise_sample_steps);
+    // Only an explicit (positive) high-noise step count extends the schedule;
+    // a negative count means "auto" and is resolved from the sigmas below.
+    int total_steps = sample_steps;
+
+    if (high_noise_sample_steps > 0) {
+        total_steps += high_noise_sample_steps;
+    }
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps);
+
+    if (high_noise_sample_steps < 0) {
+        // timesteps∝sigmas for Flow models (like wan2.2 a14b)
+        // Fall back to the whole schedule when no sigma is below the boundary,
+        // so the count never stays negative and sample_steps never exceeds total_steps.
+        high_noise_sample_steps = total_steps;
+        for (size_t i = 0; i < sigmas.size(); ++i) {
+            if (sigmas[i] < sd_vid_gen_params->boundary) {
+                high_noise_sample_steps = static_cast<int>(i);
+                break;
+            }
+        }
+        LOG_DEBUG("Switching from high noise model at step %d", high_noise_sample_steps);
+        sample_steps = total_steps - high_noise_sample_steps;
+    }
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(200 * 1024) * 1024;  // 200 MB
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -208,6 +208,7 @@ typedef struct {
     float strength;
     int64_t seed;
     int video_frames;
+    float boundary;  // sigma threshold for the Wan2.2 MoE high-noise split; consulted when high_noise_sample_params.sample_steps is negative
 } sd_vid_gen_params_t;
 
 typedef struct sd_ctx_t sd_ctx_t;