Add SDTurbo pipeline #15

Draft · wants to merge 1 commit into main
11 changes: 11 additions & 0 deletions examples/react/src/App.tsx
@@ -55,6 +55,17 @@ const pipelines = [
hasImg2Img: false,
hasControlNet: false
},
{
name: 'SD Turbo (2.6GB)',
repo: 'cyrildiagne/sdturbo-onnx',
revision: 'main',
fp16: true,
width: 512,
height: 512,
steps: 1,
hasImg2Img: false,
hasControlNet: false,
},
// {
// name: 'LCM Dreamshaper FP32 (4.2GB)',
// repo: 'aislamov/lcm-dreamshaper-v7-onnx',
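Note: a minimal sketch of driving this new entry end to end, assuming the dispatch added in DiffusionPipeline.ts below routes this repo to the new pipeline; the prompt is illustrative, and the options mirror the config entry above:

import { DiffusionPipeline } from '@/pipelines/DiffusionPipeline'

const pipe = await DiffusionPipeline.fromPretrained('cyrildiagne/sdturbo-onnx')
const images = await pipe.run({
  prompt: 'a photo of an astronaut riding a horse',
  width: 512,
  height: 512,
  numInferenceSteps: 1, // SD Turbo is distilled for single-step sampling
})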
4 changes: 4 additions & 0 deletions src/pipelines/DiffusionPipeline.ts
@@ -21,6 +21,10 @@ export class DiffusionPipeline {
if (typeof index.controlnet !== 'undefined') {
return StableDiffusionControlNetPipeline.fromPretrained(modelRepoOrPath, options)
}
// Temporary heuristic to identify the SD Turbo model: its model_index.json declares EulerDiscreteScheduler
if (index.scheduler[1] === 'EulerDiscreteScheduler') {
return SDTurboPipeline.fromPretrained(modelRepoOrPath, options)
}
return StableDiffusionPipeline.fromPretrained(modelRepoOrPath, options)
case 'StableDiffusionXLPipeline':
case 'ORTStableDiffusionXLPipeline':
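For context, `index` here is the parsed model_index.json. A minimal sketch of the shape the heuristic assumes (scheduler entries follow diffusers' ['library', 'ClassName'] convention; other fields are omitted and the exact values depend on the export):

// Sketch of the relevant part of model_index.json, as parsed into `index` above
const index = {
  scheduler: ['diffusers', 'EulerDiscreteScheduler'], // index.scheduler[1] is the class name
}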
151 changes: 151 additions & 0 deletions src/pipelines/SDTurboPipeline.ts
@@ -0,0 +1,151 @@
import { Session } from '@/backends'
import { getModelJSON } from '@/hub'
import { GetModelFileOptions } from '@/hub/common'
import { PipelineBase } from '@/pipelines/PipelineBase'
import { EulerDiscreteScheduler } from '@/schedulers/EulerDiscreteScheduler'
import { SchedulerConfig } from '@/schedulers/SchedulerBase'
import { CLIPTokenizer } from '@/tokenizers/CLIPTokenizer'
import { randomNormalTensor } from '@/util/Tensor'
import { Tensor } from '@xenova/transformers'
import { PretrainedOptions, ProgressCallback, ProgressStatus, dispatchProgress, loadModel } from './common'

export interface SDTurboInput {
prompt: string
seed?: string
width?: number
height?: number
numInferenceSteps: number
sdV1?: boolean
progressCallback?: ProgressCallback
runVaeOnEachStep?: boolean
img2imgFlag?: boolean
inputImage?: Float32Array
strength?: number
}

export class SDTurboPipeline extends PipelineBase {
declare scheduler: EulerDiscreteScheduler

constructor (unet: Session, vaeDecoder: Session, vaeEncoder: Session, textEncoder: Session, tokenizer: CLIPTokenizer, scheduler: EulerDiscreteScheduler) {
super()
this.unet = unet
this.vaeDecoder = vaeDecoder
this.vaeEncoder = vaeEncoder
this.textEncoder = textEncoder
this.tokenizer = tokenizer
this.scheduler = scheduler
this.vaeScaleFactor = 8
}

static createScheduler (config: SchedulerConfig) {
return new EulerDiscreteScheduler(
{
prediction_type: 'epsilon',
...config,
},
)
}

static async fromPretrained (modelRepoOrPath: string, options?: PretrainedOptions) {
const opts: GetModelFileOptions = {
...options,
}

// Order matters: WASM memory cannot shrink once grown, so load the largest model first
const unet = await loadModel(
modelRepoOrPath,
'unet/model.onnx',
opts,
)
const textEncoder = await loadModel(modelRepoOrPath, 'text_encoder/model.onnx', opts)
const vaeEncoder = await loadModel(modelRepoOrPath, 'vae_encoder/model.onnx', opts)
const vaeDecoder = await loadModel(modelRepoOrPath, 'vae_decoder/model.onnx', opts)

const schedulerConfig = await getModelJSON(modelRepoOrPath, 'scheduler/scheduler_config.json', true, opts)
const scheduler = SDTurboPipeline.createScheduler(schedulerConfig)

const tokenizer = await CLIPTokenizer.from_pretrained(modelRepoOrPath, { ...opts, subdir: 'tokenizer' })
await dispatchProgress(opts.progressCallback, {
status: ProgressStatus.Ready,
})
return new SDTurboPipeline(unet, vaeDecoder, vaeEncoder, textEncoder, tokenizer, scheduler)
}

async run (input: SDTurboInput) {
const width = input.width || 512
const height = input.height || 512
const batchSize = 1
const seed = input.seed || ''
this.scheduler.setTimesteps(input.numInferenceSteps || 1)

await dispatchProgress(input.progressCallback, {
status: ProgressStatus.EncodingPrompt,
})

const promptEmbeds = await this.encodePrompt(input.prompt)

const latentShape = [batchSize, 4, height / this.vaeScaleFactor, width / this.vaeScaleFactor] // NCHW: height before width
let latents = randomNormalTensor(latentShape, undefined, undefined, 'float32', seed) // Normal latents used in Text-to-Image
const timesteps = this.scheduler.timesteps.data

latents = latents.mul(this.scheduler.initNoiseSigma)

let humanStep = 1
let cachedImages: Tensor[] | null = null

for (const step of timesteps) {
// SD v1.4 takes an int64 timestep input. Ideally we would read the input dtype from the model,
// but onnxruntime-node currently exposes only input names, not their types
const timestep = input.sdV1
? new Tensor(BigInt64Array.from([BigInt(step)]))
: new Tensor(new Float32Array([step]))
await dispatchProgress(input.progressCallback, {
status: ProgressStatus.RunningUnet,
unetTimestep: humanStep,
unetTotalSteps: timesteps.length,
})
const latentInput = this.scheduler.scaleInput(latents)

const noise = await this.unet.run(
{ sample: latentInput, timestep, encoder_hidden_states: promptEmbeds },
)

const noisePred = noise.out_sample

latents = this.scheduler.step(
noisePred,
step,
latents,
)

if (input.runVaeOnEachStep) {
await dispatchProgress(input.progressCallback, {
status: ProgressStatus.RunningVae,
unetTimestep: humanStep,
unetTotalSteps: timesteps.length,
})
cachedImages = await this.makeImages(latents)
}
humanStep++
}

await dispatchProgress(input.progressCallback, {
status: ProgressStatus.Done,
})

if (input.runVaeOnEachStep) {
return cachedImages!
}

return this.makeImages(latents)
}

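// Note: img2imgFlag, inputImage and strength are declared on SDTurboInput but not yet
// consumed by run(); encodeImage below is groundwork for a future img2img path.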
async encodeImage (inputImage: Float32Array, width: number, height: number) {
const encoded = await this.vaeEncoder.run(
{ sample: new Tensor('float32', inputImage, [1, 3, height, width]) },
)

const encodedImage = encoded.latent_sample
return encodedImage.mul(0.18215)
}
}
130 changes: 130 additions & 0 deletions src/schedulers/EulerDiscreteScheduler.ts
@@ -0,0 +1,130 @@
import { SchedulerBase, SchedulerConfig } from '@/schedulers/SchedulerBase'
import { cat, interp, linspace, randomNormalTensor, range } from '@/util/Tensor'
import { Tensor } from '@xenova/transformers'

/**
* Euler discrete scheduler
*/
export class EulerDiscreteScheduler extends SchedulerBase {
sigmas: Tensor
stepIndex: number = 0

constructor (
config: SchedulerConfig,
) {
super(config)
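// 'scaled_linear' beta schedule: linear in sqrt(beta) space, then squared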
this.betas = linspace(
config.beta_start ** 0.5,
config.beta_end ** 0.5,
config.num_train_timesteps,
).pow(2)

this.alphas = linspace(1, 1, config.num_train_timesteps).sub(this.betas)
this.alphasCumprod = this.alphas.cumprod()

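// sigma_t = sqrt((1 - alphaCumprod_t) / alphaCumprod_t), the noise scale implied by the cumulative alphas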
this.sigmas = linspace(1, 1, config.num_train_timesteps)
.sub(this.alphasCumprod)
.div(this.alphasCumprod)
.sqrt()
this.timesteps = linspace(
0,
config.num_train_timesteps - 1,
config.num_train_timesteps,
).reverse()

this.sigmas = cat([
this.sigmas.reverse(),
new Tensor(this.sigmas.type, [0], [1]),
])

this.config = config
}

setTimesteps (numInferenceSteps: number) {
this.numInferenceSteps = numInferenceSteps

const stepRatio = ~~(
this.config.num_train_timesteps / this.numInferenceSteps
)
this.timesteps = range(1, numInferenceSteps + 1)
.reverse()
.mul(stepRatio)
.round()
this.timesteps = this.timesteps.sub(1)

this.sigmas = linspace(1, 1, this.config.num_train_timesteps)
.sub(this.alphasCumprod)
.div(this.alphasCumprod)
.sqrt()
this.sigmas = interp(
this.timesteps,
range(0, this.sigmas.data.length),
this.sigmas,
)

this.sigmas = cat([this.sigmas, new Tensor(this.sigmas.type, [0], [1])])

this.stepIndex = 0
}

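// Scale the model input so the UNet sees roughly unit-variance latents: x / sqrt(sigma^2 + 1)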
scaleInput (input: Tensor) {
const sigma = this.sigmas.data[this.stepIndex]
const scale = (sigma ** 2 + 1) ** 0.5
return input.div(scale)
}

get initNoiseSigma () {
return Math.max(...this.sigmas.data)
}

step (
modelOutput: Tensor,
timestep: number,
sample: Tensor,
s_churn: number = 0.0,
s_tmin: number = 0.0,
s_tmax: number = Infinity,
s_noise: number = 1.0,
) {
if (this.numInferenceSteps === null) {
throw new Error(
"Number of inference steps is 'null', you need to run 'setTimesteps' after creating the scheduler",
)
}

const sigma = this.sigmas.data[this.stepIndex]

// Compute gamma (the stochastic churn amount), mirroring diffusers' EulerDiscreteScheduler.step
let gamma = 0.0
if (s_tmin <= sigma && sigma <= s_tmax) {
gamma = Math.min(
s_churn / (this.sigmas.data.length - 1),
Math.sqrt(2) - 1,
)
}

const noise = randomNormalTensor(modelOutput.dims)

const eps = noise.mul(s_noise)
const sigma_hat = sigma * (gamma + 1)

if (gamma > 0) {
sample = sample.add(eps.mul(Math.sqrt(sigma_hat ** 2 - sigma ** 2)))
}

// 1. Compute predicted original sample (x_0) from sigma-scaled predicted noise
// (assumes config.prediction_type === 'epsilon')
const denoised = sample.sub(modelOutput.mul(sigma_hat))

// 2. Convert to an ODE derivative
const derivative = sample.sub(denoised).div(sigma_hat)

const dt = this.sigmas.data[this.stepIndex + 1] - sigma_hat

const prevSample = sample.add(derivative.mul(dt))

this.stepIndex++

return prevSample
}
}
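Worked through for the single-step case (assuming the usual num_train_timesteps = 1000 from scheduler_config.json): setTimesteps(1) gives stepRatio = 1000, timesteps = [999] and sigmas = [sigma_999, 0]. In step(), the default s_churn = 0 makes gamma = 0 and sigma_hat = sigma, so dt = 0 - sigma and prevSample = sample + ((sample - denoised) / sigma) * (-sigma) = denoised: a single Euler step lands exactly on the model's denoised prediction, which is why one step suffices for SD Turbo.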
50 changes: 50 additions & 0 deletions src/util/Tensor.ts
@@ -227,6 +227,56 @@ Tensor.prototype.cos_ = function () {
return this
}

Tensor.prototype.sqrt = function () {
return this.clone().sqrt_()
}

Tensor.prototype.sqrt_ = function () {
for (let i = 0; i < this.data.length; ++i) {
this.data[i] = Math.sqrt(this.data[i])
}
return this
}

export function interp (
x: Tensor,
xp: Tensor,
fp: Tensor,
) {
if (xp.dims.length !== 1) {
throw new Error('xp must be 1 dimensional')
}
if (fp.dims.length !== 1) {
throw new Error('fp must be 1 dimensional')
}
if (xp.dims[0] !== fp.dims[0]) {
throw new Error('xp and fp must have the same length')
}
if (x.dims.length !== 1) {
throw new Error('x must be 1 dimensional')
}
const newDims = x.dims.slice()
// @ts-ignore
const newData = new x.data.constructor(newDims.reduce((a, b) => a * b))
const left = fp.data[0]
const right = fp.data[fp.data.length - 1]
for (let i = 0; i < newData.length; ++i) {
const index = xp.data.findIndex((v) => v > x.data[i])
if (index === -1) {
newData[i] = right
} else if (index === 0) {
newData[i] = left
} else {
const x1 = xp.data[index - 1]
const x2 = xp.data[index]
const y1 = fp.data[index - 1]
const y2 = fp.data[index]
newData[i] = ((x.data[i] - x1) * (y2 - y1)) / (x2 - x1) + y1
}
}
return new Tensor(x.type, newData, newDims)
}

Tensor.prototype.location = 'cpu'

export function range (start: number, end: number, step = 1, type = 'float32') {
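A quick usage sketch of the new interp helper, which mirrors numpy.interp (piecewise-linear interpolation, clamped to the endpoint values outside the sample range); the query values here are illustrative:

import { Tensor } from '@xenova/transformers'
import { interp } from '@/util/Tensor'

const x = new Tensor('float32', new Float32Array([0.5, 2.5]), [2]) // query points
const xp = new Tensor('float32', new Float32Array([0, 1, 2]), [3]) // sample x-coordinates
const fp = new Tensor('float32', new Float32Array([0, 10, 20]), [3]) // sample y-values
const y = interp(x, xp, fp) // -> [5, 20]: interpolated at 0.5, clamped at 2.5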