diff --git a/examples/react/src/App.tsx b/examples/react/src/App.tsx
index b256c56..6e20f6a 100644
--- a/examples/react/src/App.tsx
+++ b/examples/react/src/App.tsx
@@ -55,6 +55,17 @@ const pipelines = [
     hasImg2Img: false,
     hasControlNet: false
   },
+  {
+    name: 'SD Turbo (2.6GB)',
+    repo: 'cyrildiagne/sdturbo-onnx',
+    revision: 'main',
+    fp16: true,
+    width: 512,
+    height: 512,
+    steps: 1,
+    hasImg2Img: false,
+    hasControlNet: false,
+  },
   // {
   //   name: 'LCM Dreamshaper FP32 (4.2GB)',
   //   repo: 'aislamov/lcm-dreamshaper-v7-onnx',
diff --git a/src/pipelines/DiffusionPipeline.ts b/src/pipelines/DiffusionPipeline.ts
index 8571c4e..d8e99ea 100644
--- a/src/pipelines/DiffusionPipeline.ts
+++ b/src/pipelines/DiffusionPipeline.ts
@@ -21,6 +21,10 @@ export class DiffusionPipeline {
       if (typeof index.controlnet !== 'undefined') {
         return StableDiffusionControlNetPipeline.fromPretrained(modelRepoOrPath, options)
       }
+      // Temporary hack: identify SD Turbo by its scheduler class, since its model_index.json reports a plain StableDiffusionPipeline
+      if (index.scheduler[1] === 'EulerDiscreteScheduler') {
+        return SDTurboPipeline.fromPretrained(modelRepoOrPath, options)
+      }
       return StableDiffusionPipeline.fromPretrained(modelRepoOrPath, options)
     case 'StableDiffusionXLPipeline':
     case 'ORTStableDiffusionXLPipeline':
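For context on the hack above: `model_index.json` lists every pipeline component as a `[library, className]` pair, and SD Turbo exports report the plain `StableDiffusionPipeline` class name, so the scheduler entry is the only cheap way to tell them apart. A rough sketch of the shape being inspected (the component entries are illustrative of a typical export, not read from the actual repo):

```ts
// Illustrative shape of model_index.json for an SD Turbo export; the
// component values are assumptions, only the scheduler check matters here.
const index = {
  _class_name: 'StableDiffusionPipeline',
  scheduler: ['diffusers', 'EulerDiscreteScheduler'], // [library, className]
  unet: ['diffusers', 'UNet2DConditionModel'],
  vae: ['diffusers', 'AutoencoderKL'],
} as const

// The detection in DiffusionPipeline.fromPretrained reduces to:
const isSDTurbo = index.scheduler[1] === 'EulerDiscreteScheduler'
```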
diff --git a/src/pipelines/SDTurboPipeline.ts b/src/pipelines/SDTurboPipeline.ts
new file mode 100644
index 0000000..3d4c0ed
--- /dev/null
+++ b/src/pipelines/SDTurboPipeline.ts
@@ -0,0 +1,151 @@
+import { Session } from '@/backends'
+import { getModelJSON } from '@/hub'
+import { GetModelFileOptions } from '@/hub/common'
+import { PipelineBase } from '@/pipelines/PipelineBase'
+import { EulerDiscreteScheduler } from '@/schedulers/EulerDiscreteScheduler'
+import { SchedulerConfig } from '@/schedulers/SchedulerBase'
+import { CLIPTokenizer } from '@/tokenizers/CLIPTokenizer'
+import { randomNormalTensor } from '@/util/Tensor'
+import { Tensor } from '@xenova/transformers'
+import { PretrainedOptions, ProgressCallback, ProgressStatus, dispatchProgress, loadModel } from './common'
+
+export interface SDTurboInput {
+  prompt: string
+  seed?: string
+  width?: number
+  height?: number
+  numInferenceSteps: number
+  sdV1?: boolean
+  progressCallback?: ProgressCallback
+  runVaeOnEachStep?: boolean
+  img2imgFlag?: boolean
+  inputImage?: Float32Array
+  strength?: number
+}
+
+export class SDTurboPipeline extends PipelineBase {
+  declare scheduler: EulerDiscreteScheduler
+
+  constructor (unet: Session, vaeDecoder: Session, vaeEncoder: Session, textEncoder: Session, tokenizer: CLIPTokenizer, scheduler: EulerDiscreteScheduler) {
+    super()
+    this.unet = unet
+    this.vaeDecoder = vaeDecoder
+    this.vaeEncoder = vaeEncoder
+    this.textEncoder = textEncoder
+    this.tokenizer = tokenizer
+    this.scheduler = scheduler
+    this.vaeScaleFactor = 8
+  }
+
+  static createScheduler (config: SchedulerConfig) {
+    return new EulerDiscreteScheduler(
+      {
+        prediction_type: 'epsilon',
+        ...config,
+      },
+    )
+  }
+
+  static async fromPretrained (modelRepoOrPath: string, options?: PretrainedOptions) {
+    const opts: GetModelFileOptions = {
+      ...options,
+    }
+
+    // Order matters because WASM memory cannot be decreased, so we load the biggest model first
+    const unet = await loadModel(
+      modelRepoOrPath,
+      'unet/model.onnx',
+      opts,
+    )
+    const textEncoder = await loadModel(modelRepoOrPath, 'text_encoder/model.onnx', opts)
+    const vaeEncoder = await loadModel(modelRepoOrPath, 'vae_encoder/model.onnx', opts)
+    const vaeDecoder = await loadModel(modelRepoOrPath, 'vae_decoder/model.onnx', opts)
+
+    const schedulerConfig = await getModelJSON(modelRepoOrPath, 'scheduler/scheduler_config.json', true, opts)
+    const scheduler = SDTurboPipeline.createScheduler(schedulerConfig)
+
+    const tokenizer = await CLIPTokenizer.from_pretrained(modelRepoOrPath, { ...opts, subdir: 'tokenizer' })
+    await dispatchProgress(opts.progressCallback, {
+      status: ProgressStatus.Ready,
+    })
+    return new SDTurboPipeline(unet, vaeDecoder, vaeEncoder, textEncoder, tokenizer, scheduler)
+  }
+
+  async run (input: SDTurboInput) {
+    const width = input.width || 512
+    const height = input.height || 512
+    const batchSize = 1
+    const seed = input.seed || ''
+    this.scheduler.setTimesteps(input.numInferenceSteps || 1)
+
+    await dispatchProgress(input.progressCallback, {
+      status: ProgressStatus.EncodingPrompt,
+    })
+
+    const promptEmbeds = await this.encodePrompt(input.prompt)
+
+    // NCHW latents: plain Gaussian noise, as used for text-to-image
+    const latentShape = [batchSize, 4, height / 8, width / 8]
+    let latents = randomNormalTensor(latentShape, undefined, undefined, 'float32', seed)
+    const timesteps = this.scheduler.timesteps.data
+
+    latents = latents.mul(this.scheduler.initNoiseSigma)
+
+    let humanStep = 1
+    let cachedImages: Tensor[] | null = null
+
+    for (const step of timesteps) {
+      // SD v1.4 takes int64 as the timestep input. Ideally we would read the input dtype
+      // from the model, but onnxruntime-node currently only exposes input names, not types
+      const timestep = input.sdV1
+        ? new Tensor(BigInt64Array.from([BigInt(step)]))
+        : new Tensor(new Float32Array([step]))
+      await dispatchProgress(input.progressCallback, {
+        status: ProgressStatus.RunningUnet,
+        unetTimestep: humanStep,
+        unetTotalSteps: timesteps.length,
+      })
+      const latentInput = this.scheduler.scaleInput(latents)
+
+      const noise = await this.unet.run(
+        { sample: latentInput, timestep, encoder_hidden_states: promptEmbeds },
+      )
+
+      const noisePred = noise.out_sample
+
+      latents = this.scheduler.step(
+        noisePred,
+        step,
+        latents,
+      )
+
+      if (input.runVaeOnEachStep) {
+        await dispatchProgress(input.progressCallback, {
+          status: ProgressStatus.RunningVae,
+          unetTimestep: humanStep,
+          unetTotalSteps: timesteps.length,
+        })
+        cachedImages = await this.makeImages(latents)
+      }
+      humanStep++
+    }
+
+    await dispatchProgress(input.progressCallback, {
+      status: ProgressStatus.Done,
+    })
+
+    if (input.runVaeOnEachStep) {
+      return cachedImages!
+    }
+
+    return this.makeImages(latents)
+  }
+
+  async encodeImage (inputImage: Float32Array, width: number, height: number) {
+    const encoded = await this.vaeEncoder.run(
+      { sample: new Tensor('float32', inputImage, [1, 3, height, width]) },
+    )
+
+    const encodedImage = encoded.latent_sample
+    return encodedImage.mul(0.18215)
+  }
+}
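For reference, one-step text-to-image with the new pipeline might look like the sketch below. This is a minimal example, not part of the diff: the import uses the repo's `@/` alias, the model repo comes from the example entry added to `App.tsx` above, and all input fields are from `SDTurboInput`.

```ts
import { DiffusionPipeline } from '@/pipelines/DiffusionPipeline'

// SD Turbo is distilled for few-step sampling and runs without
// classifier-free guidance, so one UNet call already yields an image.
const pipe = await DiffusionPipeline.fromPretrained('cyrildiagne/sdturbo-onnx')

const images = await pipe.run({
  prompt: 'a photo of an astronaut riding a horse on mars',
  numInferenceSteps: 1,
  seed: '42',
})
// `images` is the Tensor[] returned by makeImages(); converting it to
// canvas pixels is left to the caller, as in the React example app.
```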
diff --git a/src/schedulers/EulerDiscreteScheduler.ts b/src/schedulers/EulerDiscreteScheduler.ts
new file mode 100644
index 0000000..fe4a64d
--- /dev/null
+++ b/src/schedulers/EulerDiscreteScheduler.ts
@@ -0,0 +1,130 @@
+import { SchedulerBase, SchedulerConfig } from '@/schedulers/SchedulerBase'
+import { cat, interp, linspace, randomNormalTensor, range } from '@/util/Tensor'
+import { Tensor } from '@xenova/transformers'
+
+/**
+ * Euler discrete scheduler, ported from the diffusers EulerDiscreteScheduler
+ */
+export class EulerDiscreteScheduler extends SchedulerBase {
+  sigmas: Tensor
+  stepIndex: number = 0
+
+  constructor (
+    config: SchedulerConfig,
+  ) {
+    super(config)
+    // "scaled_linear" beta schedule
+    this.betas = linspace(
+      config.beta_start ** 0.5,
+      config.beta_end ** 0.5,
+      config.num_train_timesteps,
+    ).pow(2)
+
+    this.alphas = linspace(1, 1, config.num_train_timesteps).sub(this.betas)
+    this.alphasCumprod = this.alphas.cumprod()
+
+    // sigma_t = sqrt((1 - alphaCumprod_t) / alphaCumprod_t)
+    this.sigmas = linspace(1, 1, config.num_train_timesteps)
+      .sub(this.alphasCumprod)
+      .div(this.alphasCumprod)
+      .sqrt()
+    this.timesteps = linspace(
+      0,
+      config.num_train_timesteps - 1,
+      config.num_train_timesteps,
+    ).reverse()
+
+    // descending sigmas, with a final 0 appended for the last step
+    this.sigmas = cat([
+      this.sigmas.reverse(),
+      new Tensor(this.sigmas.type, [0], [1]),
+    ])
+
+    this.config = config
+  }
+
+  setTimesteps (numInferenceSteps: number) {
+    this.numInferenceSteps = numInferenceSteps
+
+    // "trailing" timestep spacing, as used by SD Turbo
+    const stepRatio = ~~(
+      this.config.num_train_timesteps / this.numInferenceSteps
+    )
+    this.timesteps = range(1, numInferenceSteps + 1)
+      .reverse()
+      .mul(stepRatio)
+      .round()
+    this.timesteps = this.timesteps.sub(1)
+
+    this.sigmas = linspace(1, 1, this.config.num_train_timesteps)
+      .sub(this.alphasCumprod)
+      .div(this.alphasCumprod)
+      .sqrt()
+    this.sigmas = interp(
+      this.timesteps,
+      range(0, this.sigmas.data.length),
+      this.sigmas,
+    )
+
+    this.sigmas = cat([this.sigmas, new Tensor(this.sigmas.type, [0], [1])])
+
+    this.stepIndex = 0
+  }
+
+  scaleInput (input: Tensor) {
+    const sigma = this.sigmas.data[this.stepIndex]
+    const scale = (sigma ** 2 + 1) ** 0.5
+    return input.div(scale)
+  }
+
+  get initNoiseSigma () {
+    return Math.max(...this.sigmas.data)
+  }
+
+  step (
+    modelOutput: Tensor,
+    timestep: number,
+    sample: Tensor,
+    s_churn: number = 0.0,
+    s_tmin: number = 0.0,
+    s_tmax: number = Infinity,
+    s_noise: number = 1.0,
+  ) {
+    if (this.numInferenceSteps === null) {
+      throw new Error(
+        "Number of inference steps is 'null', you need to run 'setTimesteps' after creating the scheduler",
+      )
+    }
+
+    const sigma = this.sigmas.data[this.stepIndex]
+
+    // compute gamma, following the diffusers implementation
+    let gamma = 0.0
+    if (s_tmin <= sigma && sigma <= s_tmax) {
+      gamma = Math.min(
+        s_churn / (this.sigmas.data.length - 1),
+        Math.sqrt(2) - 1,
+      )
+    }
+
+    const noise = randomNormalTensor(modelOutput.dims)
+
+    const eps = noise.mul(s_noise)
+    const sigma_hat = sigma * (gamma + 1)
+
+    if (gamma > 0) {
+      sample = sample.add(eps.mul(Math.sqrt(sigma_hat ** 2 - sigma ** 2)))
+    }
+
+    // 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+    // (config.prediction_type === 'epsilon')
+    const denoised = sample.sub(modelOutput.mul(sigma_hat))
+
+    // 2. convert to an ODE derivative
+    const derivative = sample.sub(denoised).div(sigma_hat)
+
+    const dt = this.sigmas.data[this.stepIndex + 1] - sigma_hat
+
+    const prevSample = sample.add(derivative.mul(dt))
+
+    this.stepIndex++
+
+    return prevSample
+  }
+}
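In epsilon-prediction terms, `step()` performs one Euler update of the sigma-parameterized ODE: the predicted original sample is `x0 = x - sigmaHat * epsHat`, the derivative `(x - x0) / sigmaHat` collapses back to `epsHat`, and the sample moves by `derivative * (sigmaNext - sigmaHat)`. A scalar sketch of the same arithmetic (a hypothetical helper, not part of the diff; gamma is taken as 0, the default when `s_churn` is 0):

```ts
// Scalar version of the update in step() above, for gamma = 0.
function eulerStep (sample: number, epsHat: number, sigma: number, sigmaNext: number): number {
  const denoised = sample - epsHat * sigma // 1. predicted x_0
  const derivative = (sample - denoised) / sigma // 2. ODE derivative (equals epsHat)
  const dt = sigmaNext - sigma
  return sample + derivative * dt
}

// With a single inference step, sigmaNext is the appended 0, so the update
// lands exactly on the predicted x_0: eulerStep(x, e, s, 0) === x - e * s.
```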
diff --git a/src/util/Tensor.ts b/src/util/Tensor.ts
index cb3d223..03b45a6 100644
--- a/src/util/Tensor.ts
+++ b/src/util/Tensor.ts
@@ -227,6 +227,56 @@ Tensor.prototype.cos_ = function () {
   return this
 }
 
+Tensor.prototype.sqrt = function () {
+  return this.clone().sqrt_()
+}
+
+Tensor.prototype.sqrt_ = function () {
+  for (let i = 0; i < this.data.length; ++i) {
+    this.data[i] = Math.sqrt(this.data[i])
+  }
+  return this
+}
+
+export function interp (
+  x: Tensor,
+  xp: Tensor,
+  fp: Tensor,
+) {
+  if (xp.dims.length !== 1) {
+    throw new Error('xp must be 1 dimensional')
+  }
+  if (fp.dims.length !== 1) {
+    throw new Error('fp must be 1 dimensional')
+  }
+  if (xp.dims[0] !== fp.dims[0]) {
+    throw new Error('xp and fp must have the same length')
+  }
+  if (x.dims.length !== 1) {
+    throw new Error('x must be 1 dimensional')
+  }
+  const newDims = x.dims.slice()
+  // @ts-ignore
+  const newData = new x.data.constructor(newDims.reduce((a, b) => a * b))
+  const left = fp.data[0]
+  const right = fp.data[fp.data.length - 1]
+  for (let i = 0; i < newData.length; ++i) {
+    const index = xp.data.findIndex((v) => v > x.data[i])
+    if (index === -1) {
+      newData[i] = right
+    } else if (index === 0) {
+      newData[i] = left
+    } else {
+      const x1 = xp.data[index - 1]
+      const x2 = xp.data[index]
+      const y1 = fp.data[index - 1]
+      const y2 = fp.data[index]
+      newData[i] = ((x.data[i] - x1) * (y2 - y1)) / (x2 - x1) + y1
+    }
+  }
+  return new Tensor(x.type, newData, newDims)
+}
+
 Tensor.prototype.location = 'cpu'
 
 export function range (start: number, end: number, step = 1, type = 'float32') {
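The `interp` helper mirrors `numpy.interp`: `xp` must be increasing, and query points outside its range clamp to the endpoint values, which is what `setTimesteps` relies on when it samples the `num_train_timesteps`-entry sigma table at the chosen timesteps. A small sketch with illustrative numbers:

```ts
import { Tensor } from '@xenova/transformers'
import { interp, range } from '@/util/Tensor'

const xp = range(0, 4) // breakpoints [0, 1, 2, 3]
const fp = new Tensor('float32', new Float32Array([10, 8, 4, 1]), [4])
const x = new Tensor('float32', new Float32Array([0.5, 2.5, 9]), [3])

const y = interp(x, xp, fp)
// y.data -> [9, 2.5, 1]: linear between breakpoints, clamped past the end
```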