FLUX memory management improvements #6791

Merged 11 commits on Aug 29, 2024
24 changes: 15 additions & 9 deletions invokeai/app/invocations/flux_text_encoder.py
@@ -40,20 +40,18 @@ class FluxTextEncoderInvocation(BaseInvocation):

@torch.no_grad()
def invoke(self, context: InvocationContext) -> FluxConditioningOutput:
- t5_embeddings, clip_embeddings = self._encode_prompt(context)
+ # Note: The T5 and CLIP encoding are done in separate functions to ensure that all model references are locally
+ # scoped. This ensures that the T5 model can be freed and gc'd before loading the CLIP model (if necessary).
+ t5_embeddings = self._t5_encode(context)
+ clip_embeddings = self._clip_encode(context)
conditioning_data = ConditioningFieldData(
conditionings=[FLUXConditioningInfo(clip_embeds=clip_embeddings, t5_embeds=t5_embeddings)]
)

conditioning_name = context.conditioning.save(conditioning_data)
return FluxConditioningOutput.build(conditioning_name)

- def _encode_prompt(self, context: InvocationContext) -> tuple[torch.Tensor, torch.Tensor]:
- # Load CLIP.
- clip_tokenizer_info = context.models.load(self.clip.tokenizer)
- clip_text_encoder_info = context.models.load(self.clip.text_encoder)
-
- # Load T5.
+ def _t5_encode(self, context: InvocationContext) -> torch.Tensor:
t5_tokenizer_info = context.models.load(self.t5_encoder.tokenizer)
t5_text_encoder_info = context.models.load(self.t5_encoder.text_encoder)

@@ -70,6 +68,15 @@ def _encode_prompt(self, context: InvocationContext) -> tuple[torch.Tensor, torch.Tensor]:

prompt_embeds = t5_encoder(prompt)

+ assert isinstance(prompt_embeds, torch.Tensor)
+ return prompt_embeds
+
+ def _clip_encode(self, context: InvocationContext) -> torch.Tensor:
+ clip_tokenizer_info = context.models.load(self.clip.tokenizer)
+ clip_text_encoder_info = context.models.load(self.clip.text_encoder)
+
+ prompt = [self.prompt]
+
with (
clip_text_encoder_info as clip_text_encoder,
clip_tokenizer_info as clip_tokenizer,
@@ -81,6 +88,5 @@ def _encode_prompt(self, context: InvocationContext) -> tuple[torch.Tensor, torch.Tensor]:

pooled_prompt_embeds = clip_encoder(prompt)

- assert isinstance(prompt_embeds, torch.Tensor)
assert isinstance(pooled_prompt_embeds, torch.Tensor)
- return prompt_embeds, pooled_prompt_embeds
+ return pooled_prompt_embeds
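The change above keeps each encoder reference locally scoped inside its own method, so the T5 weights become unreachable before the CLIP model is loaded. Below is a minimal, self-contained sketch of the same idea, using toy encoders in place of the real model loads; all names here are illustrative, not InvokeAI APIs, and the explicit gc.collect() is only there to make the point visible.

import gc

import torch

class ToyEncoder(torch.nn.Module):
    """Stand-in for a large text encoder such as T5 or CLIP."""

    def __init__(self, dim: int) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(16, dim)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        return self.proj(tokens)

def _t5_encode(tokens: torch.Tensor) -> torch.Tensor:
    # The encoder is referenced only inside this function, so its weights
    # become unreachable as soon as it returns.
    encoder = ToyEncoder(dim=64)  # imagine a multi-GB load here
    with torch.no_grad():
        return encoder(tokens)

def _clip_encode(tokens: torch.Tensor) -> torch.Tensor:
    encoder = ToyEncoder(dim=32)
    with torch.no_grad():
        return encoder(tokens)

tokens = torch.randn(1, 16)
t5_embeds = _t5_encode(tokens)
gc.collect()  # the first encoder can now be reclaimed before the second load
clip_embeds = _clip_encode(tokens)
print(t5_embeds.shape, clip_embeds.shape)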
33 changes: 15 additions & 18 deletions invokeai/app/invocations/flux_text_to_image.py
@@ -58,26 +58,28 @@ class FluxTextToImageInvocation(BaseInvocation, WithMetadata, WithBoard):

@torch.no_grad()
def invoke(self, context: InvocationContext) -> ImageOutput:
- # Load the conditioning data.
- cond_data = context.conditioning.load(self.positive_text_conditioning.conditioning_name)
- assert len(cond_data.conditionings) == 1
- flux_conditioning = cond_data.conditionings[0]
- assert isinstance(flux_conditioning, FLUXConditioningInfo)
-
- latents = self._run_diffusion(context, flux_conditioning.clip_embeds, flux_conditioning.t5_embeds)
+ latents = self._run_diffusion(context)
image = self._run_vae_decoding(context, latents)
image_dto = context.images.save(image=image)
return ImageOutput.build(image_dto)

def _run_diffusion(
self,
context: InvocationContext,
- clip_embeddings: torch.Tensor,
- t5_embeddings: torch.Tensor,
):
- transformer_info = context.models.load(self.transformer.transformer)
inference_dtype = torch.bfloat16

+ # Load the conditioning data.
+ cond_data = context.conditioning.load(self.positive_text_conditioning.conditioning_name)
+ assert len(cond_data.conditionings) == 1
+ flux_conditioning = cond_data.conditionings[0]
+ assert isinstance(flux_conditioning, FLUXConditioningInfo)
+ flux_conditioning = flux_conditioning.to(dtype=inference_dtype)
+ t5_embeddings = flux_conditioning.t5_embeds
+ clip_embeddings = flux_conditioning.clip_embeds
+
+ transformer_info = context.models.load(self.transformer.transformer)
+
# Prepare input noise.
x = get_noise(
num_samples=1,
@@ -88,24 +90,19 @@ def _run_diffusion(
seed=self.seed,
)

- img, img_ids = prepare_latent_img_patches(x)
+ x, img_ids = prepare_latent_img_patches(x)

is_schnell = "schnell" in transformer_info.config.config_path

timesteps = get_schedule(
num_steps=self.num_steps,
- image_seq_len=img.shape[1],
+ image_seq_len=x.shape[1],
shift=not is_schnell,
)

bs, t5_seq_len, _ = t5_embeddings.shape
txt_ids = torch.zeros(bs, t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device())

- # HACK(ryand): Manually empty the cache. Currently we don't check the size of the model before loading it from
- # disk. Since the transformer model is large (24GB), there's a good chance that it will OOM on 32GB RAM systems
- # if the cache is not empty.
- context.models._services.model_manager.load.ram_cache.make_room(24 * 2**30)

with transformer_info as transformer:
assert isinstance(transformer, Flux)

@@ -140,7 +137,7 @@ def step_callback() -> None:

x = denoise(
model=transformer,
- img=img,
+ img=x,
img_ids=img_ids,
txt=t5_embeddings,
txt_ids=txt_ids,
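In the reworked _run_diffusion above, the conditioning tensors are loaded and cast to the inference dtype before the transformer is brought into memory, and the hard-coded make_room(24 GB) call is gone (see the loader change further down). A rough, self-contained sketch of that ordering follows; the function name, shapes, and the tiny Linear stand-in for the transformer are illustrative only, not the actual invocation code.

import torch

def run_diffusion_sketch(t5_embeds: torch.Tensor, clip_embeds: torch.Tensor) -> torch.Tensor:
    inference_dtype = torch.bfloat16

    # 1) Cast the comparatively small conditioning tensors first.
    t5_embeds = t5_embeds.to(dtype=inference_dtype)
    clip_embeds = clip_embeds.to(dtype=inference_dtype)

    # 2) Only now materialize the large model, in the same dtype, so the
    #    denoising loop does not have to re-cast its inputs on every step.
    transformer = torch.nn.Linear(t5_embeds.shape[-1], 64).to(dtype=inference_dtype)

    with torch.no_grad():
        return transformer(t5_embeds)

out = run_diffusion_sketch(torch.randn(1, 8, 128), torch.randn(1, 32))
print(out.shape, out.dtype)  # torch.Size([1, 8, 64]) torch.bfloat16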
17 changes: 4 additions & 13 deletions invokeai/backend/flux/sampling.py
@@ -111,16 +111,7 @@ def denoise(
step_callback: Callable[[], None],
guidance: float = 4.0,
):
- dtype = model.txt_in.bias.dtype
-
- # TODO(ryand): This shouldn't be necessary if we manage the dtypes properly in the caller.
- img = img.to(dtype=dtype)
- img_ids = img_ids.to(dtype=dtype)
- txt = txt.to(dtype=dtype)
- txt_ids = txt_ids.to(dtype=dtype)
- vec = vec.to(dtype=dtype)
-
- # this is ignored for schnell
+ # guidance_vec is ignored for schnell.
guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
for t_curr, t_prev in tqdm(list(zip(timesteps[:-1], timesteps[1:], strict=True))):
t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
@@ -168,9 +159,9 @@ def prepare_latent_img_patches(latent_img: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
img = repeat(img, "1 ... -> bs ...", bs=bs)

# Generate patch position ids.
- img_ids = torch.zeros(h // 2, w // 2, 3, device=img.device)
- img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2, device=img.device)[:, None]
- img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2, device=img.device)[None, :]
+ img_ids = torch.zeros(h // 2, w // 2, 3, device=img.device, dtype=img.dtype)
+ img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2, device=img.device, dtype=img.dtype)[:, None]
+ img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2, device=img.device, dtype=img.dtype)[None, :]
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

return img, img_ids
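Because denoise() no longer casts its inputs, prepare_latent_img_patches now builds img_ids directly in the latent's dtype and device. The following standalone sketch shows that position-id construction with toy shapes, using plain reshape/expand in place of einops.repeat; it is an illustration of the pattern, not the repository function.

import torch

def make_patch_ids(latent_img: torch.Tensor) -> torch.Tensor:
    bs, _, h, w = latent_img.shape
    # Build the (h/2, w/2, 3) position ids in the latent's dtype and device,
    # so no downstream .to(dtype=...) cast is needed.
    img_ids = torch.zeros(h // 2, w // 2, 3, device=latent_img.device, dtype=latent_img.dtype)
    img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2, device=latent_img.device, dtype=latent_img.dtype)[:, None]
    img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2, device=latent_img.device, dtype=latent_img.dtype)[None, :]
    # Flatten the spatial grid and broadcast across the batch: (bs, h/2 * w/2, 3).
    return img_ids.reshape(1, -1, 3).expand(bs, -1, -1)

latent = torch.randn(2, 16, 64, 64, dtype=torch.bfloat16)
ids = make_patch_ids(latent)
print(ids.shape, ids.dtype)  # torch.Size([2, 1024, 3]) torch.bfloat16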
1 change: 1 addition & 0 deletions invokeai/backend/model_manager/load/load_default.py
@@ -72,6 +72,7 @@ def _load_and_cache(self, config: AnyModelConfig, submodel_type: Optional[SubMod
pass

config.path = str(self._get_model_path(config))
+ self._ram_cache.make_room(self.get_size_fs(config, Path(config.path), submodel_type))
loaded_model = self._load_model(config, submodel_type)

self._ram_cache.put(
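This one-line addition asks the RAM cache to make room for the model's estimated on-disk size before it is loaded, which is what allows the hard-coded make_room(24 * 2**30) hack to be removed from the FLUX invocation above. A toy sketch of the eviction idea follows; it is a simplified stand-in with made-up sizes, not the actual ModelCache implementation.

from collections import OrderedDict

class ToyRamCache:
    """Least-recently-used cache that evicts entries until a requested size fits."""

    def __init__(self, max_bytes: int) -> None:
        self.max_bytes = max_bytes
        self._entries: OrderedDict[str, int] = OrderedDict()  # key -> size in bytes

    def cache_size(self) -> int:
        return sum(self._entries.values())

    def make_room(self, size: int) -> None:
        # Evict least-recently-used entries until `size` additional bytes fit.
        while self._entries and self.cache_size() + size > self.max_bytes:
            evicted_key, evicted_size = self._entries.popitem(last=False)
            print(f"evicting {evicted_key} ({evicted_size / 2**30:.0f} GiB)")

    def put(self, key: str, size: int) -> None:
        self._entries[key] = size

cache = ToyRamCache(max_bytes=32 * 2**30)
cache.put("t5_encoder", 10 * 2**30)
cache.put("clip_encoder", 1 * 2**30)
cache.make_room(24 * 2**30)  # evicts the T5 entry so a ~24 GiB transformer fits
cache.put("flux_transformer", 24 * 2**30)
print(cache.cache_size() // 2**30, "GiB cached")  # 25 GiB cached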
@@ -193,15 +193,6 @@ def get(
"""
pass

- @abstractmethod
- def exists(
- self,
- key: str,
- submodel_type: Optional[SubModelType] = None,
- ) -> bool:
- """Return true if the model identified by key and submodel_type is in the cache."""
- pass

@abstractmethod
def cache_size(self) -> int:
"""Get the total size of the models currently cached."""