invoke-ai · hipsterusername · Nov 30, 2023 · Nov 17, 2023 · Nov 20, 2023 · Nov 20, 2023
@@ -79,6 +79,12 @@
 
 SAMPLER_NAME_VALUES = Literal[tuple(SCHEDULER_MAP.keys())]
 
+# The ratio of image:latent dimensions is LATENT_SCALE_FACTOR:1, or 8:1. Code in this module multiplies latent-space
+# sizes by this value to get image-space sizes, and integer-divides image-space sizes by it to get latent-space sizes.
+# HACK: Many nodes are currently hard-coded to use a fixed latent scale factor of 8. This is fragile, and will need to
+# be addressed if future models use a different latent scale factor. Also, note that there may be places where the scale
+# factor is hard-coded to a literal '8' rather than using this constant.
+LATENT_SCALE_FACTOR = 8
+
 
 @invocation_output("scheduler_output")
 class SchedulerOutput(BaseInvocationOutput):
@@ -394,9 +400,9 @@ def prep_control_data(
         exit_stack: ExitStack,
         do_classifier_free_guidance: bool = True,
     ) -> List[ControlNetData]:
-        # assuming fixed dimensional scaling of 8:1 for image:latents
-        control_height_resize = latents_shape[2] * 8
-        control_width_resize = latents_shape[3] * 8
+        # Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
+        control_height_resize = latents_shape[2] * LATENT_SCALE_FACTOR
+        control_width_resize = latents_shape[3] * LATENT_SCALE_FACTOR
         if control_input is None:
             control_list = None
         elif isinstance(control_input, list) and len(control_input) == 0:
@@ -909,12 +915,12 @@ class ResizeLatentsInvocation(BaseInvocation):
     )
     width: int = InputField(
         ge=64,
-        multiple_of=8,
+        multiple_of=LATENT_SCALE_FACTOR,
         description=FieldDescriptions.width,
     )
     height: int = InputField(
         ge=64,
-        multiple_of=8,
+        multiple_of=LATENT_SCALE_FACTOR,
         description=FieldDescriptions.width,
     )
     mode: LATENTS_INTERPOLATION_MODE = InputField(default="bilinear", description=FieldDescriptions.interp_mode)
@@ -928,7 +934,7 @@ def invoke(self, context: InvocationContext) -> LatentsOutput:
 
         resized_latents = torch.nn.functional.interpolate(
             latents.to(device),
-            size=(self.height // 8, self.width // 8),
+            size=(self.height // LATENT_SCALE_FACTOR, self.width // LATENT_SCALE_FACTOR),
             mode=self.mode,
             antialias=self.antialias if self.mode in ["bilinear", "bicubic"] else False,
         )
@@ -1166,3 +1172,60 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
         # context.services.latents.set(name, resized_latents)
         context.services.latents.save(name, blended_latents)
         return build_latents_output(latents_name=name, latents=blended_latents)
+
+
+# The Crop Latents node was copied from @skunkworxdark's implementation here:
+# https://github.com/skunkworxdark/XYGrid_nodes/blob/74647fa9c1fa57d317a94bd43ca689af7f0aae5e/images_to_grids.py#L1117C1-L1167C80
+@invocation("crop_latents", title="Crop Latents", tags=["latents", "crop"], category="latents", version="1.0.0")
+# TODO(ryand): Named `CropLatentsCoreInvocation` to prevent a conflict with custom node `CropLatentsInvocation`.
+# Currently, if the class names conflict then 'GET /openapi.json' fails.
+class CropLatentsCoreInvocation(BaseInvocation):
+    """Crops a latent-space tensor to a box specified in image-space. The box dimensions and coordinates must be
+    divisible by the latent scale factor of 8.
+    """
+
+    # The latents to crop; declared as a connection-only input.
+    latents: LatentsField = InputField(description=FieldDescriptions.latents, input=Input.Connection)
+
+    # Crop rectangle, expressed in image-space pixels. Every value is constrained to a multiple of
+    # LATENT_SCALE_FACTOR, so the integer divisions in invoke() are exact.
+    x: int = InputField(
+        ge=0,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The left x coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+    y: int = InputField(
+        ge=0,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The top y coordinate (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+    width: int = InputField(
+        ge=1,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The width (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+    height: int = InputField(
+        ge=1,
+        multiple_of=LATENT_SCALE_FACTOR,
+        description="The height (in px) of the crop rectangle in image space. This value will be converted to a dimension in latent space.",
+    )
+
+    def invoke(self, context: InvocationContext) -> LatentsOutput:
+        """Slice the crop box out of the input latents, save the result, and return it as a latents output."""
+        source_latents = context.services.latents.get(self.latents.latents_name)
+
+        # Translate the image-space box into latent-space index ranges.
+        left = self.x // LATENT_SCALE_FACTOR
+        top = self.y // LATENT_SCALE_FACTOR
+        columns = slice(left, left + (self.width // LATENT_SCALE_FACTOR))
+        rows = slice(top, top + (self.height // LATENT_SCALE_FACTOR))
+
+        # The last two axes are indexed as (y, x); any leading axes are kept whole.
+        cropped_latents = source_latents[..., rows, columns]
+
+        # Save under a name derived from the graph execution state id and this node's id.
+        name = f"{context.graph_execution_state_id}__{self.id}"
+        context.services.latents.save(name, cropped_latents)
+
+        return build_latents_output(latents_name=name, latents=cropped_latents)
@@ -0,0 +1,181 @@
+import numpy as np
+from PIL import Image
+from pydantic import BaseModel
+
+from invokeai.app.invocations.baseinvocation import (
+    BaseInvocation,
+    BaseInvocationOutput,
+    InputField,
+    InvocationContext,
+    OutputField,
+    WithMetadata,
+    WithWorkflow,
+    invocation,
+    invocation_output,
+)
+from invokeai.app.invocations.primitives import ImageField, ImageOutput
+from invokeai.app.services.image_records.image_records_common import ImageCategory, ResourceOrigin
+from invokeai.backend.tiles.tiles import calc_tiles_with_overlap, merge_tiles_with_linear_blending
+from invokeai.backend.tiles.utils import Tile
+
+
+# Pairs a tile description with the image field for that tile. PairTileImageInvocation produces instances of this
+# model and MergeTilesToImageInvocation consumes a list of them.
+class TileWithImage(BaseModel):
+    # The tile description (read elsewhere in this file via its `coords` and `overlap` attributes).
+    tile: Tile
+    # Reference to the image that belongs to `tile`.
+    image: ImageField
+
+
+@invocation_output("calculate_image_tiles_output")
+class CalculateImageTilesOutput(BaseInvocationOutput):
+    # Output type of CalculateImageTilesInvocation: the tile list returned by calc_tiles_with_overlap().
+    tiles: list[Tile] = OutputField(description="The tiles coordinates that cover a particular image shape.")
+
+
+@invocation("calculate_image_tiles", title="Calculate Image Tiles", tags=["tiles"], category="tiles", version="1.0.0")
+class CalculateImageTilesInvocation(BaseInvocation):
+    """Calculate the coordinates and overlaps of tiles that cover a target image shape."""
+
+    # Shape of the image to cover.
+    image_width: int = InputField(
+        ge=1,
+        default=1024,
+        description="The image width, in pixels, to calculate tiles for.",
+    )
+    image_height: int = InputField(
+        ge=1,
+        default=1024,
+        description="The image height, in pixels, to calculate tiles for.",
+    )
+
+    # Shape of a single tile.
+    tile_width: int = InputField(
+        ge=1,
+        default=576,
+        description="The tile width, in pixels.",
+    )
+    tile_height: int = InputField(
+        ge=1,
+        default=576,
+        description="The tile height, in pixels.",
+    )
+
+    overlap: int = InputField(
+        ge=0,
+        default=128,
+        description="The target overlap, in pixels, between adjacent tiles. Adjacent tiles will overlap by at least this amount",
+    )
+
+    def invoke(self, context: InvocationContext) -> CalculateImageTilesOutput:
+        """Delegate the tile layout to calc_tiles_with_overlap() and wrap its result in the node output."""
+        return CalculateImageTilesOutput(
+            tiles=calc_tiles_with_overlap(
+                image_height=self.image_height,
+                image_width=self.image_width,
+                tile_height=self.tile_height,
+                tile_width=self.tile_width,
+                overlap=self.overlap,
+            )
+        )
+
+
+@invocation_output("tile_to_properties_output")
+class TileToPropertiesOutput(BaseInvocationOutput):
+    # Output type of TileToPropertiesInvocation: a Tile flattened into individual integer fields.
+
+    # Tile bounds, relative to the parent image.
+    coords_left: int = OutputField(description="Left coordinate of the tile relative to its parent image.")
+    coords_right: int = OutputField(description="Right coordinate of the tile relative to its parent image.")
+    coords_top: int = OutputField(description="Top coordinate of the tile relative to its parent image.")
+    coords_bottom: int = OutputField(description="Bottom coordinate of the tile relative to its parent image.")
+
+    # HACK: The width and height fields are 'meta' fields that can easily be calculated from the other fields on this
+    # object. Including redundant fields that can cheaply/easily be re-calculated goes against conventional API design
+    # principles. These fields are included, because 1) they are often useful in tiled workflows, and 2) they are
+    # difficult to calculate in a workflow (even though it's just a couple of subtraction nodes the graph gets
+    # surprisingly complicated).
+    width: int = OutputField(description="The width of the tile. Equal to coords_right - coords_left.")
+    height: int = OutputField(description="The height of the tile. Equal to coords_bottom - coords_top.")
+
+    # Overlap with the neighboring tile on each side.
+    overlap_top: int = OutputField(description="Overlap between this tile and its top neighbor.")
+    overlap_bottom: int = OutputField(description="Overlap between this tile and its bottom neighbor.")
+    overlap_left: int = OutputField(description="Overlap between this tile and its left neighbor.")
+    overlap_right: int = OutputField(description="Overlap between this tile and its right neighbor.")
+
+
+@invocation("tile_to_properties", title="Tile to Properties", tags=["tiles"], category="tiles", version="1.0.0")
+class TileToPropertiesInvocation(BaseInvocation):
+    """Split a Tile into its individual properties."""
+
+    tile: Tile = InputField(description="The tile to split into properties.")
+
+    def invoke(self, context: InvocationContext) -> TileToPropertiesOutput:
+        """Copy the tile's coordinates and overlaps into flat output fields, adding the derived width/height."""
+        coords = self.tile.coords
+        overlap = self.tile.overlap
+
+        return TileToPropertiesOutput(
+            coords_left=coords.left,
+            coords_right=coords.right,
+            coords_top=coords.top,
+            coords_bottom=coords.bottom,
+            # Derived values, provided for convenience in workflows.
+            width=coords.right - coords.left,
+            height=coords.bottom - coords.top,
+            overlap_top=overlap.top,
+            overlap_bottom=overlap.bottom,
+            overlap_left=overlap.left,
+            overlap_right=overlap.right,
+        )
+
+
+@invocation_output("pair_tile_image_output")
+class PairTileImageOutput(BaseInvocationOutput):
+    # Output type of PairTileImageInvocation: one TileWithImage bundling a tile with its image.
+    tile_with_image: TileWithImage = OutputField(description="A tile description with its corresponding image.")
+
+
+@invocation("pair_tile_image", title="Pair Tile with Image", tags=["tiles"], category="tiles", version="1.0.0")
+class PairTileImageInvocation(BaseInvocation):
+    """Pair an image with its tile properties."""
+
+    # TODO(ryand): The only reason that PairTileImage is needed is because the iterate/collect nodes don't preserve
+    # order. Can this be fixed?
+
+    image: ImageField = InputField(description="The tile image.")
+    tile: Tile = InputField(description="The tile properties.")
+
+    def invoke(self, context: InvocationContext) -> PairTileImageOutput:
+        """Bundle the input tile and image into a single TileWithImage and return it."""
+        paired = TileWithImage(tile=self.tile, image=self.image)
+        return PairTileImageOutput(tile_with_image=paired)
+
+
+@invocation("merge_tiles_to_image", title="Merge Tiles to Image", tags=["tiles"], category="tiles", version="1.0.0")
+class MergeTilesToImageInvocation(BaseInvocation, WithMetadata, WithWorkflow):
+    """Merge multiple tile images into a single image."""
+
+    # Inputs
+    tiles_with_images: list[TileWithImage] = InputField(description="A list of tile images with tile properties.")
+    blend_amount: int = InputField(
+        ge=0,
+        description="The amount to blend adjacent tiles in pixels. Must be <= the amount of overlap between adjacent tiles.",
+    )
+
+    def invoke(self, context: InvocationContext) -> ImageOutput:
+        """Load every tile image, blend the tiles into one output image, and save that image.
+
+        Returns:
+            An ImageOutput referencing the newly created merged image and its dimensions.
+
+        Raises:
+            ValueError: If `tiles_with_images` is empty.
+        """
+        # Fail early with a clear message. Without this guard an empty list only surfaces further down as an opaque
+        # IndexError when the first tile is inspected to size the output buffer.
+        if not self.tiles_with_images:
+            raise ValueError("Cannot merge tiles: 'tiles_with_images' is empty.")
+
+        images = [twi.image for twi in self.tiles_with_images]
+        tiles = [twi.tile for twi in self.tiles_with_images]
+
+        # Infer the output image dimensions from the largest bottom/right tile limits. Only the maximum limits are
+        # used; the output buffer always starts at (0, 0).
+        height = 0
+        width = 0
+        for tile in tiles:
+            height = max(height, tile.coords.bottom)
+            width = max(width, tile.coords.right)
+
+        # Get all tile images for processing.
+        # TODO(ryand): It pains me that we spend time PNG decoding each tile from disk when they almost certainly
+        # existed in memory at an earlier point in the graph.
+        tile_np_images: list[np.ndarray] = []
+        for image in images:
+            pil_image = context.services.images.get_pil_image(image.image_name)
+            # Normalize every tile to RGB so that all tile arrays share the same channel layout.
+            pil_image = pil_image.convert("RGB")
+            tile_np_images.append(np.array(pil_image))
+
+        # Prepare the output image buffer.
+        # Check the first tile to determine how many image channels are expected in the output. Every tile went
+        # through the same RGB conversion above, so the first tile is representative.
+        channels = tile_np_images[0].shape[-1]
+        dtype = tile_np_images[0].dtype
+        np_image = np.zeros(shape=(height, width, channels), dtype=dtype)
+
+        # The result is read back from `np_image`; the helper's return value is not used.
+        merge_tiles_with_linear_blending(
+            dst_image=np_image, tiles=tiles, tile_images=tile_np_images, blend_amount=self.blend_amount
+        )
+        pil_image = Image.fromarray(np_image)
+
+        image_dto = context.services.images.create(
+            image=pil_image,
+            image_origin=ResourceOrigin.INTERNAL,
+            image_category=ImageCategory.GENERAL,
+            node_id=self.id,
+            session_id=context.graph_execution_state_id,
+            is_intermediate=self.is_intermediate,
+            metadata=self.metadata,
+            workflow=self.workflow,
+        )
+        return ImageOutput(
+            image=ImageField(image_name=image_dto.image_name),
+            width=image_dto.width,
+            height=image_dto.height,
+        )