diff --git a/packages/api/package.json b/packages/api/package.json
index 65578977d..0d9b1a5ce 100644
--- a/packages/api/package.json
+++ b/packages/api/package.json
@@ -22,6 +22,7 @@
     "prepare:type-check": "tsc --pretty --noEmit",
     "prepare": "run-s compile-schemas && run-p \"prepare:**\"",
     "compile-schemas": "node -r esm src/schema/compile-schemas.js",
+    "pull-ai-schema": "node -r esm src/schema/pull-ai-schema.js",
     "dev-server": "run-s compile-schemas && node dist/cli.js",
     "redoc": "nodemon -w src/schema/schema.yaml -x npm run prepare:redoc",
     "siserver": "nodemon -w dist -x node -r esm dist/stream-info-service.js -e js,yaml",
diff --git a/packages/api/src/schema/ai-api-schema.yaml b/packages/api/src/schema/ai-api-schema.yaml
new file mode 100644
index 000000000..d650032dc
--- /dev/null
+++ b/packages/api/src/schema/ai-api-schema.yaml
@@ -0,0 +1,866 @@
+openapi: 3.1.0
+paths:
+  /api/beta/generate/text-to-image:
+    post:
+      tags:
+        - generate
+      summary: Text To Image
+      description: Generate images from text prompts.
+      operationId: genTextToImage
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TextToImageParams"
+        required: true
+      responses:
+        "200":
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ImageResponse"
+                x-speakeasy-name-override: data
+        "400":
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - &ref_0
+                    $ref: "#/components/schemas/error"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_0
+        "422":
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPValidationError"
+                  - *ref_0
+        "500":
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_0
+        default:
+          description: Error
+          content:
+            application/json:
+              schema: *ref_0
+      security:
+        - HTTPBearer: []
+      x-speakeasy-name-override: textToImage
+  /api/beta/generate/image-to-image:
+    post:
+      tags:
+        - generate
+      summary: Image To Image
+      description: Apply image transformations to a provided image.
+      operationId: genImageToImage
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: "#/components/schemas/Body_genImageToImage"
+        required: true
+      responses:
+        "200":
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ImageResponse"
+                x-speakeasy-name-override: data
+        "400":
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - &ref_1
+                    $ref: "#/components/schemas/error"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_1
+        "422":
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPValidationError"
+                  - *ref_1
+        "500":
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_1
+        default:
+          description: Error
+          content:
+            application/json:
+              schema: *ref_1
+      security:
+        - HTTPBearer: []
+      x-speakeasy-name-override: imageToImage
+  /api/beta/generate/image-to-video:
+    post:
+      tags:
+        - generate
+      summary: Image To Video
+      description: Generate a video from a provided image.
+      operationId: genImageToVideo
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: "#/components/schemas/Body_genImageToVideo"
+        required: true
+      responses:
+        "200":
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/VideoResponse"
+                x-speakeasy-name-override: data
+        "400":
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - &ref_2
+                    $ref: "#/components/schemas/error"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_2
+        "422":
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPValidationError"
+                  - *ref_2
+        "500":
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_2
+        default:
+          description: Error
+          content:
+            application/json:
+              schema: *ref_2
+      security:
+        - HTTPBearer: []
+      x-speakeasy-name-override: imageToVideo
+  /api/beta/generate/upscale:
+    post:
+      tags:
+        - generate
+      summary: Upscale
+      description: Upscale an image by increasing its resolution.
+      operationId: genUpscale
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: "#/components/schemas/Body_genUpscale"
+        required: true
+      responses:
+        "200":
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ImageResponse"
+                x-speakeasy-name-override: data
+        "400":
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - &ref_3
+                    $ref: "#/components/schemas/error"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_3
+        "422":
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPValidationError"
+                  - *ref_3
+        "500":
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_3
+        default:
+          description: Error
+          content:
+            application/json:
+              schema: *ref_3
+      security:
+        - HTTPBearer: []
+      x-speakeasy-name-override: upscale
+  /api/beta/generate/audio-to-text:
+    post:
+      tags:
+        - generate
+      summary: Audio To Text
+      description: Transcribe audio files to text.
+      operationId: genAudioToText
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: "#/components/schemas/Body_genAudioToText"
+        required: true
+      responses:
+        "200":
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TextResponse"
+                x-speakeasy-name-override: data
+        "400":
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - &ref_4
+                    $ref: "#/components/schemas/error"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_4
+        "413":
+          description: Request Entity Too Large
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_4
+        "422":
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPValidationError"
+                  - *ref_4
+        "500":
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_4
+        default:
+          description: Error
+          content:
+            application/json:
+              schema: *ref_4
+      security:
+        - HTTPBearer: []
+      x-speakeasy-name-override: audioToText
+  /api/beta/generate/segment-anything-2:
+    post:
+      tags:
+        - generate
+      summary: Segment Anything 2
+      description: Segment objects in an image.
+      operationId: genSegmentAnything2
+      requestBody:
+        content:
+          multipart/form-data:
+            schema:
+              $ref: "#/components/schemas/Body_genSegmentAnything2"
+        required: true
+      responses:
+        "200":
+          description: Successful Response
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/MasksResponse"
+                x-speakeasy-name-override: data
+        "400":
+          description: Bad Request
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - &ref_5
+                    $ref: "#/components/schemas/error"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_5
+        "422":
+          description: Validation Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPValidationError"
+                  - *ref_5
+        "500":
+          description: Internal Server Error
+          content:
+            application/json:
+              schema:
+                oneOf:
+                  - $ref: "#/components/schemas/HTTPError"
+                  - *ref_5
+        default:
+          description: Error
+          content:
+            application/json:
+              schema: *ref_5
+      security:
+        - HTTPBearer: []
+      x-speakeasy-name-override: segmentAnything2
+components:
+  schemas:
+    APIError:
+      properties:
+        msg:
+          type: string
+          title: Msg
+          description: The error message.
+      type: object
+      required:
+        - msg
+      title: APIError
+      description: API error response model.
+    Body_genAudioToText:
+      properties:
+        audio:
+          type: string
+          format: binary
+          title: Audio
+          description: Uploaded audio file to be transcribed.
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for transcription.
+          default: ""
+      type: object
+      required:
+        - audio
+        - model_id
+      title: Body_genAudioToText
+    Body_genImageToImage:
+      properties:
+        prompt:
+          type: string
+          title: Prompt
+          description: Text prompt(s) to guide image generation.
+        image:
+          type: string
+          format: binary
+          title: Image
+          description: Uploaded image to modify with the pipeline.
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for image generation.
+          default: ""
+        strength:
+          type: number
+          title: Strength
+          description:
+            Degree of transformation applied to the reference image (0 to 1).
+          default: 0.8
+        guidance_scale:
+          type: number
+          title: Guidance Scale
+          description: >-
+            Encourages model to generate images closely linked to the text
+            prompt (higher values may reduce image quality).
+          default: 7.5
+        image_guidance_scale:
+          type: number
+          title: Image Guidance Scale
+          description: >-
+            Degree to which the generated image is pushed towards the initial
+            image.
+          default: 1.5
+        negative_prompt:
+          type: string
+          title: Negative Prompt
+          description: >-
+            Text prompt(s) to guide what to exclude from image generation.
+            Ignored if guidance_scale < 1.
+          default: ""
+        safety_check:
+          type: boolean
+          title: Safety Check
+          description: >-
+            Perform a safety check to estimate if generated images could be
+            offensive or harmful.
+          default: true
+        seed:
+          type: integer
+          title: Seed
+          description: Seed for random number generation.
+        num_inference_steps:
+          type: integer
+          title: Num Inference Steps
+          description: >-
+            Number of denoising steps. More steps usually lead to higher quality
+            images but slower inference. Modulated by strength.
+          default: 100
+        num_images_per_prompt:
+          type: integer
+          title: Num Images Per Prompt
+          description: Number of images to generate per prompt.
+          default: 1
+      type: object
+      required:
+        - prompt
+        - image
+        - model_id
+      title: Body_genImageToImage
+    Body_genImageToVideo:
+      properties:
+        image:
+          type: string
+          format: binary
+          title: Image
+          description: Uploaded image to generate a video from.
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for video generation.
+          default: ""
+        height:
+          type: integer
+          title: Height
+          description: The height in pixels of the generated video.
+          default: 576
+        width:
+          type: integer
+          title: Width
+          description: The width in pixels of the generated video.
+          default: 1024
+        fps:
+          type: integer
+          title: Fps
+          description: The frames per second of the generated video.
+          default: 6
+        motion_bucket_id:
+          type: integer
+          title: Motion Bucket Id
+          description: >-
+            Used for conditioning the amount of motion for the generation. The
+            higher the number the more motion will be in the video.
+          default: 127
+        noise_aug_strength:
+          type: number
+          title: Noise Aug Strength
+          description: >-
+            Amount of noise added to the conditioning image. Higher values
+            reduce resemblance to the conditioning image and increase motion.
+          default: 0.02
+        safety_check:
+          type: boolean
+          title: Safety Check
+          description: >-
+            Perform a safety check to estimate if generated images could be
+            offensive or harmful.
+          default: true
+        seed:
+          type: integer
+          title: Seed
+          description: Seed for random number generation.
+        num_inference_steps:
+          type: integer
+          title: Num Inference Steps
+          description: >-
+            Number of denoising steps. More steps usually lead to higher quality
+            images but slower inference. Modulated by strength.
+          default: 25
+      type: object
+      required:
+        - image
+        - model_id
+      title: Body_genImageToVideo
+    Body_genSegmentAnything2:
+      properties:
+        image:
+          type: string
+          format: binary
+          title: Image
+          description: Image to segment.
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for image segmentation.
+          default: ""
+        point_coords:
+          type: string
+          title: Point Coords
+          description: >-
+            Nx2 array of point prompts to the model, where each point is in
+            (X,Y) in pixels.
+        point_labels:
+          type: string
+          title: Point Labels
+          description: >-
+            Labels for the point prompts, where 1 indicates a foreground point
+            and 0 indicates a background point.
+        box:
+          type: string
+          title: Box
+          description:
+            "A length 4 array given as a box prompt to the model, in XYXY
+            format."
+        mask_input:
+          type: string
+          title: Mask Input
+          description: >-
+            A low-resolution mask input to the model, typically from a previous
+            prediction iteration, with the form 1xHxW (H=W=256 for SAM).
+        multimask_output:
+          type: boolean
+          title: Multimask Output
+          description: >-
+            If true, the model will return three masks for ambiguous input
+            prompts, often producing better masks than a single prediction.
+          default: true
+        return_logits:
+          type: boolean
+          title: Return Logits
+          description: >-
+            If true, returns un-thresholded mask logits instead of a binary
+            mask.
+          default: true
+        normalize_coords:
+          type: boolean
+          title: Normalize Coords
+          description: >-
+            If true, the point coordinates will be normalized to the range
+            [0,1], with point_coords expected to be with respect to image
+            dimensions.
+          default: true
+      type: object
+      required:
+        - image
+        - model_id
+      title: Body_genSegmentAnything2
+    Body_genUpscale:
+      properties:
+        prompt:
+          type: string
+          title: Prompt
+          description: Text prompt(s) to guide upscaled image generation.
+        image:
+          type: string
+          format: binary
+          title: Image
+          description: Uploaded image to modify with the pipeline.
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for upscaled image generation.
+          default: ""
+        safety_check:
+          type: boolean
+          title: Safety Check
+          description: >-
+            Perform a safety check to estimate if generated images could be
+            offensive or harmful.
+          default: true
+        seed:
+          type: integer
+          title: Seed
+          description: Seed for random number generation.
+        num_inference_steps:
+          type: integer
+          title: Num Inference Steps
+          description: >-
+            Number of denoising steps. More steps usually lead to higher quality
+            images but slower inference. Modulated by strength.
+          default: 75
+      type: object
+      required:
+        - prompt
+        - image
+        - model_id
+      title: Body_genUpscale
+    HTTPError:
+      properties:
+        detail:
+          allOf:
+            - $ref: "#/components/schemas/APIError"
+          description: Detailed error information.
+      type: object
+      required:
+        - detail
+      title: HTTPError
+      description: HTTP error response model.
+    HTTPValidationError:
+      properties:
+        detail:
+          items:
+            $ref: "#/components/schemas/ValidationError"
+          type: array
+          title: Detail
+      type: object
+      title: HTTPValidationError
+    ImageResponse:
+      properties:
+        images:
+          items:
+            $ref: "#/components/schemas/Media"
+          type: array
+          title: Images
+          description: The generated images.
+      type: object
+      required:
+        - images
+      title: ImageResponse
+      description: Response model for image generation.
+    MasksResponse:
+      properties:
+        masks:
+          type: string
+          title: Masks
+          description: The generated masks.
+        scores:
+          type: string
+          title: Scores
+          description: The model's confidence scores for each generated mask.
+        logits:
+          type: string
+          title: Logits
+          description:
+            "The raw, unnormalized predictions (logits) for the masks."
+      type: object
+      required:
+        - masks
+        - scores
+        - logits
+      title: MasksResponse
+      description: Response model for object segmentation.
+    Media:
+      properties:
+        url:
+          type: string
+          title: Url
+          description: The URL where the media can be accessed.
+        seed:
+          type: integer
+          title: Seed
+          description: The seed used to generate the media.
+        nsfw:
+          type: boolean
+          title: Nsfw
+          description: Whether the media was flagged as NSFW.
+      type: object
+      required:
+        - url
+        - seed
+        - nsfw
+      title: Media
+      description:
+        A media object containing information about the generated media.
+    TextResponse:
+      properties:
+        text:
+          type: string
+          title: Text
+          description: The generated text.
+        chunks:
+          items:
+            $ref: "#/components/schemas/chunk"
+          type: array
+          title: Chunks
+          description: The generated text chunks.
+      type: object
+      required:
+        - text
+        - chunks
+      title: TextResponse
+      description: Response model for text generation.
+    TextToImageParams:
+      properties:
+        model_id:
+          type: string
+          title: Model Id
+          description: Hugging Face model ID used for image generation.
+          default: SG161222/RealVisXL_V4.0_Lightning
+        prompt:
+          type: string
+          title: Prompt
+          description: >-
+            Text prompt(s) to guide image generation. Separate multiple prompts
+            with '|' if supported by the model.
+        height:
+          type: integer
+          title: Height
+          description: The height in pixels of the generated image.
+          default: 576
+        width:
+          type: integer
+          title: Width
+          description: The width in pixels of the generated image.
+          default: 1024
+        guidance_scale:
+          type: number
+          title: Guidance Scale
+          description: >-
+            Encourages model to generate images closely linked to the text
+            prompt (higher values may reduce image quality).
+          default: 7.5
+        negative_prompt:
+          type: string
+          title: Negative Prompt
+          description: >-
+            Text prompt(s) to guide what to exclude from image generation.
+            Ignored if guidance_scale < 1.
+          default: ""
+        safety_check:
+          type: boolean
+          title: Safety Check
+          description: >-
+            Perform a safety check to estimate if generated images could be
+            offensive or harmful.
+          default: true
+        seed:
+          type: integer
+          title: Seed
+          description: Seed for random number generation.
+        num_inference_steps:
+          type: integer
+          title: Num Inference Steps
+          description: >-
+            Number of denoising steps. More steps usually lead to higher quality
+            images but slower inference. Modulated by strength.
+          default: 50
+        num_images_per_prompt:
+          type: integer
+          title: Num Images Per Prompt
+          description: Number of images to generate per prompt.
+          default: 1
+      type: object
+      required:
+        - prompt
+        - model_id
+      title: TextToImageParams
+    ValidationError:
+      properties:
+        loc:
+          items:
+            anyOf:
+              - type: string
+              - type: integer
+          type: array
+          title: Location
+        msg:
+          type: string
+          title: Message
+        type:
+          type: string
+          title: Error Type
+      type: object
+      required:
+        - loc
+        - msg
+        - type
+      title: ValidationError
+    VideoResponse:
+      properties:
+        images:
+          items:
+            $ref: "#/components/schemas/Media"
+          type: array
+          title: Images
+          description: The generated images.
+      type: object
+      required:
+        - images
+      title: VideoResponse
+      description: Response model for video generation.
+    chunk:
+      properties:
+        timestamp:
+          items: {}
+          type: array
+          title: Timestamp
+          description: The timestamp of the chunk.
+        text:
+          type: string
+          title: Text
+          description: The text of the chunk.
+      type: object
+      required:
+        - timestamp
+        - text
+      title: chunk
+      description: A chunk of text with a timestamp.
+  securitySchemes:
+    HTTPBearer:
+      type: http
+      scheme: bearer
diff --git a/packages/api/src/schema/api-schema.yaml b/packages/api/src/schema/api-schema.yaml
index 12fafc8e0..1ef3d52d2 100644
--- a/packages/api/src/schema/api-schema.yaml
+++ b/packages/api/src/schema/api-schema.yaml
@@ -35,6 +35,7 @@ tags:
     description: Operations related to tasks api
   - name: generate
     description: Operations related to AI generate api
+$ref: "./ai-api-schema.yaml"
 components:
   securitySchemes:
     apiKey:
@@ -2860,241 +2861,6 @@ components:
         targetSegmentSizeSecs:
           $ref: >-
             #/components/schemas/new-asset-payload/properties/targetSegmentSizeSecs
-    # AI Generate payloads. Keep in mind that these use snake_case instead of camelCase since
-    # they implement the same interface as the AI Gateway Livepeer node.
-    audio-to-text-payload:
-      type: object
-      required:
-        - audio
-      properties:
-        audio:
-          type: string
-          format: binary
-          maxLength: 10485760 # 10MiB
-        model_id:
-          type: string
-          default: openai/whisper-large-v3
-          enum:
-            - openai/whisper-large-v3
-    text-to-image-payload:
-      type: object
-      required:
-        - prompt
-      additionalProperties: false
-      properties:
-        prompt:
-          type: string
-        model_id:
-          type: string
-          default: SG161222/RealVisXL_V4.0_Lightning
-          enum:
-            - SG161222/RealVisXL_V4.0_Lightning
-            - ByteDance/SDXL-Lightning
-        height:
-          type: integer
-        width:
-          type: integer
-        guidance_scale:
-          type: number
-          default: 7.5
-        negative_prompt:
-          type: string
-          default: ""
-        safety_check:
-          type: boolean
-          default: true
-        seed:
-          type: integer
-        num_inference_steps:
-          type: integer
-          default: 50
-          minimum: 1
-          maximum: 200
-        num_images_per_prompt:
-          type: integer
-          default: 1
-          minimum: 1
-          maximum: 20
-    image-to-image-payload:
-      type: object
-      required:
-        - prompt
-        - image
-      additionalProperties: false
-      properties:
-        prompt:
-          type: string
-        image:
-          type: string
-          format: binary
-          maxLength: 10485760 # 10MiB
-        model_id:
-          type: string
-          default: timbrooks/instruct-pix2pix
-          enum:
-            - timbrooks/instruct-pix2pix
-            - ByteDance/SDXL-Lightning
-            - SG161222/RealVisXL_V4.0_Lightning
-        strength:
-          type: number
-          default: 0.8
-        guidance_scale:
-          type: number
-          default: 7.5
-        image_guidance_scale:
-          type: number
-          default: 1.5
-        negative_prompt:
-          type: string
-          default: ""
-        safety_check:
-          type: boolean
-          default: true
-        seed:
-          type: integer
-        num_images_per_prompt:
-          type: integer
-          default: 1
-          minimum: 1
-          maximum: 20
-    image-to-video-payload:
-      type: object
-      required:
-        - image
-      additionalProperties: false
-      properties:
-        image:
-          type: string
-          format: binary
-          maxLength: 10485760 # 10MiB
-        model_id:
-          type: string
-          default: stabilityai/stable-video-diffusion-img2vid-xt-1-1
-          enum:
-            - stabilityai/stable-video-diffusion-img2vid-xt-1-1
-        height:
-          type: integer
-          default: 576
-        width:
-          type: integer
-          default: 1024
-        fps:
-          type: integer
-          default: 6
-        motion_bucket_id:
-          type: integer
-          default: 127
-        noise_aug_strength:
-          type: number
-          default: 0.02
-        seed:
-          type: integer
-        safety_check:
-          type: boolean
-          default: true
-    upscale-payload:
-      type: object
-      required:
-        - prompt
-        - image
-      additionalProperties: false
-      properties:
-        prompt:
-          type: string
-        image:
-          type: string
-          format: binary
-          maxLength: 10485760 # 10MiB
-        model_id:
-          type: string
-          default: stabilityai/stable-diffusion-x4-upscaler
-          enum:
-            - stabilityai/stable-diffusion-x4-upscaler
-        safety_check:
-          type: boolean
-          default: true
-        seed:
-          type: integer
-    ai-text-response:
-      type: object
-      required:
-        - text
-        - chunks
-      properties:
-        text:
-          type: string
-        chunks:
-          type: array
-          items:
-            type: object
-            required:
-              - timestamp
-              - text
-            properties:
-              timestamp:
-                type: array
-                items: {}
-              text:
-                type: string
-    ai-image-response:
-      type: object
-      required:
-        - images
-      properties:
-        images:
-          type: array
-          items:
-            type: object
-            required:
-              - url
-              - seed
-              - nsfw
-            properties:
-              url:
-                type: string
-                title: Url
-              seed:
-                type: integer
-                title: Seed
-              nsfw:
-                type: boolean
-                title: Nsfw
-    ai-error:
-      oneOf:
-        - $ref: "#/components/schemas/error"
-        - type: object
-          required:
-            - detail
-          properties:
-            detail:
-              type: object
-              required:
-                - msg
-              properties:
-                msg:
-                  type: string
-        - type: object
-          properties:
-            detail:
-              type: array
-              items:
-                type: object
-                required:
-                  - loc
-                  - msg
-                  - type
-                properties:
-                  msg:
-                    type: string
-                  type:
-                    type: string
-                    title: Error Type
-                  loc:
-                    type: array
-                    items:
-                      anyOf:
-                        - type: string
-                        - type: integer
 paths:
   /stream:
     post:
@@ -5202,139 +4968,3 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/error"
-  # AI APIs section, imported from AI OpenAPI spec with some adjustments
-  "/beta/generate/audio-to-text":
-    post:
-      operationId: genAudioToText
-      summary: Audio To Text
-      x-speakeasy-name-override: audioToText
-      tags:
-        - generate
-      requestBody:
-        required: true
-        content:
-          multipart/form-data:
-            schema:
-              $ref: "#/components/schemas/audio-to-text-payload"
-      responses:
-        "200":
-          description: Successful response
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-text-response"
-                x-speakeasy-name-override: data
-        default:
-          description: Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-error"
-  "/beta/generate/text-to-image":
-    post:
-      operationId: genTextToImage
-      summary: Text To Image
-      x-speakeasy-name-override: textToImage
-      tags:
-        - generate
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/text-to-image-payload"
-      responses:
-        "200":
-          description: Successful Response
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-image-response"
-                x-speakeasy-name-override: data
-        default:
-          description: Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-error"
-  "/beta/generate/image-to-image":
-    post:
-      operationId: genImageToImage
-      x-speakeasy-name-override: imageToImage
-      summary: Image To Image
-      tags:
-        - generate
-      requestBody:
-        required: true
-        content:
-          multipart/form-data:
-            schema:
-              $ref: "#/components/schemas/image-to-image-payload"
-      responses:
-        "200":
-          description: Successful Response
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-image-response"
-                x-speakeasy-name-override: data
-        default:
-          description: Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-error"
-  "/beta/generate/image-to-video":
-    post:
-      operationId: genImageToVideo
-      x-speakeasy-name-override: imageToVideo
-      summary: Image To Video
-      tags:
-        - generate
-      requestBody:
-        content:
-          multipart/form-data:
-            schema:
-              $ref: "#/components/schemas/image-to-video-payload"
-        required: true
-      responses:
-        "200":
-          description: Successful Response
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-image-response"
-                x-speakeasy-name-override: data
-        default:
-          description: Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-error"
-  "/beta/generate/upscale":
-    post:
-      operationId: genUpscale
-      x-speakeasy-name-override: upscale
-      summary: Upscale
-      tags:
-        - generate
-      requestBody:
-        content:
-          multipart/form-data:
-            schema:
-              $ref: "#/components/schemas/upscale-payload"
-        required: true
-      responses:
-        "200":
-          description: Successful Response
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-image-response"
-                x-speakeasy-name-override: data
-        default:
-          description: Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ai-error"
diff --git a/packages/api/src/schema/pull-ai-schema.js b/packages/api/src/schema/pull-ai-schema.js
new file mode 100644
index 000000000..02e340d75
--- /dev/null
+++ b/packages/api/src/schema/pull-ai-schema.js
@@ -0,0 +1,102 @@
+import fs from "fs-extra";
+import { safeLoad as parseYaml, safeDump as serializeYaml } from "js-yaml";
+import path from "path";
+
+// This downloads the AI schema from the AI worker repo and saves it in the
+// local ai-api-schema.yaml file, referenced by our main api-schema.yaml file.
+
+const schemaDir = path.resolve(__dirname); // the directory of this script
+const aiSchemaUrl = // gateway spec on the main branch of livepeer/ai-worker
+  "https://raw.githubusercontent.com/livepeer/ai-worker/refs/heads/main/runner/gateway.openapi.yaml";
+const defaultModels = { // pipeline name -> default for its params' model_id
+  "audio-to-text": "openai/whisper-large-v3",
+  "image-to-image": "timbrooks/instruct-pix2pix",
+  "image-to-video": "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
+  "text-to-image": "SG161222/RealVisXL_V4.0_Lightning",
+  upscale: "stabilityai/stable-diffusion-x4-upscaler",
+};
+
+// Writes data to the file at dir as UTF-8, skipping the write (and the log
+// line) when the file already holds exactly that content.
+const write = (dir, data) => {
+  const upToDate = fs.existsSync(dir) && fs.readFileSync(dir, "utf8") === data;
+  if (upToDate) {
+    return;
+  }
+  fs.writeFileSync(dir, data, "utf8");
+  console.log(`wrote ${dir}`);
+};
+
+// Builds a new object from the [key, value] pair fn returns for each entry.
+const mapObject = (obj, fn) => {
+  const pairs = Object.entries(obj).map(([key, value]) => fn(key, value));
+  return Object.fromEntries(pairs);
+};
+
+// Downloads the AI gateway OpenAPI document from aiSchemaUrl, patches it and
+// writes it to ai-api-schema.yaml in schemaDir. Takes no arguments; the returned
+// promise rejects if the download fails or the document lacks a patched section.
+const downloadAiSchema = async () => {
+  const download = await fetch(aiSchemaUrl);
+  if (!download.ok) {
+    // otherwise an HTTP error page would be parsed as if it were the schema
+    throw new Error(`failed to download AI schema: HTTP ${download.status}`);
+  }
+  const schema = parseYaml(await download.text());
+
+  // remove info and servers fields
+  delete schema.info;
+  delete schema.servers;
+
+  schema.paths = mapObject(schema.paths, (route, value) => {
+    // NOTE(review): value.post.security is not removed by this - confirm
+    delete value.security;
+    // offer "#/components/schemas/error" as a oneOf in every error response
+    const apiError = { $ref: "#/components/schemas/error" };
+    const { post } = value;
+    post.responses = mapObject(post.responses, (statusCode, spec) => {
+      // "default" parses to NaN, so it is patched like any error status
+      const isSuccess = Math.floor(parseInt(statusCode, 10) / 100) === 2;
+      const json = spec.content && spec.content["application/json"];
+      // a response without a JSON body schema has nothing to extend
+      if (!isSuccess && json && json.schema) {
+        json.schema = { oneOf: [json.schema, apiError] };
+      }
+      return [statusCode, spec];
+    });
+    // add $ref: "#/components/schemas/error" as the default response
+    if (!post.responses["default"]) {
+      post.responses["default"] = {
+        description: "Error",
+        content: { "application/json": { schema: apiError } },
+      };
+    }
+    // prefix paths with /api/beta/generate
+    return [`/api/beta/generate${route}`, value];
+  });
+
+  // Add default model_id to params objects
+  const { components } = schema;
+  components.schemas = mapObject(components.schemas, (key, value) => {
+    if (!key.endsWith("Params")) {
+      return [key, value];
+    }
+    // transforms PipeNameParams to pipe-name
+    const pipelineName = key
+      .slice(0, -"Params".length)
+      .replace(/([a-z])([A-Z])/g, "$1-$2")
+      .toLowerCase();
+    const modelId = value.properties && value.properties.model_id;
+    if (modelId && pipelineName in defaultModels) {
+      modelId.default = defaultModels[pipelineName];
+    }
+    return [key, value];
+  });
+
+  write(path.resolve(schemaDir, "ai-api-schema.yaml"), serializeYaml(schema));
+};
+
+downloadAiSchema().catch((error) => { // entry point: runs when the file loads
+  console.error(error);
+  process.exit(1); // a failed pull must fail the npm script that ran it
+});