Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions FirebaseAI/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
- [fixed] Fixed various links in the Live API doc comments not mapping correctly.
- [fixed] Fixed minor translation issue for nanosecond conversion when receiving
`LiveServerGoingAwayNotice`. (#15410)
- [feature] Added support for sending video frames with the Live API via the `sendVideoRealtime`
method on [`LiveSession`](https://firebase.google.com/docs/reference/swift/firebaseai/api/reference/Classes/LiveSession).
(#15432)

# 12.4.0
- [feature] Added support for the URL context tool, which allows the model to access content
Expand Down
21 changes: 15 additions & 6 deletions FirebaseAI/Sources/Types/Public/Live/LiveSession.swift
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,24 @@ public final class LiveSession: Sendable {
await service.send(.realtimeInput(message))
}

/// Sends a video input stream to the model, using the realtime API.
/// Sends a video frame to the model, using the realtime API.
///
/// Instead of raw video data, the model expects individual frames of the video,
/// sent as images.
///
/// If your video has audio, send it separately through ``LiveSession/sendAudioRealtime(_:)``.
///
/// For better performance, frames can also be sent at a lower rate than the video;
/// even as low as 1 frame per second.
///
/// - Parameters:
///   - video: Encoded image data extracted from a frame of the video, used to update the model on
///     the client's conversation.
///   - mimeType: The IANA standard MIME type of the video frame data (e.g., `image/png`,
///     `image/jpeg`, etc.).
public func sendVideoRealtime(_ video: Data, mimeType: String) async {
  // The caller supplies the MIME type directly; frames are images, so the type is not
  // derivable from a video container format.
  let message = BidiGenerateContentRealtimeInput(
    video: InlineData(data: video, mimeType: mimeType)
  )
  await service.send(.realtimeInput(message))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
objects = {

/* Begin PBXBuildFile section */
0E0481222EA2E51300A50172 /* DataUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0E0481212EA2E51100A50172 /* DataUtils.swift */; };
0E460FAB2E9858E4007E26A6 /* LiveSessionTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0E460FAA2E9858E4007E26A6 /* LiveSessionTests.swift */; };
0EC8BAE22E98784E0075A4E0 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 868A7C532CCC26B500E449DD /* Assets.xcassets */; };
862218812D04E098007ED2D4 /* IntegrationTestUtils.swift in Sources */ = {isa = PBXBuildFile; fileRef = 862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */; };
Expand Down Expand Up @@ -44,6 +45,7 @@
/* End PBXContainerItemProxy section */

/* Begin PBXFileReference section */
0E0481212EA2E51100A50172 /* DataUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DataUtils.swift; sourceTree = "<group>"; };
0E460FAA2E9858E4007E26A6 /* LiveSessionTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LiveSessionTests.swift; sourceTree = "<group>"; };
862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IntegrationTestUtils.swift; sourceTree = "<group>"; };
864F8F702D4980D60002EA7E /* ImagenIntegrationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ImagenIntegrationTests.swift; sourceTree = "<group>"; };
Expand Down Expand Up @@ -168,6 +170,7 @@
8698D7442CD3CEF700ABA833 /* Utilities */ = {
isa = PBXGroup;
children = (
0E0481212EA2E51100A50172 /* DataUtils.swift */,
86D77E032D7B6C95003D155D /* InstanceConfig.swift */,
862218802D04E08D007ED2D4 /* IntegrationTestUtils.swift */,
);
Expand Down Expand Up @@ -304,6 +307,7 @@
DEF0BB512DA9B7450093E9F4 /* SchemaTests.swift in Sources */,
DEF0BB4F2DA74F680093E9F4 /* TestHelpers.swift in Sources */,
868A7C4F2CCC229F00E449DD /* Credentials.swift in Sources */,
0E0481222EA2E51300A50172 /* DataUtils.swift in Sources */,
864F8F712D4980DD0002EA7E /* ImagenIntegrationTests.swift in Sources */,
862218812D04E098007ED2D4 /* IntegrationTestUtils.swift in Sources */,
86D77DFC2D7A5340003D155D /* GenerateContentIntegrationTests.swift in Sources */,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"data" : [
{
"filename" : "videoplayback.mp4",
"idiom" : "universal",
"universal-type-identifier" : "public.mpeg-4"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}
Binary file not shown.
51 changes: 51 additions & 0 deletions FirebaseAI/Tests/TestApp/Tests/Integration/LiveSessionTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,14 @@ struct LiveSessionTests {
role: "system",
parts: "When you receive a message, if the message is a single word, assume it's the first name of a person, and call the getLastName tool to get the last name of said person. Only respond with the last name."
)

/// System instruction directing the model to name, in a single word, the animal shown in an
/// incoming video — or to report a fixed failure string if no video was received.
///
/// The failure string lets the test distinguish "video never arrived" from a wrong answer.
// NOTE(review): trimming strips the trailing newline the multiline literal would otherwise
// include — confirm the model is insensitive to it before removing.
static let animalInVideo = ModelContent(
role: "system",
parts: """
Send a one word response of what ANIMAL is in the video. \
If you don't receive a video, send "Test is broken, I didn't receive a video.".
""".trimmingCharacters(in: .whitespacesAndNewlines)
)
}

@Test(arguments: arguments)
Expand Down Expand Up @@ -181,6 +189,49 @@ struct LiveSessionTests {
#expect(modelResponse == "goodbye")
}

@Test(arguments: arguments.filter { $0.1 != ModelNames.gemini2FlashLive })
// gemini-2.0-flash-live-001 is excluded: it tends to answer the audio or the system
// instruction itself (e.g., replying 'okay' or 'hello') instead of following the instructions.
func sendVideoRealtime_receiveText(_ config: InstanceConfig, modelName: String) async throws {
  let model = FirebaseAI.componentInstance(config).liveModel(
    modelName: modelName,
    generationConfig: textConfig,
    systemInstruction: SystemInstructions.animalInVideo
  )

  let session = try await model.connect()

  guard let catVideo = NSDataAsset(name: "cat") else {
    Issue.record("Missing video file 'cat' in Assets")
    return
  }

  // Stream the video to the model one frame at a time, encoded as PNG images.
  for frameData in try await catVideo.videoFrames() {
    await session.sendVideoRealtime(frameData, mimeType: "image/png")
  }

  // The model stays silent unless some audio arrives too. Vertex AI also replies to text,
  // but Google AI does not (both reply with audio either way) — so nudge it with a short
  // audio clip followed by an equal stretch of silence.
  guard let helloAudio = NSDataAsset(name: "hello") else {
    Issue.record("Missing audio file 'hello.wav' in Assets")
    return
  }
  await session.sendAudioRealtime(helloAudio.data)
  await session.sendAudioRealtime(Data(repeating: 0, count: helloAudio.data.count))

  let reply = try await session.collectNextTextResponse()
  await session.close()

  let answer = reply
    .trimmingCharacters(in: .whitespacesAndNewlines)
    .trimmingCharacters(in: .punctuationCharacters)
    .lowercased()

  // The exact wording varies from run to run.
  #expect(["kitten", "cat", "kitty"].contains(answer))
}

@Test(arguments: arguments)
func realtime_functionCalling(_ config: InstanceConfig, modelName: String) async throws {
let model = FirebaseAI.componentInstance(config).liveModel(
Expand Down
64 changes: 64 additions & 0 deletions FirebaseAI/Tests/TestApp/Tests/Utilities/DataUtils.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

import AVFoundation
import SwiftUI

extension NSDataAsset {
  /// The preferred file extension for this asset, if any.
  ///
  /// This is set in the Asset catalog under the `File Type` field.
  var fileExtension: String? {
    UTType(typeIdentifier)?.preferredFilenameExtension
  }

  /// Extracts `.png` frames from a video at a rate of 1 FPS.
  ///
  /// - Returns:
  ///   An array of `Data` corresponding to individual images for each frame.
  func videoFrames() async throws -> [Data] {
    guard let fileExtension else {
      fatalError(
        "Failed to find file extension; ensure the \"File Type\" is set in the asset catalog."
      )
    }

    // AVURLAsset only accepts a URL, so the asset's bytes are written to a temp file first.
    let tempFileURL = FileManager.default.temporaryDirectory
      .appendingPathComponent(UUID().uuidString, isDirectory: false)
      .appendingPathExtension(fileExtension)

    try data.write(to: tempFileURL)

    defer {
      try? FileManager.default.removeItem(at: tempFileURL)
    }

    let asset = AVURLAsset(url: tempFileURL)
    let generator = AVAssetImageGenerator(asset: asset)
    // Apply the track's preferred transform so portrait/rotated videos aren't emitted sideways.
    generator.appliesPreferredTrackTransform = true
    // The default time tolerance is infinite, which lets the generator snap every request to the
    // nearest keyframe — often returning the same frame for several requested seconds. Zero
    // tolerance forces distinct frames at the 1 FPS sampling points.
    generator.requestedTimeToleranceBefore = .zero
    generator.requestedTimeToleranceAfter = .zero

    let duration = try await asset.load(.duration).seconds
    return try stride(from: 0, to: duration, by: 1).map { seconds in
      // 600 is the conventional timescale for video timestamps (evenly divides common
      // frame rates); the whole-second sample points are still represented exactly.
      let time = CMTime(seconds: seconds, preferredTimescale: 600)
      let frame = try generator.copyCGImage(at: time, actualTime: nil)

      guard let png = UIImage(cgImage: frame).pngData() else {
        fatalError("Failed to encode image to png")
      }

      return png
    }
  }
}
Loading