From e9a6c149c5025b640695583c78105ef469dc9ef7 Mon Sep 17 00:00:00 2001 From: Chase Farmer Date: Wed, 6 Mar 2024 14:44:32 -0800 Subject: [PATCH 1/9] Implement selecting input device (#51) * add microphone assignment * PR Feedback: Clean up code fork with alias, move Picker to controls view. --- .../WhisperAX/Views/ContentView.swift | 57 +++++-- Sources/WhisperKit/Core/AudioProcessor.swift | 153 ++++++++++++++++-- 2 files changed, 189 insertions(+), 21 deletions(-) diff --git a/Examples/WhisperAX/WhisperAX/Views/ContentView.swift b/Examples/WhisperAX/WhisperAX/Views/ContentView.swift index 0df7640e..2bb6982e 100644 --- a/Examples/WhisperAX/WhisperAX/Views/ContentView.swift +++ b/Examples/WhisperAX/WhisperAX/Views/ContentView.swift @@ -12,6 +12,9 @@ import AVFoundation struct ContentView: View { @State var whisperKit: WhisperKit? = nil + #if os(macOS) + @State var audioDevices: [AudioDevice]? = nil + #endif @State var isRecording: Bool = false @State var isTranscribing: Bool = false @State var currentText: String = "" @@ -24,7 +27,8 @@ struct ContentView: View { @State private var availableModels: [String] = [] @State private var availableLanguages: [String] = [] @State private var disabledModels: [String] = WhisperKit.recommendedModels().disabled - + + @AppStorage("selectedAudioInput") private var selectedAudioInput: String = "No Audio Input" @AppStorage("selectedModel") private var selectedModel: String = WhisperKit.recommendedModels().default @AppStorage("selectedTab") private var selectedTab: String = "Transcribe" @AppStorage("selectedTask") private var selectedTask: String = "transcribe" @@ -302,7 +306,6 @@ struct ContentView: View { } // MARK: - Controls - var controlsView: some View { VStack { basicSettingsView @@ -417,14 +420,39 @@ struct ContentView: View { .buttonStyle(BorderlessButtonStyle()) .disabled(modelState != .loaded) .frame(minWidth: 0, maxWidth: .infinity) - - Button { - showAdvancedOptions.toggle() - } label: { - Label("Settings", systemImage: "slider.horizontal.3") + + VStack { + Button { + showAdvancedOptions.toggle() + } label: { + Label("Settings", systemImage: "slider.horizontal.3") + } + .frame(minWidth: 0, maxWidth: .infinity) + .buttonStyle(.borderless) + + #if os(macOS) + HStack { + if let audioDevices = audioDevices, audioDevices.count > 0 { + Picker("", selection: $selectedAudioInput) { + ForEach(audioDevices, id: \.self) { device in + Text(device.name).tag(device.name) + } + } + .frame(minWidth: 80) + .disabled(isRecording) + } + } + .onAppear { + audioDevices = AudioProcessor.getAudioDevices() + if let audioDevices = audioDevices, + !audioDevices.isEmpty, + selectedAudioInput == "No Audio Input", + let device = audioDevices.first { + selectedAudioInput = device.name + } + } + #endif } - .frame(minWidth: 0, maxWidth: .infinity) - .buttonStyle(.borderless) } default: EmptyView() @@ -854,8 +882,17 @@ struct ContentView: View { print("Microphone access was not granted.") return } + + var deviceId: DeviceID? + #if os(macOS) + if self.selectedAudioInput != "No Audio Input", + let devices = self.audioDevices, + let device = devices.first(where: {$0.name == selectedAudioInput}) { + deviceId = device.id + } + #endif - try? audioProcessor.startRecordingLive { _ in + try? audioProcessor.startRecordingLive(inputDeviceID: deviceId) { _ in DispatchQueue.main.async { bufferEnergy = whisperKit?.audioProcessor.relativeEnergy ?? [] } diff --git a/Sources/WhisperKit/Core/AudioProcessor.swift b/Sources/WhisperKit/Core/AudioProcessor.swift index 248396fd..4909f26c 100644 --- a/Sources/WhisperKit/Core/AudioProcessor.swift +++ b/Sources/WhisperKit/Core/AudioProcessor.swift @@ -6,6 +6,18 @@ import AVFoundation import CoreAudio import CoreML +/// Core Audio Device +#if os(macOS) +public typealias DeviceID = AudioDeviceID +#else +public typealias DeviceID = String +#endif + +public struct AudioDevice: Identifiable, Hashable { + public let id: DeviceID + public let name: String +} + public protocol AudioProcessing { /// Loads audio data from a specified file path. /// - Parameter audioFilePath: The file path of the audio file. @@ -40,8 +52,8 @@ public protocol AudioProcessing { var relativeEnergyWindow: Int { get set } /// Starts recording audio from the specified input device, resetting the previous state - func startRecordingLive(callback: (([Float]) -> Void)?) throws - + func startRecordingLive(inputDeviceID: DeviceID?, callback: (([Float]) -> Void)?) throws + /// Pause recording func pauseRecording() @@ -51,8 +63,8 @@ public protocol AudioProcessing { /// Overrideable default methods for AudioProcessing public extension AudioProcessing { - func startRecordingLive(callback: (([Float]) -> Void)?) throws { - try startRecordingLive(callback: callback) + func startRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)?) throws { + try startRecordingLive(inputDeviceID: inputDeviceID, callback: callback) } static func padOrTrimAudio(fromArray audioArray: [Float], startAt startIndex: Int = 0, toLength frameLength: Int = 480_000, saveSegment: Bool = false) -> MLMultiArray? { @@ -304,6 +316,94 @@ public class AudioProcessor: NSObject, AudioProcessing { public static func requestRecordPermission() async -> Bool { await AVAudioApplication.requestRecordPermission() } + + #if os(macOS) + public static func getAudioDevices() -> [AudioDevice] { + var devices = [AudioDevice]() + + var propertySize: UInt32 = 0 + var status: OSStatus = noErr + + // Get the number of devices + var propertyAddress = AudioObjectPropertyAddress( + mSelector: kAudioHardwarePropertyDevices, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain + ) + status = AudioObjectGetPropertyDataSize( + AudioObjectID(kAudioObjectSystemObject), + &propertyAddress, + 0, + nil, + &propertySize + ) + if status != noErr { + Logging.error("Error: Unable to get the number of audio devices.") + return devices + } + + // Get the device IDs + let deviceCount = Int(propertySize) / MemoryLayout.size + var deviceIDs = [AudioDeviceID](repeating: 0, count: deviceCount) + status = AudioObjectGetPropertyData( + AudioObjectID(kAudioObjectSystemObject), + &propertyAddress, + 0, + nil, + &propertySize, + &deviceIDs + ) + if status != noErr { + Logging.error("Error: Unable to get the audio device IDs.") + return devices + } + + // Get device info for each device + for deviceID in deviceIDs { + var deviceName: String = "" + var inputChannels: Int = 0 + + // Get device name + var propertySize: UInt32 = UInt32(MemoryLayout?>.size) + var name: Unmanaged? = nil + propertyAddress.mSelector = kAudioDevicePropertyDeviceNameCFString + + status = AudioObjectGetPropertyData( + deviceID, + &propertyAddress, + 0, + nil, + &propertySize, + &name + ) + if status == noErr, let deviceNameCF = name?.takeUnretainedValue() as String? { + deviceName = deviceNameCF + } + + // Get input channels + propertyAddress.mSelector = kAudioDevicePropertyStreamConfiguration + propertyAddress.mScope = kAudioDevicePropertyScopeInput + status = AudioObjectGetPropertyDataSize(deviceID, &propertyAddress, 0, nil, &propertySize) + if status == noErr { + let bufferListPointer = UnsafeMutablePointer.allocate(capacity: 1) + defer { bufferListPointer.deallocate() } + status = AudioObjectGetPropertyData(deviceID, &propertyAddress, 0, nil, &propertySize, bufferListPointer) + if status == noErr { + let bufferList = UnsafeMutableAudioBufferListPointer(bufferListPointer) + for buffer in bufferList { + inputChannels += Int(buffer.mNumberChannels) + } + } + } + + if inputChannels > 0 { + devices.append(AudioDevice(id: deviceID, name: deviceName)) + } + } + + return devices + } + #endif deinit { stopRecording() @@ -336,10 +436,43 @@ public extension AudioProcessor { Logging.debug("Current audio size: \(self.audioSamples.count) samples, most recent buffer: \(buffer.count) samples, most recent energy: \(newEnergy)") } } - - func setupEngine() throws -> AVAudioEngine { + + #if os(macOS) + func assignAudioInput(inputNode: AVAudioInputNode, inputDeviceID: AudioDeviceID) { + guard let audioUnit = inputNode.audioUnit else { + Logging.error("Failed to access the audio unit of the input node.") + return + } + + var inputDeviceID = inputDeviceID + + let error = AudioUnitSetProperty( + audioUnit, + kAudioOutputUnitProperty_CurrentDevice, + kAudioUnitScope_Global, + 0, + &inputDeviceID, + UInt32(MemoryLayout.size) + ) + + if error != noErr { + Logging.error("Error setting Audio Unit property: \(error)") + } else { + Logging.info("Successfully set input device.") + } + } + #endif + + func setupEngine(inputDeviceID: DeviceID? = nil) throws -> AVAudioEngine { let audioEngine = AVAudioEngine() let inputNode = audioEngine.inputNode + + #if os(macOS) + if let inputDeviceID = inputDeviceID { + assignAudioInput(inputNode: inputNode, inputDeviceID: inputDeviceID) + } + #endif + let inputFormat = inputNode.outputFormat(forBus: 0) // Desired format (16,000 Hz, 1 channel) @@ -384,14 +517,12 @@ public extension AudioProcessor { audioSamples.removeFirst(audioSamples.count - keep) } } - - func startRecordingLive(callback: (([Float]) -> Void)? = nil) throws { + + func startRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)? = nil) throws { audioSamples = [] audioEnergy = [] - // TODO: implement selecting input device - - audioEngine = try setupEngine() + audioEngine = try setupEngine(inputDeviceID: inputDeviceID) // Set the callback audioBufferCallback = callback From 880f95e88f4c234042d275ef1edb82ecc57507f7 Mon Sep 17 00:00:00 2001 From: Zach Nagengast Date: Wed, 6 Mar 2024 17:45:03 -0800 Subject: [PATCH 2/9] macOS 13 support (#40) * Initial macOS 13 support * Handle watchos case * Test on macos 13 and 14 runners * Update test script for macos 13 simulators * Use m1 macos 13 runners * Use m1 macos 13 runners with appropriate matrix * Use m1 macos 13 runners with appropriate matrix * Reduce ios requirement to 16 * Only test watchos on macos 14 * Add ios to the avaudioapplication test * Add note on version choices * Remove missing os versions from Package.swift * Add missing os checks * Remove excess os checks causing warnings * Test mel with cpuonly on simulator --- .github/workflows/unit-tests.yml | 16 ++++++++-- Package.swift | 15 +++++---- Sources/WhisperKit/Core/AudioEncoder.swift | 2 +- Sources/WhisperKit/Core/AudioProcessor.swift | 31 +++++++++++++++++-- .../Core/AudioStreamTranscriber.swift | 3 ++ .../WhisperKit/Core/FeatureExtractor.swift | 2 +- Sources/WhisperKit/Core/LogitsFilter.swift | 6 ++-- Sources/WhisperKit/Core/Models.swift | 16 +++++----- Sources/WhisperKit/Core/SegmentSeeker.swift | 4 +-- Sources/WhisperKit/Core/TextDecoder.swift | 6 ++-- Sources/WhisperKit/Core/TokenSampler.swift | 2 +- Sources/WhisperKit/Core/Utils.swift | 1 + Sources/WhisperKit/Core/WhisperKit.swift | 3 +- Sources/WhisperKitCLI/transcribe.swift | 2 +- Tests/WhisperKitTests/FunctionalTests.swift | 2 +- Tests/WhisperKitTests/UnitTests.swift | 11 ++++--- 16 files changed, 82 insertions(+), 40 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7526d95c..5167fed2 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -9,7 +9,15 @@ on: jobs: build-and-test: - runs-on: macos-14 + strategy: + matrix: + os: [macos-13-xlarge, macos-14] + include: + - os: macos-13-xlarge + ios-version: "16.1" # oldest available version + - os: macos-14 + ios-version: "17.2" # latest available version + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: maxim-lobanov/setup-xcode@v1 @@ -40,14 +48,16 @@ jobs: run: | set -o pipefail xcodebuild clean build-for-testing -scheme whisperkit-Package -destination generic/platform=iOS | xcpretty - xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=iOS Simulator,OS=17.2,name=iPhone 15" | xcpretty + xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=iOS Simulator,OS=${{ matrix.ios-version }},name=iPhone 15" | xcpretty - name: Build and Test - watchOS + if: matrix.os == 'macos-14' run: | set -o pipefail xcodebuild clean build-for-testing -scheme whisperkit-Package -destination generic/platform=watchOS | xcpretty xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=watchOS Simulator,OS=10.2,name=Apple Watch Ultra 2 (49mm)" | xcpretty - name: Build and Test - visionOS + if: matrix.os == 'macos-14' run: | set -o pipefail xcodebuild clean build-for-testing -scheme whisperkit-Package -destination generic/platform=visionOS | xcpretty - xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=visionOS Simulator,name=Apple Vision Pro" | xcpretty \ No newline at end of file + xcodebuild test -only-testing WhisperKitTests/UnitTests -scheme whisperkit-Package -destination "platform=visionOS Simulator,name=Apple Vision Pro" | xcpretty diff --git a/Package.swift b/Package.swift index 057482c2..b4887194 100644 --- a/Package.swift +++ b/Package.swift @@ -6,10 +6,8 @@ import PackageDescription let package = Package( name: "whisperkit", platforms: [ - .iOS(.v17), - .macOS(.v14), - .watchOS(.v10), - .visionOS(.v1) + .iOS(.v16), + .macOS(.v13), ], products: [ .library( @@ -18,7 +16,8 @@ let package = Package( ), .executable( name: "transcribe", - targets: ["WhisperKitCLI"]) + targets: ["WhisperKitCLI"] + ), ], dependencies: [ .package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.2"), @@ -35,7 +34,7 @@ let package = Package( name: "WhisperKitCLI", dependencies: [ "WhisperKit", - .product(name: "ArgumentParser", package: "swift-argument-parser") + .product(name: "ArgumentParser", package: "swift-argument-parser"), ] ), .testTarget( @@ -51,11 +50,11 @@ let package = Package( "Makefile", "README.md", "LICENSE", - "CONTRIBUTING.md" + "CONTRIBUTING.md", ], resources: [ .process("Tests/WhisperKitTests/Resources"), - .copy("Models/whisperkit-coreml") + .copy("Models/whisperkit-coreml"), ] ), ] diff --git a/Sources/WhisperKit/Core/AudioEncoder.swift b/Sources/WhisperKit/Core/AudioEncoder.swift index bd32fad6..73061157 100644 --- a/Sources/WhisperKit/Core/AudioEncoder.swift +++ b/Sources/WhisperKit/Core/AudioEncoder.swift @@ -14,7 +14,7 @@ public protocol AudioEncoding { func encodeFeatures(_ features: MLMultiArray) async throws -> MLMultiArray? } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class AudioEncoder: AudioEncoding, WhisperMLModel { public var model: MLModel? diff --git a/Sources/WhisperKit/Core/AudioProcessor.swift b/Sources/WhisperKit/Core/AudioProcessor.swift index 4909f26c..0231fba4 100644 --- a/Sources/WhisperKit/Core/AudioProcessor.swift +++ b/Sources/WhisperKit/Core/AudioProcessor.swift @@ -143,7 +143,7 @@ public extension AudioProcessing { } } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class AudioProcessor: NSObject, AudioProcessing { public var audioEngine: AVAudioEngine? public var audioSamples: ContiguousArray = [] @@ -314,7 +314,32 @@ public class AudioProcessor: NSObject, AudioProcessing { } public static func requestRecordPermission() async -> Bool { - await AVAudioApplication.requestRecordPermission() + if #available(macOS 14, iOS 17, *) { + return await AVAudioApplication.requestRecordPermission() + } else { + #if os(watchOS) + // watchOS does not support AVCaptureDevice + return true + #else + let microphoneStatus = AVCaptureDevice.authorizationStatus(for: .audio) + switch microphoneStatus { + case .notDetermined: + return await withCheckedContinuation { continuation in + AVCaptureDevice.requestAccess(for: .audio) { granted in + continuation.resume(returning: granted) + } + } + case .restricted, .denied: + Logging.error("Microphone access denied") + return false + case .authorized: + return true + @unknown default: + Logging.error("Unknown authorization status") + return false + } + #endif + } } #if os(macOS) @@ -412,7 +437,7 @@ public class AudioProcessor: NSObject, AudioProcessing { // MARK: - Streaming -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public extension AudioProcessor { /// We have a new buffer, process and store it. /// NOTE: Assumes audio is 16khz mono diff --git a/Sources/WhisperKit/Core/AudioStreamTranscriber.swift b/Sources/WhisperKit/Core/AudioStreamTranscriber.swift index c3c158ff..779a25d3 100644 --- a/Sources/WhisperKit/Core/AudioStreamTranscriber.swift +++ b/Sources/WhisperKit/Core/AudioStreamTranscriber.swift @@ -3,6 +3,7 @@ import Foundation +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public extension AudioStreamTranscriber { struct State { public var isRecording: Bool = false @@ -17,9 +18,11 @@ public extension AudioStreamTranscriber { } } +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public typealias AudioStreamTranscriberCallback = (AudioStreamTranscriber.State, AudioStreamTranscriber.State) -> Void /// Responsible for streaming audio from the microphone, processing it, and transcribing it in real-time. +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public actor AudioStreamTranscriber { private var state: AudioStreamTranscriber.State = .init() { didSet { diff --git a/Sources/WhisperKit/Core/FeatureExtractor.swift b/Sources/WhisperKit/Core/FeatureExtractor.swift index 78544f72..c44e56ab 100644 --- a/Sources/WhisperKit/Core/FeatureExtractor.swift +++ b/Sources/WhisperKit/Core/FeatureExtractor.swift @@ -12,7 +12,7 @@ public protocol FeatureExtracting { func logMelSpectrogram(fromAudio inputAudio: MLMultiArray) async throws -> MLMultiArray? } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class FeatureExtractor: FeatureExtracting, WhisperMLModel { public var model: MLModel? diff --git a/Sources/WhisperKit/Core/LogitsFilter.swift b/Sources/WhisperKit/Core/LogitsFilter.swift index e71f0186..6bd60a37 100644 --- a/Sources/WhisperKit/Core/LogitsFilter.swift +++ b/Sources/WhisperKit/Core/LogitsFilter.swift @@ -9,7 +9,7 @@ public protocol LogitsFiltering { func filterLogits(_ logits: MLMultiArray, withTokens tokens: [Int]) -> MLMultiArray } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class SuppressTokensFilter: LogitsFiltering { let suppressTokens: [Int] private let suppressTokenIndexes: [[NSNumber]] @@ -25,7 +25,7 @@ public class SuppressTokensFilter: LogitsFiltering { } } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class SuppressBlankFilter: LogitsFiltering { let suppressBlankTokens: [Int] let sampleBegin: Int @@ -46,7 +46,7 @@ public class SuppressBlankFilter: LogitsFiltering { } } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class TimestampRulesFilter: LogitsFiltering { let tokenizer: Tokenizer let sampleBegin: Int diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift index 2048089f..a1d8196b 100644 --- a/Sources/WhisperKit/Core/Models.swift +++ b/Sources/WhisperKit/Core/Models.swift @@ -7,7 +7,7 @@ import NaturalLanguage import Tokenizers #if os(watchOS) || arch(arm64) -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public typealias FloatType = Float16 #else public typealias FloatType = Float @@ -200,7 +200,7 @@ public struct DecodingCache { /// - logProbThreshold: If the average log probability over sampled tokens is below this value, treat as failed. /// - noSpeechThreshold: If the no speech probability is higher than this value AND the average log /// probability over sampled tokens is below `logProbThreshold`, consider the segment as silent. -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public struct DecodingOptions { public var verbose: Bool public var task: DecodingTask @@ -489,7 +489,7 @@ public class MelSpectrogramInput: MLFeatureProvider { } /// Model Prediction Output Type -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class MelSpectrogramOutput: MLFeatureProvider { /// Source provided by CoreML private let provider: MLFeatureProvider @@ -526,7 +526,7 @@ public class MelSpectrogramOutput: MLFeatureProvider { // MARK: AudioEncoder /// Model Prediction Input Type -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class AudioEncoderInput: MLFeatureProvider { /// melspectrogram_features as 1 × {80,128} × 1 × 3000 4-dimensional array of floats public var melspectrogram_features: MLMultiArray @@ -552,7 +552,7 @@ public class AudioEncoderInput: MLFeatureProvider { } /// Model Prediction Output Type -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class AudioEncoderOutput: MLFeatureProvider { /// Source provided by CoreML private let provider: MLFeatureProvider @@ -589,7 +589,7 @@ public class AudioEncoderOutput: MLFeatureProvider { // MARK: TextDecoder /// Model Prediction Input Type -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class TextDecoderInput: MLFeatureProvider { /// input_ids as 1 element vector of 32-bit integers public var input_ids: MLMultiArray @@ -657,7 +657,7 @@ public class TextDecoderInput: MLFeatureProvider { } /// Model Prediction Output Type -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class TextDecoderOutput: MLFeatureProvider { /// Source provided by CoreML private let provider: MLFeatureProvider @@ -764,7 +764,7 @@ public class TextDecoderCachePrefillInput: MLFeatureProvider { } /// Model Prediction Output Type -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class TextDecoderCachePrefillOutput: MLFeatureProvider { /// Source provided by CoreML private let provider: MLFeatureProvider diff --git a/Sources/WhisperKit/Core/SegmentSeeker.swift b/Sources/WhisperKit/Core/SegmentSeeker.swift index 8e78415f..e37f8e63 100644 --- a/Sources/WhisperKit/Core/SegmentSeeker.swift +++ b/Sources/WhisperKit/Core/SegmentSeeker.swift @@ -6,7 +6,7 @@ import CoreML import Foundation import Tokenizers -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public protocol SegmentSeeking { func findSeekPointAndSegments( decodingResult: DecodingResult, @@ -34,7 +34,7 @@ public protocol SegmentSeeking { ) throws -> [TranscriptionSegment]? } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class SegmentSeeker: SegmentSeeking { public init() {} diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift index 97f87323..6575b32a 100644 --- a/Sources/WhisperKit/Core/TextDecoder.swift +++ b/Sources/WhisperKit/Core/TextDecoder.swift @@ -5,7 +5,7 @@ import Accelerate import CoreML import Tokenizers -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public protocol TextDecoding { var tokenizer: Tokenizer? { get set } var prefillData: WhisperMLModel? { get set } @@ -43,7 +43,7 @@ public protocol TextDecoding { ) } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public extension TextDecoding { func prepareDecoderInputs(withPrompt initialPrompt: [Int]) -> DecodingInputs? { let tokenShape = [NSNumber(value: 1), NSNumber(value: initialPrompt.count)] @@ -234,7 +234,7 @@ public class TextDecoderContextPrefill: WhisperMLModel { public var model: MLModel? } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class TextDecoder: TextDecoding, WhisperMLModel { public var model: MLModel? public var tokenizer: Tokenizer? diff --git a/Sources/WhisperKit/Core/TokenSampler.swift b/Sources/WhisperKit/Core/TokenSampler.swift index d06b69d6..19470543 100644 --- a/Sources/WhisperKit/Core/TokenSampler.swift +++ b/Sources/WhisperKit/Core/TokenSampler.swift @@ -16,7 +16,7 @@ public struct SamplingResult { public var completed: Bool } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class GreedyTokenSampler: TokenSampling { public var temperature: FloatType public var eotToken: Int diff --git a/Sources/WhisperKit/Core/Utils.swift b/Sources/WhisperKit/Core/Utils.swift index e90bcc6d..193b2168 100644 --- a/Sources/WhisperKit/Core/Utils.swift +++ b/Sources/WhisperKit/Core/Utils.swift @@ -39,6 +39,7 @@ extension MLMultiArray { } } +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) func initMLMultiArray(shape: [NSNumber], dataType: MLMultiArrayDataType, initialValue: Any) -> MLMultiArray { let multiArray = try! MLMultiArray(shape: shape, dataType: dataType) diff --git a/Sources/WhisperKit/Core/WhisperKit.swift b/Sources/WhisperKit/Core/WhisperKit.swift index 649674b9..9628a9fc 100644 --- a/Sources/WhisperKit/Core/WhisperKit.swift +++ b/Sources/WhisperKit/Core/WhisperKit.swift @@ -9,12 +9,13 @@ import Hub import TensorUtils import Tokenizers +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public protocol Transcriber { func transcribe(audioPath: String, decodeOptions: DecodingOptions?, callback: TranscriptionCallback) async throws -> TranscriptionResult? func transcribe(audioArray: [Float], decodeOptions: DecodingOptions?, callback: TranscriptionCallback) async throws -> TranscriptionResult? } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public class WhisperKit: Transcriber { /// Models public var modelVariant: ModelVariant = .tiny diff --git a/Sources/WhisperKitCLI/transcribe.swift b/Sources/WhisperKitCLI/transcribe.swift index d2d73a6a..7a1c571a 100644 --- a/Sources/WhisperKitCLI/transcribe.swift +++ b/Sources/WhisperKitCLI/transcribe.swift @@ -7,7 +7,7 @@ import Foundation import WhisperKit -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) @main struct WhisperKitCLI: AsyncParsableCommand { @Option(help: "Path to audio file") diff --git a/Tests/WhisperKitTests/FunctionalTests.swift b/Tests/WhisperKitTests/FunctionalTests.swift index e783adfb..44fd92f3 100644 --- a/Tests/WhisperKitTests/FunctionalTests.swift +++ b/Tests/WhisperKitTests/FunctionalTests.swift @@ -5,7 +5,7 @@ import CoreML @testable import WhisperKit import XCTest -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) final class FunctionalTests: XCTestCase { func testInitLarge() async { let modelPath = largev3ModelPath() diff --git a/Tests/WhisperKitTests/UnitTests.swift b/Tests/WhisperKitTests/UnitTests.swift index 96f02e93..c6781d21 100644 --- a/Tests/WhisperKitTests/UnitTests.swift +++ b/Tests/WhisperKitTests/UnitTests.swift @@ -7,7 +7,7 @@ import Tokenizers @testable import WhisperKit import XCTest -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) final class UnitTests: XCTestCase { func testInit() async { let whisperKit = try? await WhisperKit(prewarm: false, load: false, download: false) @@ -226,7 +226,9 @@ final class UnitTests: XCTestCase { } func testWindowing() async { - let computeOptions = ModelComputeOptions() + let computeOptions = ModelComputeOptions( + melCompute: .cpuOnly + ) let whisperKit = try? await WhisperKit(modelFolder: tinyModelPath(), computeOptions: computeOptions, verbose: true, logLevel: .debug) guard let audioFilePath = Bundle.module.path(forResource: "jfk", ofType: "wav") else { @@ -373,7 +375,7 @@ final class UnitTests: XCTestCase { XCTFail("Failed to transcribe") return } - XCTAssertEqual(result.text.prefix(4), "東京は晴") + XCTAssertEqual(result.text.prefix(3), "東京は") } func testNoTimestamps() async { @@ -831,6 +833,7 @@ final class UnitTests: XCTestCase { // MARK: Helpers +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) extension MLMultiArray { /// Create `MLMultiArray` of shape [1, 1, arr.count] and fill up the last /// dimension with with values from arr. @@ -858,7 +861,7 @@ extension MLMultiArray { } } -@available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) extension XCTestCase { func transcribe(with variant: ModelVariant, options: DecodingOptions, audioFile: String = "jfk.wav", file: StaticString = #file, line: UInt = #line) async throws -> TranscriptionResult? { var modelPath = tinyModelPath() From 20c90018a8948e5c323abf8c85d1fbda8ef21bdc Mon Sep 17 00:00:00 2001 From: Jan Krukowski Date: Thu, 7 Mar 2024 16:27:25 +0100 Subject: [PATCH 3/9] Fixed Conformance of 'Float16' warning (#58) * fixed warnings * removed fixme --- Sources/WhisperKit/Core/Models.swift | 8 ++++++-- Sources/WhisperKit/Core/TokenSampler.swift | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift index a1d8196b..e9a7b2bd 100644 --- a/Sources/WhisperKit/Core/Models.swift +++ b/Sources/WhisperKit/Core/Models.swift @@ -1,18 +1,22 @@ // For licensing see accompanying LICENSE.md file. // Copyright © 2024 Argmax, Inc. All rights reserved. +import Accelerate import CoreML import Hub import NaturalLanguage import Tokenizers -#if os(watchOS) || arch(arm64) -@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) +#if !((os(macOS) || targetEnvironment(macCatalyst)) && arch(x86_64)) public typealias FloatType = Float16 #else public typealias FloatType = Float #endif +#if (os(macOS) || targetEnvironment(macCatalyst)) && arch(arm64) +extension Float16: BNNSScalar {} +#endif + // MARK: - CoreML public protocol WhisperMLModel { diff --git a/Sources/WhisperKit/Core/TokenSampler.swift b/Sources/WhisperKit/Core/TokenSampler.swift index 19470543..931331a4 100644 --- a/Sources/WhisperKit/Core/TokenSampler.swift +++ b/Sources/WhisperKit/Core/TokenSampler.swift @@ -44,7 +44,7 @@ public class GreedyTokenSampler: TokenSampling { let logitsDescriptor = BNNSNDArrayDescriptor( data: logitsRawPointer, - scalarType: FloatType.self, // FIXME: Float16 here breaks in swift 6 + scalarType: FloatType.self, shape: .vector(logits.count, stride: 1) )! From df15f89289d5ac7b57138ef5b6fc7328c1cb178f Mon Sep 17 00:00:00 2001 From: Finn Voorhees Date: Thu, 7 Mar 2024 15:46:20 +0000 Subject: [PATCH 4/9] Fix memory leak from non-async MLModel prediction (#56) --- Sources/WhisperKit/Core/AudioEncoder.swift | 2 +- Sources/WhisperKit/Core/FeatureExtractor.swift | 2 +- Sources/WhisperKit/Core/TextDecoder.swift | 4 ++-- Sources/WhisperKit/Core/Utils.swift | 15 +++++++++++++++ 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/Sources/WhisperKit/Core/AudioEncoder.swift b/Sources/WhisperKit/Core/AudioEncoder.swift index 73061157..8b66b1f2 100644 --- a/Sources/WhisperKit/Core/AudioEncoder.swift +++ b/Sources/WhisperKit/Core/AudioEncoder.swift @@ -46,7 +46,7 @@ public class AudioEncoder: AudioEncoding, WhisperMLModel { try Task.checkCancellation() - let outputFeatures = try await model.prediction(from: modelInputs, options: MLPredictionOptions()) + let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) let output = AudioEncoderOutput(features: outputFeatures) diff --git a/Sources/WhisperKit/Core/FeatureExtractor.swift b/Sources/WhisperKit/Core/FeatureExtractor.swift index c44e56ab..4838fa06 100644 --- a/Sources/WhisperKit/Core/FeatureExtractor.swift +++ b/Sources/WhisperKit/Core/FeatureExtractor.swift @@ -35,7 +35,7 @@ public class FeatureExtractor: FeatureExtracting, WhisperMLModel { try Task.checkCancellation() - let outputFeatures = try await model.prediction(from: modelInputs, options: MLPredictionOptions()) + let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) let output = MelSpectrogramOutput(features: outputFeatures) diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift index 6575b32a..2e1fbd1c 100644 --- a/Sources/WhisperKit/Core/TextDecoder.swift +++ b/Sources/WhisperKit/Core/TextDecoder.swift @@ -176,7 +176,7 @@ public extension TextDecoding { try Task.checkCancellation() - let outputFeatures = try await prefillModel.prediction(from: modelInputs, options: MLPredictionOptions()) + let outputFeatures = try await prefillModel.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) let output = TextDecoderCachePrefillOutput(features: outputFeatures) @@ -291,7 +291,7 @@ public class TextDecoder: TextDecoding, WhisperMLModel { try Task.checkCancellation() - let outputFeatures = try await model.prediction(from: modelInputs, options: MLPredictionOptions()) + let outputFeatures = try await model.asyncPrediction(from: modelInputs, options: MLPredictionOptions()) let output = TextDecoderOutput(features: outputFeatures) diff --git a/Sources/WhisperKit/Core/Utils.swift b/Sources/WhisperKit/Core/Utils.swift index 193b2168..ce614902 100644 --- a/Sources/WhisperKit/Core/Utils.swift +++ b/Sources/WhisperKit/Core/Utils.swift @@ -39,6 +39,21 @@ extension MLMultiArray { } } +extension MLModel { + func asyncPrediction( + from input: MLFeatureProvider, + options: MLPredictionOptions + ) async throws -> MLFeatureProvider { + if #available(macOS 14, iOS 17, watchOS 10, visionOS 1, *) { + return try await prediction(from: input, options: options) + } else { + return try await Task { + try prediction(from: input, options: options) + }.value + } + } +} + @available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) func initMLMultiArray(shape: [NSNumber], dataType: MLMultiArrayDataType, initialValue: Any) -> MLMultiArray { let multiArray = try! MLMultiArray(shape: shape, dataType: dataType) From d08fb1b67567d92b481adfedf2a6b42d51ab2c58 Mon Sep 17 00:00:00 2001 From: Finn Voorhees Date: Fri, 8 Mar 2024 00:27:34 +0000 Subject: [PATCH 5/9] Expose downloadBase in WhisperKit init (#57) --- Sources/WhisperKit/Core/WhisperKit.swift | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Sources/WhisperKit/Core/WhisperKit.swift b/Sources/WhisperKit/Core/WhisperKit.swift index 9628a9fc..f0678c7a 100644 --- a/Sources/WhisperKit/Core/WhisperKit.swift +++ b/Sources/WhisperKit/Core/WhisperKit.swift @@ -49,6 +49,7 @@ public class WhisperKit: Transcriber { public init( model: String? = nil, + downloadBase: URL? = nil, modelRepo: String? = nil, modelFolder: String? = nil, computeOptions: ModelComputeOptions? = nil, @@ -74,7 +75,7 @@ public class WhisperKit: Transcriber { Logging.shared.logLevel = verbose ? logLevel : .none currentTimings = TranscriptionTimings() - try await setupModels(model: model, modelRepo: modelRepo, modelFolder: modelFolder, download: download) + try await setupModels(model: model, downloadBase: downloadBase, modelRepo: modelRepo, modelFolder: modelFolder, download: download) if let prewarm = prewarm, prewarm { Logging.info("Prewarming models...") @@ -179,7 +180,7 @@ public class WhisperKit: Transcriber { } /// Sets up the model folder either from a local path or by downloading from a repository. - public func setupModels(model: String?, modelRepo: String?, modelFolder: String?, download: Bool) async throws { + public func setupModels(model: String?, downloadBase: URL? = nil, modelRepo: String?, modelFolder: String?, download: Bool) async throws { // Determine the model variant to use let modelVariant = model ?? WhisperKit.recommendedModels().default @@ -189,7 +190,7 @@ public class WhisperKit: Transcriber { } else if download { let repo = modelRepo ?? "argmaxinc/whisperkit-coreml" do { - let hubModelFolder = try await Self.download(variant: modelVariant, from: repo) + let hubModelFolder = try await Self.download(variant: modelVariant, downloadBase: downloadBase, from: repo) self.modelFolder = hubModelFolder! } catch { // Handle errors related to model downloading From c0dc2644ae1313ea2a1da9c32257e7ebdee1a46f Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 8 Mar 2024 14:58:07 -0800 Subject: [PATCH 6/9] Disable early stopping during prefill tokens, fixing #63 --- Sources/WhisperKit/Core/TextDecoder.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift index 2e1fbd1c..bc22446e 100644 --- a/Sources/WhisperKit/Core/TextDecoder.swift +++ b/Sources/WhisperKit/Core/TextDecoder.swift @@ -484,7 +484,7 @@ public class TextDecoder: TextDecoding, WhisperMLModel { // Call the callback if it is provided if let shouldContinue = callback?(result) { - if !shouldContinue { + if !shouldContinue && !isPrefill { Logging.debug("Early stopping") break } From 7fcda5194d0d64dce6bb6b4136b0f52b70dcd28e Mon Sep 17 00:00:00 2001 From: ZachNagengast Date: Fri, 8 Mar 2024 14:58:39 -0800 Subject: [PATCH 7/9] Enable ipad interface for example app #60 --- Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj b/Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj index 4a4d4bcc..743e7730 100644 --- a/Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj +++ b/Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj @@ -872,9 +872,10 @@ SDKROOT = auto; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; SUPPORTS_MACCATALYST = NO; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = YES; SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -915,9 +916,10 @@ SDKROOT = auto; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx"; SUPPORTS_MACCATALYST = NO; + SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = YES; SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_VERSION = 5.0; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; From ccdd77d56d3ae4d5636251f88dc801678d029a82 Mon Sep 17 00:00:00 2001 From: Chase Farmer Date: Fri, 8 Mar 2024 15:19:36 -0800 Subject: [PATCH 8/9] Add audio device selector to transcribe + take a stab at Delete/Retry models (#54) * Add audio devices to transcribe view * Add delete and retry buttons * Adjust mic device picker location * RM retry --------- Co-authored-by: ZachNagengast --- .../WhisperAX/Views/ContentView.swift | 148 ++++++++++++------ 1 file changed, 100 insertions(+), 48 deletions(-) diff --git a/Examples/WhisperAX/WhisperAX/Views/ContentView.swift b/Examples/WhisperAX/WhisperAX/Views/ContentView.swift index 2bb6982e..92fe7fb6 100644 --- a/Examples/WhisperAX/WhisperAX/Views/ContentView.swift +++ b/Examples/WhisperAX/WhisperAX/Views/ContentView.swift @@ -246,6 +246,16 @@ struct ContentView: View { .progressViewStyle(CircularProgressViewStyle()) .scaleEffect(0.5) } + + Button(action: { + deleteModel() + }, label: { + Image(systemName: "trash") + }) + .help("Delete model") + .buttonStyle(BorderlessButtonStyle()) + .disabled(localModels.count == 0) + .disabled(!localModels.contains(selectedModel)) #if os(macOS) Button(action: { @@ -306,6 +316,33 @@ struct ContentView: View { } // MARK: - Controls + var audioDevicesView: some View { + Group { + #if os(macOS) + HStack { + if let audioDevices = audioDevices, audioDevices.count > 0 { + Picker("", selection: $selectedAudioInput) { + ForEach(audioDevices, id: \.self) { device in + Text(device.name).tag(device.name) + } + } + .frame(width: 250) + .disabled(isRecording) + } + } + .onAppear { + audioDevices = AudioProcessor.getAudioDevices() + if let audioDevices = audioDevices, + !audioDevices.isEmpty, + selectedAudioInput == "No Audio Input", + let device = audioDevices.first { + selectedAudioInput = device.name + } + } + #endif + } + } + var controlsView: some View { VStack { basicSettingsView @@ -321,7 +358,13 @@ struct ContentView: View { Label("Reset", systemImage: "arrow.clockwise") } .buttonStyle(.borderless) + + Spacer() + + audioDevicesView + Spacer() + Button { showAdvancedOptions.toggle() } label: { @@ -395,63 +438,50 @@ struct ContentView: View { } } case "Stream": - HStack { - Button { - resetState() - } label: { - Label("Reset", systemImage: "arrow.clockwise") - } - .frame(minWidth: 0, maxWidth: .infinity) - .buttonStyle(.borderless) - - Button { - withAnimation { - toggleRecording(shouldLoop: true) - } - } label: { - Image(systemName: !isRecording ? "record.circle" : "stop.circle.fill") - .resizable() - .scaledToFit() - .frame(width: 70, height: 70) - .padding() - .foregroundColor(modelState != .loaded ? .gray : .red) - } - .contentTransition(.symbolEffect(.replace)) - .buttonStyle(BorderlessButtonStyle()) - .disabled(modelState != .loaded) - .frame(minWidth: 0, maxWidth: .infinity) - - VStack { + VStack { + HStack { Button { - showAdvancedOptions.toggle() + resetState() } label: { - Label("Settings", systemImage: "slider.horizontal.3") + Label("Reset", systemImage: "arrow.clockwise") } .frame(minWidth: 0, maxWidth: .infinity) .buttonStyle(.borderless) - - #if os(macOS) - HStack { - if let audioDevices = audioDevices, audioDevices.count > 0 { - Picker("", selection: $selectedAudioInput) { - ForEach(audioDevices, id: \.self) { device in - Text(device.name).tag(device.name) - } - } - .frame(minWidth: 80) - .disabled(isRecording) + + Spacer() + + audioDevicesView + + Spacer() + + VStack { + Button { + showAdvancedOptions.toggle() + } label: { + Label("Settings", systemImage: "slider.horizontal.3") } + .frame(minWidth: 0, maxWidth: .infinity) + .buttonStyle(.borderless) } - .onAppear { - audioDevices = AudioProcessor.getAudioDevices() - if let audioDevices = audioDevices, - !audioDevices.isEmpty, - selectedAudioInput == "No Audio Input", - let device = audioDevices.first { - selectedAudioInput = device.name + } + + HStack { + Button { + withAnimation { + toggleRecording(shouldLoop: true) } + } label: { + Image(systemName: !isRecording ? "record.circle" : "stop.circle.fill") + .resizable() + .scaledToFit() + .frame(width: 70, height: 70) + .padding() + .foregroundColor(modelState != .loaded ? .gray : .red) } - #endif + .contentTransition(.symbolEffect(.replace)) + .buttonStyle(BorderlessButtonStyle()) + .disabled(modelState != .loaded) + .frame(minWidth: 0, maxWidth: .infinity) } } default: @@ -779,6 +809,10 @@ struct ContentView: View { try await whisperKit.loadModels() await MainActor.run { + if !localModels.contains(model) { + localModels.append(model) + } + availableLanguages = whisperKit.tokenizer?.langauges.map { $0.key }.sorted() ?? ["english"] loadingProgressValue = 1.0 modelState = whisperKit.modelState @@ -786,6 +820,24 @@ struct ContentView: View { } } } + + func deleteModel() { + if localModels.contains(selectedModel) { + let modelFolder = URL(fileURLWithPath: localModelPath).appendingPathComponent("openai_whisper-\(selectedModel)") + + do { + try FileManager.default.removeItem(at: modelFolder) + + if let index = localModels.firstIndex(of: selectedModel) { + localModels.remove(at: index) + } + + modelState = .unloaded + } catch { + print("Error deleting model: \(error)") + } + } + } func updateProgressBar(targetProgress: Float, maxTime: TimeInterval) async { let initialProgress = loadingProgressValue From bfa357e897dadae0dca6314424bb119149e375d2 Mon Sep 17 00:00:00 2001 From: bharat9806 <60149810+bharat9806@users.noreply.github.com> Date: Sat, 9 Mar 2024 05:29:21 +0530 Subject: [PATCH 9/9] Issue - 42 WhisperKit support simulator fixed (#52) * Issue - 42 WhisperKit support simulator fixed * Issue - 42 enhancement done * Setup AVAudioSession, which fixes crash in simulators * Add availability checks for model compute options --------- Co-authored-by: ZachNagengast --- .../xcshareddata/swiftpm/Package.resolved | 3 ++- .../WhisperAXExampleView.swift | 2 +- Sources/WhisperKit/Core/AudioProcessor.swift | 24 ++++++++++++++++++- Sources/WhisperKit/Core/Models.swift | 8 +++++++ Sources/WhisperKit/Core/Utils.swift | 11 +++++++++ 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 307759dd..3877d56d 100644 --- a/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/Examples/WhisperAX/WhisperAX.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "cd17206b47bb810af9459722192530e3838d8e6629a970988e32a432aaa05f6e", "pins" : [ { "identity" : "networkimage", @@ -37,5 +38,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift b/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift index 5e206781..2809c834 100644 --- a/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift +++ b/Examples/WhisperAX/WhisperAXWatchApp/WhisperAXExampleView.swift @@ -73,7 +73,7 @@ struct WhisperAXWatchView: View { var body: some View { NavigationSplitView { - if WhisperKit.deviceName().hasPrefix("Watch7") { + if WhisperKit.deviceName().hasPrefix("Watch7") || WhisperKit.isRunningOnSimulator { modelSelectorView .navigationTitle("WhisperAX") .navigationBarTitleDisplayMode(.automatic) diff --git a/Sources/WhisperKit/Core/AudioProcessor.swift b/Sources/WhisperKit/Core/AudioProcessor.swift index 0231fba4..a746135c 100644 --- a/Sources/WhisperKit/Core/AudioProcessor.swift +++ b/Sources/WhisperKit/Core/AudioProcessor.swift @@ -487,7 +487,27 @@ public extension AudioProcessor { } } #endif - + + /// Attempts to setup the shared audio session if available on the device's OS + func setupAudioSessionForDevice() throws { + #if !os(macOS) // AVAudioSession is not available on macOS + + #if !os(watchOS) // watchOS does not support .defaultToSpeaker + let options: AVAudioSession.CategoryOptions = [.defaultToSpeaker, .allowBluetooth] + #else + let options: AVAudioSession.CategoryOptions = .mixWithOthers + #endif + + let audioSession = AVAudioSession.sharedInstance() + do { + try audioSession.setCategory(.playAndRecord, options: options) + try audioSession.setActive(true, options: .notifyOthersOnDeactivation) + } catch let error as NSError { + throw WhisperError.audioProcessingFailed("Failed to set up audio session: \(error)") + } + #endif + } + func setupEngine(inputDeviceID: DeviceID? = nil) throws -> AVAudioEngine { let audioEngine = AVAudioEngine() let inputNode = audioEngine.inputNode @@ -546,6 +566,8 @@ public extension AudioProcessor { func startRecordingLive(inputDeviceID: DeviceID? = nil, callback: (([Float]) -> Void)? = nil) throws { audioSamples = [] audioEnergy = [] + + try? setupAudioSessionForDevice() audioEngine = try setupEngine(inputDeviceID: inputDeviceID) diff --git a/Sources/WhisperKit/Core/Models.swift b/Sources/WhisperKit/Core/Models.swift index e9a7b2bd..50e92283 100644 --- a/Sources/WhisperKit/Core/Models.swift +++ b/Sources/WhisperKit/Core/Models.swift @@ -126,6 +126,7 @@ public enum ModelState: CustomStringConvertible { } } +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) public struct ModelComputeOptions { public var melCompute: MLComputeUnits public var audioEncoderCompute: MLComputeUnits @@ -138,6 +139,13 @@ public struct ModelComputeOptions { textDecoderCompute: MLComputeUnits = .cpuAndNeuralEngine, prefillCompute: MLComputeUnits = .cpuOnly ) { + if WhisperKit.isRunningOnSimulator { + self.melCompute = .cpuOnly + self.audioEncoderCompute = .cpuOnly + self.textDecoderCompute = .cpuOnly + self.prefillCompute = .cpuOnly + return + } self.melCompute = melCompute self.audioEncoderCompute = audioEncoderCompute self.textDecoderCompute = textDecoderCompute diff --git a/Sources/WhisperKit/Core/Utils.swift b/Sources/WhisperKit/Core/Utils.swift index ce614902..8d9e5202 100644 --- a/Sources/WhisperKit/Core/Utils.swift +++ b/Sources/WhisperKit/Core/Utils.swift @@ -238,6 +238,17 @@ extension Process { } #endif +@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *) +public extension WhisperKit { + static var isRunningOnSimulator: Bool { + #if targetEnvironment(simulator) + return true + #else + return false + #endif + } +} + public func resolveAbsolutePath(_ inputPath: String) -> String { let fileManager = FileManager.default