As for WhisperKit, things have been shaping up nicely with roughly the following code.
import Accelerate
import AVFoundation
import CoreML
import Foundation
import SwiftData
import SwiftUI
import WhisperKit
@MainActor
class TranscriptionViewModel: ObservableObject {
    enum VADState: String {
        case silent = "Silent"
        case talking = "Talking"
    }

    @AppStorage("selectedLanguage") private var selectedLanguage: String = Locale.current.language.languageCode?.identifier ?? "ja"
    @AppStorage("selectedModel") private var selectedModel: String = WhisperModel.none.rawValue
    @AppStorage("inputLanguage") private var inputLanguage: String = "ja"
    @AppStorage("enableTranslation") private var enableTranslation: Bool = false

    @Published var transcribedText: String = ""
    @Published var isRecording: Bool = false
    @Published var isModelReady: Bool = false
    @Published var modelState: ModelState = .notConfigured
    @Published var exportedAudioURL: URL? = nil
    @Published var isFinalizing = false
    @Published var currentItem: Item? = nil
    @Published var vadState: VADState = .silent
    @Published var liveText: String = ""
    @Published var downloadProgress: Double = 0.0 // 0.0–1.0

    private var processingItem: Item? = nil
    private var audioRecorder = AudioRecorderControl()
    private var whisperKit: WhisperKit?
    private var transcriber: AudioStreamTranscriber?
    private let player = AudioPlayer()

    // 🔑 Cache the draft so it can be recovered at the end
    private var lastUnconfirmedDraft: String = ""

    // 🔇 Lightweight watchdog that force-flushes unconfirmed text after 3 s of silence
    private let silenceHoldSec: TimeInterval = 3.0
    private var lastVoiceAt: Date = .distantPast
    private var lastForceFlushAt: Date? = nil

    var onNewItem: ((Item) -> Void)?

    var canRecord: Bool {
        isModelReady && !isRecording
    }
    enum ModelState: CustomStringConvertible, Equatable {
        case notConfigured, loadingModel, ready, recording, failed(Error)

        var description: String {
            switch self {
            case .notConfigured: return "Model not configured"
            case .loadingModel: return "Loading model…"
            case .ready: return "Model ready"
            case .recording: return "Recording"
            case let .failed(e): return "Error: \(e.localizedDescription)"
            }
        }

        static func == (lhs: ModelState, rhs: ModelState) -> Bool {
            switch (lhs, rhs) {
            case (.notConfigured, .notConfigured), (.loadingModel, .loadingModel), (.ready, .ready), (.recording, .recording):
                return true
            case let (.failed(e1), .failed(e2)):
                return e1.localizedDescription == e2.localizedDescription
            default:
                return false
            }
        }
    }
    init() {
        if selectedModel != WhisperModel.none.rawValue {
            Task { await loadModel(selectedModel) }
        }
    }
    func loadModel(_ model: String, redownload: Bool = false) async {
        modelState = .loadingModel
        do {
            let config = WhisperKitConfig(
                computeOptions: .init(audioEncoderCompute: .cpuAndNeuralEngine,
                                      textDecoderCompute: .cpuAndNeuralEngine),
                logLevel: .none,
                prewarm: false
            )
            whisperKit = try await WhisperKit(config)
            guard let whisperKit = whisperKit else {
                throw NSError(domain: "WhisperKitInit", code: 1,
                              userInfo: [NSLocalizedDescriptionKey: "Failed to initialize WhisperKit"])
            }

            // ✅ Prefer the cached model if one exists
            let supportDir = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first!
            let modelDir = supportDir.appendingPathComponent("WhisperModels/\(model)", isDirectory: true)

            if FileManager.default.fileExists(atPath: modelDir.path) && !redownload {
                whisperKit.modelFolder = modelDir
                await MainActor.run { self.downloadProgress = 1.0 }
            } else {
                // Download only on first use
                let folder = try await WhisperKit.download(
                    variant: model,
                    from: "argmaxinc/whisperkit-coreml",
                    progressCallback: { progress in
                        DispatchQueue.main.async {
                            self.downloadProgress = progress.fractionCompleted * 0.8 // download accounts for 0–80%
                        }
                    }
                )
                try FileManager.default.createDirectory(
                    at: modelDir.deletingLastPathComponent(),
                    withIntermediateDirectories: true
                )
                try FileManager.default.moveItem(at: folder, to: modelDir)
                whisperKit.modelFolder = modelDir
            }

            // ✅ Load the models only once
            if !isModelReady {
                await MainActor.run { self.downloadProgress = 0.85 }
                try await whisperKit.prewarmModels()
                await MainActor.run { self.downloadProgress = 0.9 }
                try await whisperKit.loadModels()
                await MainActor.run { self.downloadProgress = 1.0 }
            }

            guard whisperKit.tokenizer != nil else {
                throw NSError(domain: "WhisperKitInit", code: 2,
                              userInfo: [NSLocalizedDescriptionKey: "Tokenizer is nil"])
            }

            transcriber = makeTranscriber()
            isModelReady = true
            await MainActor.run { self.modelState = .ready }
        } catch {
            await MainActor.run {
                self.modelState = .failed(error)
                if error.localizedDescription.contains("MILCompilerForANE") {
                    self.transcribedText = "⚠️ Could not initialize the model on the Neural Engine. Please select a different model."
                } else {
                    self.transcribedText = "💥 Model initialization error: \(error.localizedDescription)"
                }
            }
            print("💥 WhisperKit model init error: \(error.localizedDescription)")
        }
    }
    func toggleRecording() async {
        if isRecording {
            // Stop
            isRecording = false
            modelState = .ready
            isFinalizing = true

            // 🔚 Give it a brief moment to settle
            try? await Task.sleep(nanoseconds: 150_000_000)
            await transcriber?.stopStreamTranscription()

            // ✅ Force-confirm any unconfirmed text on stop (suffix comparison avoids duplicates)
            if !lastUnconfirmedDraft.isEmpty {
                appendDedup(lastUnconfirmedDraft)
                lastUnconfirmedDraft = ""
            }

            audioRecorder.stopRecording()
            processingItem?.audioFilePath = audioRecorder.lastRecordedURL?.path

            // ✅ Always discard the transcriber on stop
            transcriber = nil
        } else {
            // Start
            let newItem = Item(timestamp: Date())
            processingItem = newItem
            currentItem = Item(timestamp: newItem.timestamp)
            lastConfirmedSegmentEndSeconds = 0.0
            transcribedText = ""
            isRecording = true
            modelState = .recording
            isFinalizing = false
            lastVoiceAt = Date()
            lastForceFlushAt = nil
            lastUnconfirmedDraft = ""

            try? AVAudioSession.sharedInstance().setActive(true)
            audioRecorder.startRecording()

            // ✅ Wait just 0.1 s if needed (kept short)
            try? await Task.sleep(nanoseconds: 100_000_000)

            // ✅ Discarded on stop, so create a fresh one here every time
            transcriber = makeTranscriber()
            try? await transcriber?.startStreamTranscription()
        }
    }
    private func makeTranscriber() -> AudioStreamTranscriber? {
        guard let whisperKit = whisperKit,
              let tokenizer = whisperKit.tokenizer else {
            print("❌ WhisperKit / tokenizer is not initialized")
            return nil
        }
        let decodeOptions = DecodingOptions(
            task: enableTranslation ? .translate : .transcribe,
            language: inputLanguage,
            temperature: 0.0,
            skipSpecialTokens: true,
            suppressBlank: true,
            compressionRatioThreshold: 2.4,
            logProbThreshold: -1.0,
            noSpeechThreshold: 0.6,
            concurrentWorkerCount: 3
        )
        return AudioStreamTranscriber(
            audioEncoder: whisperKit.audioEncoder,
            featureExtractor: whisperKit.featureExtractor,
            segmentSeeker: whisperKit.segmentSeeker,
            textDecoder: whisperKit.textDecoder,
            tokenizer: tokenizer,
            audioProcessor: whisperKit.audioProcessor,
            decodingOptions: decodeOptions,
            requiredSegmentsForConfirmation: 1, // bias toward fast confirmation
            silenceThreshold: 0.15,
            compressionCheckWindow: 0,
            useVAD: true,
            stateChangeCallback: { [weak self] _, newState in
                // 🔑 Cache the draft (hop to the main actor before touching state)
                let draft = newState.unconfirmedText
                    .map { $0.replacingOccurrences(of: "Waiting for speech", with: "") }
                    .joined()
                Task { @MainActor in
                    self?.lastUnconfirmedDraft = draft
                    await self?.handleStateChange(newState)
                }
            }
        )
    }
    private var lastConfirmedSegmentEndSeconds: Double = 0.0

    private func handleStateChange(_ state: AudioStreamTranscriber.State) async {
        let segments = state.confirmedSegments

        // Extract only the new segments
        let newSegments = segments.filter { Double($0.end) > self.lastConfirmedSegmentEndSeconds }

        // Append the text (speed first), avoiding duplicates
        newSegments.forEach { segment in
            appendDedup(segment.text)
        }

        // Update the latest end time
        if let last = newSegments.last {
            self.lastConfirmedSegmentEndSeconds = Double(last.end)
            if let processing = self.processingItem {
                self.currentItem?.whisText = processing.whisText
            }
        }

        // Update the draft
        let draft = state.unconfirmedText
            .map { $0.replacingOccurrences(of: "Waiting for speech", with: "") }
            .joined()
        self.liveText = (currentItem?.whisText ?? "") + draft

        // Lightweight VAD state update + silence watchdog
        let energy = state.bufferEnergy.last ?? 0
        if energy > 0.3 {
            self.vadState = .talking
            self.lastVoiceAt = Date()
        } else {
            self.vadState = .silent
            // Flush unconfirmed text after 3+ seconds of silence (only while recording)
            if self.isRecording,
               !draft.isEmpty,
               Date().timeIntervalSince(self.lastVoiceAt) >= self.silenceHoldSec
            {
                // Suppress if a flush happened very recently
                if self.lastForceFlushAt == nil || Date().timeIntervalSince(self.lastForceFlushAt!) > 0.8 {
                    appendDedup(draft)
                    self.lastUnconfirmedDraft = ""
                    self.currentItem?.whisText = self.processingItem?.whisText ?? ""
                    self.liveText = self.currentItem?.whisText ?? ""
                    self.lastForceFlushAt = Date()
                }
            }
        }

        // Model state: post-processing once inference has finished
        if !self.isRecording && !state.isRecording {
            self.modelState = .ready
            self.isFinalizing = false
            if let item = self.processingItem {
                self.onNewItem?(item)
            }
            self.processingItem = nil
            self.currentItem = nil
        }
    }
    // Append safely, avoiding a duplicated suffix
    private func appendDedup(_ text: String) {
        let t = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !t.isEmpty else { return }
        if !transcribedText.hasSuffix(t) {
            processingItem?.whisText += t
            transcribedText += t
        }
    }
}
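
For reference, here is a minimal sketch of how a SwiftUI view might drive this view model. The view name and layout are hypothetical (not from the actual app), and it assumes Item and WhisperModel exist as in the project:

// Hypothetical usage sketch: wires the published properties to a simple UI
struct TranscriptionView: View {
    @StateObject private var viewModel = TranscriptionViewModel()

    var body: some View {
        VStack(spacing: 12) {
            Text(viewModel.modelState.description)
            Text(viewModel.vadState.rawValue)
            ScrollView {
                // While recording, show confirmed text plus the live draft;
                // afterwards, show only the confirmed transcript
                Text(viewModel.isRecording ? viewModel.liveText : viewModel.transcribedText)
                    .frame(maxWidth: .infinity, alignment: .leading)
            }
            if viewModel.downloadProgress < 1.0 {
                ProgressView(value: viewModel.downloadProgress)
            }
            Button(viewModel.isRecording ? "Stop" : "Record") {
                Task { await viewModel.toggleRecording() }
            }
            .disabled(!viewModel.isModelReady)
        }
        .padding()
        .onAppear {
            // Collect finished items as they are finalized
            viewModel.onNewItem = { item in
                print("New item:", item.timestamp)
            }
        }
    }
}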