Swift WhisperKit その2

開発


WhisperKitの方は、

大体以下のコードでいい感じになってきた。

import Accelerate
import AVFoundation
import CoreML
import Foundation
import SwiftData
import SwiftUI
import WhisperKit


/// Drives live speech-to-text with WhisperKit: owns the model lifecycle
/// (download → prewarm → load), a streaming transcriber, and the recording
/// state that SwiftUI observes.
@MainActor
class TranscriptionViewModel: ObservableObject {
    /// Lightweight voice-activity indicator for the UI.
    enum VADState: String {
        case silent = "静か"
        case talking = "話し中"
    }

    // MARK: - Persisted settings

    @AppStorage("selectedLanguage") private var selectedLanguage: String = Locale.current.languageCode ?? "ja"
    @AppStorage("selectedModel") private var selectedModel: String = WhisperModel.none.rawValue
    @AppStorage("inputLanguage") private var inputLanguage: String = "ja"
    @AppStorage("enableTranslation") private var enableTranslation: Bool = false

    // MARK: - Published UI state

    @Published var transcribedText: String = ""
    @Published var isRecording: Bool = false
    @Published var isModelReady: Bool = false
    @Published var modelState: ModelState = .notConfigured
    @Published var exportedAudioURL: URL? = nil
    @Published var isFinalizing = false
    @Published var currentItem: Item? = nil
    @Published var vadState: VADState = .silent
    @Published var liveText: String = ""

    /// Combined download + load progress, 0.0...1.0.
    @Published var downloadProgress: Double = 0.0

    // MARK: - Private state

    private var processingItem: Item? = nil
    private var audioRecorder = AudioRecorderControl()
    private var whisperKit: WhisperKit?
    private var transcriber: AudioStreamTranscriber?
    private let player = AudioPlayer()

    /// Latest unconfirmed (draft) text from the transcriber, cached so it can
    /// be flushed into the transcript on stop or after a long silence.
    private var lastUnconfirmedDraft: String = ""

    /// End time (seconds) of the last confirmed segment already appended;
    /// segments ending at or before this are skipped as duplicates.
    private var lastConfirmedSegmentEndSeconds: Double = 0.0

    // Silence watchdog: after this many seconds without detected voice
    // activity, the unconfirmed draft is force-flushed into the transcript.
    private let silenceHoldSec: TimeInterval = 3.0
    private var lastVoiceAt: Date = .distantPast
    private var lastForceFlushAt: Date? = nil

    /// Invoked with the finished `Item` once a recording session completes.
    var onNewItem: ((Item) -> Void)?

    /// Recording can start only when a model is loaded and nothing is running.
    var canRecord: Bool {
        isModelReady && !isRecording
    }

    /// Coarse lifecycle of the model / recording session, with a
    /// human-readable (Japanese) description for the UI.
    enum ModelState: CustomStringConvertible, Equatable {
        case notConfigured, loadingModel, ready, recording, failed(Error)

        var description: String {
            switch self {
            case .notConfigured: return "モデル未設定"
            case .loadingModel: return "モデル読み込み中…"
            case .ready: return "モデル準備完了"
            case .recording: return "録音中"
            case let .failed(e): return "エラー: \(e.localizedDescription)"
            }
        }

        // `Error` is not Equatable; compare failures by their localized text.
        static func == (lhs: ModelState, rhs: ModelState) -> Bool {
            switch (lhs, rhs) {
            case (.notConfigured, .notConfigured),
                 (.loadingModel, .loadingModel),
                 (.ready, .ready),
                 (.recording, .recording):
                return true
            case let (.failed(e1), .failed(e2)):
                return e1.localizedDescription == e2.localizedDescription
            default:
                return false
            }
        }
    }

    init() {
        // Restore the previously selected model, if any, in the background.
        if selectedModel != WhisperModel.none.rawValue {
            Task { await loadModel(selectedModel) }
        }
    }

    /// Loads (and, if needed, downloads) the given model variant.
    ///
    /// - Parameters:
    ///   - model: WhisperKit model variant name (also used as the cache
    ///     folder name under Application Support).
    ///   - redownload: When `true`, ignores the local cache and fetches the
    ///     model again.
    func loadModel(_ model: String, redownload: Bool = false) async {
        modelState = .loadingModel
        do {
            let config = WhisperKitConfig(
                computeOptions: .init(audioEncoderCompute: .cpuAndNeuralEngine,
                                      textDecoderCompute: .cpuAndNeuralEngine),
                logLevel: .none,
                prewarm: false
            )

            whisperKit = try await WhisperKit(config)
            guard let whisperKit = whisperKit else {
                throw NSError(domain: "WhisperKitInit", code: 1,
                              userInfo: [NSLocalizedDescriptionKey: "WhisperKitの初期化に失敗しました"])
            }

            // Prefer the locally cached model under Application Support.
            let supportDir = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first!
            let modelDir = supportDir.appendingPathComponent("WhisperModels/\(model)", isDirectory: true)

            // NOTE: this method is @MainActor-isolated, so published
            // properties can be assigned directly without MainActor.run.
            if FileManager.default.fileExists(atPath: modelDir.path) && !redownload {
                whisperKit.modelFolder = modelDir
                downloadProgress = 1.0
            } else {
                // Download (first run, or forced). Download accounts for
                // 0–80% of the progress bar; prewarm/load fills the rest.
                let folder = try await WhisperKit.download(
                    variant: model,
                    from: "argmaxinc/whisperkit-coreml",
                    progressCallback: { [weak self] progress in
                        // The callback may fire off the main thread; hop back
                        // before touching published state.
                        Task { @MainActor in
                            self?.downloadProgress = progress.fractionCompleted * 0.8
                        }
                    }
                )

                try FileManager.default.createDirectory(
                    at: modelDir.deletingLastPathComponent(),
                    withIntermediateDirectories: true
                )
                // A forced redownload can leave a stale copy at the
                // destination; moveItem throws if it exists, so clear it.
                if FileManager.default.fileExists(atPath: modelDir.path) {
                    try FileManager.default.removeItem(at: modelDir)
                }
                try FileManager.default.moveItem(at: folder, to: modelDir)

                whisperKit.modelFolder = modelDir
            }

            // Prewarm + load only once per app session.
            if !isModelReady {
                downloadProgress = 0.85
                try await whisperKit.prewarmModels()
                downloadProgress = 0.9
                try await whisperKit.loadModels()
                downloadProgress = 1.0
            }

            guard whisperKit.tokenizer != nil else {
                throw NSError(domain: "WhisperKitInit", code: 2,
                              userInfo: [NSLocalizedDescriptionKey: "Tokenizerがnilです"])
            }

            transcriber = makeTranscriber()
            isModelReady = true
            modelState = .ready

        } catch {
            modelState = .failed(error)
            if error.localizedDescription.contains("MILCompilerForANE") {
                transcribedText = "⚠️ Neural Engine でモデルを初期化できませんでした。別のモデルを選択してください。"
            } else {
                transcribedText = "💥 モデル初期化エラー: \(error.localizedDescription)"
            }
            print("💥 WhisperKit model init error: \(error.localizedDescription)")
        }
    }

    /// Starts a new recording session, or stops the one in progress.
    func toggleRecording() async {
        if isRecording {
            // --- Stop ---
            isRecording = false
            modelState = .ready
            isFinalizing = true

            // Give the stream a moment to deliver its final state.
            try? await Task.sleep(nanoseconds: 150_000_000)
            await transcriber?.stopStreamTranscription()

            // Commit any remaining draft text; appendDedup guards against
            // appending a suffix we already committed.
            if !lastUnconfirmedDraft.isEmpty {
                appendDedup(lastUnconfirmedDraft)
                lastUnconfirmedDraft = ""
            }

            audioRecorder.stopRecording()
            processingItem?.audioFilePath = audioRecorder.lastRecordedURL?.path

            // Always discard the transcriber; a fresh one is built on start.
            transcriber = nil

        } else {
            // --- Start ---
            let newItem = Item(timestamp: Date())
            processingItem = newItem
            currentItem = Item(timestamp: newItem.timestamp)

            lastConfirmedSegmentEndSeconds = 0.0
            transcribedText = ""
            isRecording = true
            modelState = .recording
            isFinalizing = false
            lastVoiceAt = Date()
            lastForceFlushAt = nil
            lastUnconfirmedDraft = ""

            try? await AVAudioSession.sharedInstance().setActive(true)
            audioRecorder.startRecording()

            // Brief pause so the audio session settles before streaming.
            try? await Task.sleep(nanoseconds: 100_000_000)

            // The previous transcriber was discarded on stop; build anew.
            transcriber = makeTranscriber()

            do {
                try await transcriber?.startStreamTranscription()
            } catch {
                // Surface stream-start failures instead of dropping them.
                print("❌ startStreamTranscription failed: \(error.localizedDescription)")
            }
        }
    }

    /// Builds a streaming transcriber wired to the loaded WhisperKit model.
    /// Returns `nil` when the model or tokenizer is not ready yet.
    private func makeTranscriber() -> AudioStreamTranscriber? {
        guard let whisperKit = whisperKit,
              let tokenizer = whisperKit.tokenizer else {
            print("❌ WhisperKit / Tokenizer が初期化されていません")
            return nil
        }

        let decodeOptions = DecodingOptions(
            task: enableTranslation ? .translate : .transcribe,
            language: inputLanguage,
            temperature: 0.0,
            skipSpecialTokens: true,
            suppressBlank: true,
            compressionRatioThreshold: 2.4,
            logProbThreshold: -1.0,
            noSpeechThreshold: 0.6,
            concurrentWorkerCount: 3
        )

        return AudioStreamTranscriber(
            audioEncoder: whisperKit.audioEncoder,
            featureExtractor: whisperKit.featureExtractor,
            segmentSeeker: whisperKit.segmentSeeker,
            textDecoder: whisperKit.textDecoder,
            tokenizer: tokenizer,
            audioProcessor: whisperKit.audioProcessor,
            decodingOptions: decodeOptions,
            requiredSegmentsForConfirmation: 1,  // bias toward fast confirmation
            silenceThreshold: 0.15,
            compressionCheckWindow: 0,
            useVAD: true,
            stateChangeCallback: { [weak self] _, newState in
                // The callback is not guaranteed to run on the main actor, so
                // hop there before mutating this @MainActor view model
                // (previously lastUnconfirmedDraft was written directly here).
                Task { @MainActor in
                    guard let self else { return }
                    // Cache the draft so stop/silence flushes can recover it.
                    self.lastUnconfirmedDraft = newState.unconfirmedText
                        .map { $0.replacingOccurrences(of: "Waiting for speech", with: "") }
                        .joined()
                    await self.handleStateChange(newState)
                }
            }
        )
    }

    /// Processes a transcriber state update: appends newly confirmed
    /// segments, refreshes live/draft text, runs the silence watchdog, and
    /// finalizes the item once both sides report recording has stopped.
    private func handleStateChange(_ state: AudioStreamTranscriber.State) async {
        guard let segments = state.confirmedSegments as? [TranscriptionSegment] else {
            print("❌ confirmedSegments is not [TranscriptionSegment]")
            return
        }

        // Only segments ending after the last committed timestamp are new.
        let newSegments = segments.filter { Double($0.end) > self.lastConfirmedSegmentEndSeconds }

        // Append immediately (latency over batching); dedup avoids repeats.
        newSegments.forEach { segment in
            appendDedup(segment.text)
        }

        if let last = newSegments.last {
            self.lastConfirmedSegmentEndSeconds = Double(last.end)
            // Mirror confirmed text into the UI item. Do NOT force-unwrap
            // processingItem: it becomes nil once a session is finalized.
            if let confirmed = self.processingItem?.whisText {
                self.currentItem?.whisText = confirmed
            }
        }

        // Current draft (unconfirmed) text, minus WhisperKit's placeholder.
        let draft = state.unconfirmedText
            .map { $0.replacingOccurrences(of: "Waiting for speech", with: "") }
            .joined()

        self.liveText = (currentItem?.whisText ?? "") + draft

        // Lightweight VAD + silence watchdog.
        let energy = state.bufferEnergy.last ?? 0
        if energy > 0.3 {
            self.vadState = .talking
            self.lastVoiceAt = Date()
        } else {
            self.vadState = .silent
            // Force-flush the draft after sustained silence (recording only).
            if self.isRecording,
               !draft.isEmpty,
               Date().timeIntervalSince(self.lastVoiceAt) >= self.silenceHoldSec
            {
                // Debounce: skip if we already flushed within the last 0.8 s.
                if self.lastForceFlushAt == nil || Date().timeIntervalSince(self.lastForceFlushAt!) > 0.8 {
                    appendDedup(draft)
                    self.lastUnconfirmedDraft = ""
                    self.currentItem?.whisText = self.processingItem?.whisText ?? ""
                    self.liveText = self.currentItem?.whisText ?? ""
                    self.lastForceFlushAt = Date()
                }
            }
        }

        // Both our flag and the stream report stopped: finalize the session.
        if !self.isRecording && !state.isRecording {
            self.modelState = .ready
            self.isFinalizing = false

            if let item = self.processingItem {
                self.onNewItem?(item)
            }

            self.processingItem = nil
            self.currentItem = nil
        }
    }

    /// Appends `text` to the transcript unless it is already the transcript's
    /// suffix, guarding against double-committing the same draft.
    private func appendDedup(_ text: String) {
        let t = text.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !t.isEmpty else { return }

        if !transcribedText.hasSuffix(t) {
            processingItem?.whisText += t
            transcribedText += t
        }
    }
}