in HuggingChat-Mac/LocalSTT/AudioModelManager.swift [852:973]
/// Runs one "eager mode" transcription pass over `samples` and folds the result into the
/// confirmed / hypothesis word state.
///
/// Decoding is clipped to start at `lastAgreedSeconds` and is seeded with the tokens of
/// `lastAgreedWords`. When a previous pass exists, the longest common word prefix between
/// the previous and the new hypothesis is computed. If that prefix holds at least
/// `tokenConfirmationsNeeded` words, its trailing `tokenConfirmationsNeeded` words become
/// the new agreement anchor and every word before them is appended to `confirmedWords`.
/// Otherwise the anchor is left untouched and the pass is not appended to `eagerResults`.
///
/// - Parameter samples: Audio samples to transcribe. Log messages report their duration as
///   `samples.count / 16000` seconds.
/// - Returns: `nil` when no WhisperKit instance is available or the text decoder does not
///   support word timestamps; otherwise whatever
///   `mergeTranscriptionResults(eagerResults, confirmedWords:)` produces.
/// - Note: Errors thrown by the transcription call are caught here: they are logged and
///   `finalizeText()` is invoked instead of rethrowing.
func transcribeEagerMode(_ samples: [Float]) async throws -> TranscriptionResult? {
    guard let whisperKit = whisperKit else { return nil }

    guard whisperKit.textDecoder.supportsWordTimestamps else {
        confirmedText = "Eager mode requires word timestamps, which are not supported by the current model."
        return nil
    }

    let languageCode = Constants.languages[selectedLanguage, default: Constants.defaultLanguageCode]
    let task: DecodingTask = selectedTask == "transcribe" ? .transcribe : .translate
    print(selectedLanguage)
    print(languageCode)

    let options = DecodingOptions(
        verbose: true,
        task: task,
        language: languageCode,
        temperature: Float(temperatureStart),
        temperatureFallbackCount: Int(fallbackCount),
        sampleLength: Int(sampleLength),
        usePrefillPrompt: enablePromptPrefill,
        usePrefillCache: enableCachePrefill,
        skipSpecialTokens: !enableSpecialCharacters,
        withoutTimestamps: !enableTimestamps,
        wordTimestamps: true, // required for eager mode
        firstTokenLogProbThreshold: -1.5 // higher threshold to prevent fallbacks from running too often
    )

    // Snapshot the window size once so the early-stopping check below does not read
    // manager state from inside the decoding callback.
    // NOTE(review): presumably the callback runs off the main thread (its state updates
    // are dispatched to main) — confirm against WhisperKit.
    let checkWindow = Int(compressionCheckWindow)

    // Progress callback: mirrors decoding progress into manager state and decides whether
    // decoding should stop early (`false`) or continue with default behavior (`nil`).
    let decodingCallback: ((TranscriptionProgress) -> Bool?) = { progress in
        DispatchQueue.main.async {
            let fallbacks = Int(progress.timings.totalDecodingFallbacks)
            // Shorter text together with a changed fallback count means a fallback happened.
            if progress.text.count < self.currentText.count, fallbacks != self.currentFallbacks {
                print("Fallback occured: \(fallbacks)")
            }
            self.currentText = progress.text
            self.currentFallbacks = fallbacks
            self.currentDecodingLoops += 1
        }

        // Early stopping: compression ratio over the most recent `checkWindow` tokens.
        let currentTokens = progress.tokens
        if currentTokens.count > checkWindow {
            let checkTokens: [Int] = currentTokens.suffix(checkWindow)
            let ratio = compressionRatio(of: checkTokens)
            // A missing threshold disables this check instead of crashing.
            if let threshold = options.compressionRatioThreshold, ratio > threshold {
                Logging.debug("Early stopping due to compression threshold")
                return false
            }
        }

        // Early stopping: average log probability. Skipped when either value is unavailable.
        if let avgLogprob = progress.avgLogprob,
           let threshold = options.logProbThreshold,
           avgLogprob < threshold {
            Logging.debug("Early stopping due to logprob threshold")
            return false
        }

        return nil
    }

    Logging.info("[EagerMode] \(lastAgreedSeconds)-\(Double(samples.count) / 16000.0) seconds")

    // Resume decoding from the last agreed position, prompted with the agreed words' tokens.
    var streamOptions = options
    streamOptions.clipTimestamps = [lastAgreedSeconds]
    streamOptions.prefixTokens = lastAgreedWords.flatMap { $0.tokens }

    do {
        let transcription: TranscriptionResult? = try await whisperKit.transcribe(
            audioArray: samples,
            decodeOptions: streamOptions,
            callback: decodingCallback
        ).first

        // All word-state bookkeeping and the derived display strings are updated in a
        // single main-actor hop.
        await MainActor.run {
            var skipAppend = false

            if let result = transcription {
                hypothesisWords = result.allWords.filter { $0.start >= lastAgreedSeconds }

                if let previous = prevResult {
                    prevWords = previous.allWords.filter { $0.start >= lastAgreedSeconds }
                    let commonPrefix = findLongestCommonPrefix(prevWords, hypothesisWords)
                    Logging.info("[EagerMode] Prev \"\((prevWords.map { $0.word }).joined())\"")
                    Logging.info("[EagerMode] Next \"\((hypothesisWords.map { $0.word }).joined())\"")
                    Logging.info("[EagerMode] Found common prefix \"\((commonPrefix.map { $0.word }).joined())\"")

                    let confirmationsNeeded = Int(tokenConfirmationsNeeded)
                    let agreedTail = Array(commonPrefix.suffix(confirmationsNeeded))

                    // `agreedTail` is empty when zero confirmations are required; in that
                    // case there is no anchor word, so keep the previous agreement point
                    // rather than force-unwrapping a missing first element.
                    if commonPrefix.count >= confirmationsNeeded, let firstAgreed = agreedTail.first {
                        lastAgreedWords = agreedTail
                        lastAgreedSeconds = firstAgreed.start
                        Logging.info("[EagerMode] Found new last agreed word \"\(firstAgreed.word)\" at \(lastAgreedSeconds) seconds")

                        // Everything ahead of the agreed tail is now confirmed.
                        confirmedWords.append(contentsOf: commonPrefix.prefix(commonPrefix.count - confirmationsNeeded))
                        let currentWords = confirmedWords.map { $0.word }.joined()
                        Logging.info("[EagerMode] Current: \(lastAgreedSeconds) -> \(Double(samples.count) / 16000.0) \(currentWords)")
                    } else {
                        Logging.info("[EagerMode] Using same last agreed time \(lastAgreedSeconds)")
                        skipAppend = true
                    }
                }

                prevResult = result
            }

            if !skipAppend {
                eagerResults.append(transcription)
            }

            confirmedText = confirmedWords.map { $0.word }.joined()

            // Accept the final hypothesis because it is the last of the available audio
            let lastHypothesis = lastAgreedWords + findLongestDifferentSuffix(prevWords, hypothesisWords)
            hypothesisText = lastHypothesis.map { $0.word }.joined()
        }
    } catch {
        Logging.error("[EagerMode] Error: \(error)")
        finalizeText()
    }

    let mergedResult = mergeTranscriptionResults(eagerResults, confirmedWords: confirmedWords)
    return mergedResult
}