in HuggingChat-Mac/LocalSTT/AudioModelManager.swift [727:850]
func transcribeCurrentBuffer() async throws {
    guard let whisperKit = whisperKit else { return }

    // Retrieve the current audio buffer from the audio processor
    let currentBuffer = whisperKit.audioProcessor.audioSamples

    // Calculate the size and duration of the next buffer segment
    let nextBufferSize = currentBuffer.count - lastBufferSize
    let nextBufferSeconds = Float(nextBufferSize) / Float(WhisperKit.sampleRate)
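    // WhisperKit.sampleRate is 16 kHz (Whisper's expected input rate), so
    // dividing the count of new samples by it yields seconds of new audio.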
    // Only run the transcribe if the next buffer has more than 100ms of new audio
    guard nextBufferSeconds > 0.1 else {
        await MainActor.run {
            if currentText == "" {
                currentText = "Waiting for speech..."
            }
        }
        try await Task.sleep(nanoseconds: 100_000_000) // sleep for 100ms for next buffer
        return
    }

    if useVAD {
        let voiceDetected = AudioProcessor.isVoiceDetected(
            in: whisperKit.audioProcessor.relativeEnergy,
            nextBufferInSeconds: nextBufferSeconds,
            silenceThreshold: Float(silenceThreshold)
        )
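        // Note: relativeEnergy holds the energy of recent audio chunks,
        // normalized against the levels observed so far in the recording;
        // roughly, the check above inspects only the slice covering the new
        // audio and treats values below silenceThreshold as silence.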
        // Only run the transcribe if the next buffer has voice
        guard voiceDetected else {
            await MainActor.run {
                if currentText == "" {
                    currentText = "Waiting for speech..."
                }
            }

            // TODO: Implement silence buffer purging
            // if nextBufferSeconds > 30 {
            //     // This is a completely silent segment of 30s, so we can purge the audio and confirm anything pending
            //     lastConfirmedSegmentEndSeconds = 0
            //     whisperKit.audioProcessor.purgeAudioSamples(keepingLast: 2 * WhisperKit.sampleRate) // keep last 2s to include VAD overlap
            //     currentBuffer = whisperKit.audioProcessor.audioSamples
            //     lastBufferSize = 0
            //     confirmedSegments.append(contentsOf: unconfirmedSegments)
            //     unconfirmedSegments = []
            // }

            // Sleep for 100ms and check the next buffer
            try await Task.sleep(nanoseconds: 100_000_000)
            return
        }
    }

    // Remember how far into the buffer we've read, so the next pass measures
    // only new audio (this also anchors the next iteration's VAD window)
    lastBufferSize = currentBuffer.count

    if enableEagerDecoding && transcriptionMode == .streaming {
        // Run realtime transcribe using word timestamps for segmentation
        let transcription = try await transcribeEagerMode(Array(currentBuffer))
        await MainActor.run {
            currentText = ""
            self.tokensPerSecond = transcription?.timings.tokensPerSecond ?? 0
            self.firstTokenTime = transcription?.timings.firstTokenTime ?? 0
            self.pipelineStart = transcription?.timings.pipelineStart ?? 0
            self.currentLag = transcription?.timings.decodingLoop ?? 0
            self.currentEncodingLoops = Int(transcription?.timings.totalEncodingRuns ?? 0)
            let totalAudio = Double(currentBuffer.count) / Double(WhisperKit.sampleRate)
            self.totalInferenceTime = transcription?.timings.fullPipeline ?? 0
            self.effectiveRealTimeFactor = Double(totalInferenceTime) / totalAudio
            self.effectiveSpeedFactor = totalAudio / Double(totalInferenceTime)
        }
    } else {
        // Run realtime transcribe using timestamp tokens directly
        let transcription = try await transcribeAudioSamples(Array(currentBuffer))

        // We need to run this next part on the main thread
        await MainActor.run {
            currentText = ""
            guard let segments = transcription?.segments else {
                return
            }
            self.tokensPerSecond = transcription?.timings.tokensPerSecond ?? 0
            self.firstTokenTime = transcription?.timings.firstTokenTime ?? 0
            self.pipelineStart = transcription?.timings.pipelineStart ?? 0
            self.currentLag = transcription?.timings.decodingLoop ?? 0
            self.currentEncodingLoops += Int(transcription?.timings.totalEncodingRuns ?? 0)
            let totalAudio = Double(currentBuffer.count) / Double(WhisperKit.sampleRate)
            self.totalInferenceTime += transcription?.timings.fullPipeline ?? 0
            self.effectiveRealTimeFactor = Double(totalInferenceTime) / totalAudio
            self.effectiveSpeedFactor = totalAudio / Double(totalInferenceTime)
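            // Interpretation: effectiveRealTimeFactor < 1 means the pipeline is
            // faster than the incoming audio, i.e. transcription keeps up;
            // effectiveSpeedFactor is the reciprocal view (e.g. 4x realtime).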

            // Logic for moving segments to confirmedSegments
            if segments.count > requiredSegmentsForConfirmation {
                // Calculate the number of segments to confirm
                let numberOfSegmentsToConfirm = segments.count - requiredSegmentsForConfirmation

                // Confirm the required number of segments
                let confirmedSegmentsArray = Array(segments.prefix(numberOfSegmentsToConfirm))
                let remainingSegments = Array(segments.suffix(requiredSegmentsForConfirmation))

                // Update lastConfirmedSegmentEnd based on the last confirmed segment
                if let lastConfirmedSegment = confirmedSegmentsArray.last, lastConfirmedSegment.end > lastConfirmedSegmentEndSeconds {
                    lastConfirmedSegmentEndSeconds = lastConfirmedSegment.end
                    // print("Last confirmed segment end: \(lastConfirmedSegmentEndSeconds)")

                    // Add confirmed segments to the confirmedSegments array
                    for segment in confirmedSegmentsArray {
                        if !self.confirmedSegments.contains(segment: segment) {
                            self.confirmedSegments.append(segment)
                        }
                    }
                }

                // Update transcriptions to reflect the remaining segments
                self.unconfirmedSegments = remainingSegments
            } else {
                // Handle the case where segments are fewer than or equal to the required count
                self.unconfirmedSegments = segments
            }
        }
    }
}
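
// MARK: - Illustrative sketches (not part of the original file)

// A minimal sketch of how a caller might drive transcribeCurrentBuffer() while
// recording. `transcriptionTask`, `isRecording`, and `isTranscribing` are
// assumed properties on this manager; WhisperKit's example app uses a similar
// polling pattern.
func realtimeLoop() {
    transcriptionTask = Task {
        while isRecording && isTranscribing {
            do {
                try await transcribeCurrentBuffer()
            } catch {
                print("Realtime transcription error: \(error.localizedDescription)")
                break
            }
        }
    }
}

// The confirmedSegments.contains(segment:) call above relies on a helper
// defined elsewhere in this project. A plausible sketch over WhisperKit's
// TranscriptionSegment, deduplicating by segment timing, might look like:
extension Array where Element == TranscriptionSegment {
    func contains(segment other: TranscriptionSegment) -> Bool {
        contains { $0.start == other.start && $0.end == other.end }
    }
}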