@@ -12,6 +12,7 @@ public typealias DeviceID = AudioDeviceID
 #else
 public typealias DeviceID = String
 #endif
+public typealias ChannelMode = AudioInputConfig.ChannelMode

 public struct AudioDevice: Identifiable, Hashable {
     public let id: DeviceID
@@ -23,19 +24,43 @@ public struct AudioDevice: Identifiable, Hashable {
     }
 }

+/// Configuration for audio input, including device selection and channel processing options.
+public struct AudioInputConfig {
+    /// Specifies how to handle audio channels when processing multi-channel audio.
+    public enum ChannelMode: Hashable, Codable {
+        /// Selects a single specific channel by index.
+        /// - Parameter index: The zero-based index of the channel to use;
+        ///   0 selects the first channel, 1 the second, and so on.
+        case specificChannel(Int)
+
+        /// Mixes channels into mono with peak normalization; mixes all channels when the parameter is `nil`.
+        /// - Parameter channels: Array of zero-based channel indices to mix.
+        ///   For example, `[0, 2]` mixes just the first and third channels.
+        ///   The resulting mono audio maintains the same peak level as the
+        ///   loudest original channel to prevent clipping.
+        case sumChannels([Int]?)
+    }
+
+    /// Specifies how to process channels from multi-channel audio sources.
+    /// Defaults to summing all channels if not explicitly set.
+    public var channelMode: ChannelMode = .sumChannels(nil)
+}
+
 public protocol AudioProcessing {
     /// Loads audio data from a specified file path.
     /// - Parameters:
     ///   - audioFilePath: The file path of the audio file.
+    ///   - channelMode: The channel processing mode to apply when loading the audio.
     ///   - startTime: Optional start time in seconds to read from
     ///   - endTime: Optional end time in seconds to read until
     /// - Returns: `AVAudioPCMBuffer` containing the audio data.
-    static func loadAudio(fromPath audioFilePath: String, startTime: Double?, endTime: Double?, maxReadFrameSize: AVAudioFrameCount?) throws -> AVAudioPCMBuffer
+    static func loadAudio(fromPath audioFilePath: String, channelMode: ChannelMode, startTime: Double?, endTime: Double?, maxReadFrameSize: AVAudioFrameCount?) throws -> AVAudioPCMBuffer

     /// Loads and converts audio data from the specified file paths.
     /// - Parameter audioPaths: The file paths of the audio files.
+    /// - Parameter channelMode: The channel processing mode to apply when loading the audio.
     /// - Returns: Array of `.success` if the file was loaded and converted correctly, otherwise `.failure`
-    static func loadAudio(at audioPaths: [String]) async -> [Result<[Float], Swift.Error>]
+    static func loadAudio(at audioPaths: [String], channelMode: ChannelMode) async -> [Result<[Float], Swift.Error>]

     /// Pad or trim the audio data to the desired length.
     /// - Parameters:
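
Since every call site in the implementation below defaults the new parameter to .sumChannels(nil), existing callers keep compiling unchanged. A minimal usage sketch of the three channel-handling options (the file path is hypothetical):

// Sketch: channel handling options on the path-based loader; the path is illustrative.
let stereoPath = "/tmp/podcast-stereo.wav"

// Default: mix all channels to mono with peak normalization.
let mixed = try AudioProcessor.loadAudio(fromPath: stereoPath)

// Use only the second channel (zero-based index 1).
let right = try AudioProcessor.loadAudio(fromPath: stereoPath, channelMode: .specificChannel(1))

// Mix just the first and third channels of a multi-channel file.
let subset = try AudioProcessor.loadAudio(fromPath: stereoPath, channelMode: .sumChannels([0, 2]))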
@@ -189,21 +214,22 @@ public class AudioProcessor: NSObject, AudioProcessing {

     public static func loadAudio(
         fromPath audioFilePath: String,
+        channelMode: ChannelMode = .sumChannels(nil),
         startTime: Double? = 0,
         endTime: Double? = nil,
         maxReadFrameSize: AVAudioFrameCount? = nil
     ) throws -> AVAudioPCMBuffer {
         guard FileManager.default.fileExists(atPath: audioFilePath) else {
             throw WhisperError.loadAudioFailed("Resource path does not exist \(audioFilePath)")
         }
-
         let audioFileURL = URL(fileURLWithPath: audioFilePath)
         let audioFile = try AVAudioFile(forReading: audioFileURL, commonFormat: .pcmFormatFloat32, interleaved: false)
-        return try loadAudio(fromFile: audioFile, startTime: startTime, endTime: endTime, maxReadFrameSize: maxReadFrameSize)
+        return try loadAudio(fromFile: audioFile, channelMode: channelMode, startTime: startTime, endTime: endTime, maxReadFrameSize: maxReadFrameSize)
     }

     public static func loadAudio(
         fromFile audioFile: AVAudioFile,
+        channelMode: ChannelMode = .sumChannels(nil),
         startTime: Double? = 0,
         endTime: Double? = nil,
         maxReadFrameSize: AVAudioFrameCount? = nil
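
The AVAudioFile-based overload accepts the same defaulted parameter, which is useful when the file is already open; a brief sketch under the same assumptions (the URL is hypothetical):

// Sketch: pass a channel mode to the fromFile overload, reading only the first 30 seconds.
let url = URL(fileURLWithPath: "/tmp/meeting.m4a")
let file = try AVAudioFile(forReading: url, commonFormat: .pcmFormatFloat32, interleaved: false)
let intro = try AudioProcessor.loadAudio(fromFile: file, channelMode: .specificChannel(0), startTime: 0, endTime: 30)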
@@ -241,8 +267,15 @@ public class AudioProcessor: NSObject, AudioProcessing {
             outputBuffer = buffer
         } else {
             // Audio needs resampling to 16kHz
-            let maxReadFrameSize = maxReadFrameSize ?? Constants.defaultAudioReadFrameSize
-            outputBuffer = resampleAudio(fromFile: audioFile, toSampleRate: 16000, channelCount: 1, frameCount: frameCount, maxReadFrameSize: maxReadFrameSize)
+            let maxReadSize = maxReadFrameSize ?? Constants.defaultAudioReadFrameSize
+            outputBuffer = resampleAudio(
+                fromFile: audioFile,
+                toSampleRate: 16000,
+                channelCount: 1,
+                channelMode: channelMode,
+                frameCount: frameCount,
+                maxReadFrameSize: maxReadSize
+            )
         }

         if let outputBuffer = outputBuffer {
@@ -259,13 +292,13 @@ public class AudioProcessor: NSObject, AudioProcessing {

     public static func loadAudioAsFloatArray(
         fromPath audioFilePath: String,
+        channelMode: ChannelMode = .sumChannels(nil),
         startTime: Double? = 0,
         endTime: Double? = nil
     ) throws -> [Float] {
         guard FileManager.default.fileExists(atPath: audioFilePath) else {
             throw WhisperError.loadAudioFailed("Resource path does not exist \(audioFilePath)")
         }
-
         let audioFileURL = URL(fileURLWithPath: audioFilePath)
         let audioFile = try AVAudioFile(forReading: audioFileURL, commonFormat: .pcmFormatFloat32, interleaved: false)
         let inputSampleRate = audioFile.fileFormat.sampleRate
@@ -287,6 +320,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
             try autoreleasepool {
                 let buffer = try loadAudio(
                     fromFile: audioFile,
+                    channelMode: channelMode,
                     startTime: currentTime,
                     endTime: chunkEnd
                 )
@@ -301,12 +335,12 @@ public class AudioProcessor: NSObject, AudioProcessing {
         return result
     }

-    public static func loadAudio(at audioPaths: [String]) async -> [Result<[Float], Swift.Error>] {
+    public static func loadAudio(at audioPaths: [String], channelMode: ChannelMode = .sumChannels(nil)) async -> [Result<[Float], Swift.Error>] {
         await withTaskGroup(of: [(index: Int, result: Result<[Float], Swift.Error>)].self) { taskGroup -> [Result<[Float], Swift.Error>] in
             for (index, audioPath) in audioPaths.enumerated() {
                 taskGroup.addTask {
                     do {
-                        let audio = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath)
+                        let audio = try AudioProcessor.loadAudioAsFloatArray(fromPath: audioPath, channelMode: channelMode)
                         return [(index: index, result: .success(audio))]
                     } catch {
                         return [(index: index, result: .failure(error))]
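
The batch loader threads one mode through every file. A sketch, assuming results come back in input order as the index bookkeeping suggests (the paths are hypothetical):

// Sketch: load several files concurrently with one channel mode.
let paths = ["/tmp/a.wav", "/tmp/b.wav"]
let results = await AudioProcessor.loadAudio(at: paths, channelMode: .sumChannels(nil))
for (path, result) in zip(paths, results) {
    switch result {
    case let .success(samples):
        print("\(path): \(samples.count) samples")
    case let .failure(error):
        print("\(path): \(error)")
    }
}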
@@ -334,6 +368,7 @@ public class AudioProcessor: NSObject, AudioProcessing {
         fromFile audioFile: AVAudioFile,
         toSampleRate sampleRate: Double,
         channelCount: AVAudioChannelCount,
+        channelMode: ChannelMode = .sumChannels(nil),
         frameCount: AVAudioFrameCount? = nil,
         maxReadFrameSize: AVAudioFrameCount = Constants.defaultAudioReadFrameSize
     ) -> AVAudioPCMBuffer? {
@@ -370,7 +405,15 @@ public class AudioProcessor: NSObject, AudioProcessing {

             do {
                 try audioFile.read(into: inputBuffer, frameCount: framesToRead)
-                guard let resampledChunk = resampleAudio(fromBuffer: inputBuffer,
+
+                // Convert to mono if needed
+                guard let monoChunk = convertToMono(inputBuffer, mode: channelMode) else {
+                    Logging.error("Failed to process audio channels")
+                    return nil
+                }
+
+                // Resample mono audio
+                guard let resampledChunk = resampleAudio(fromBuffer: monoChunk,
                                                          toSampleRate: outputFormat.sampleRate,
                                                          channelCount: outputFormat.channelCount)
                 else {
@@ -461,6 +504,112 @@ public class AudioProcessor: NSObject, AudioProcessing {
         return convertedBuffer
     }

+    /// Converts multi-channel audio to mono based on the specified mode.
+    /// - Parameters:
+    ///   - buffer: The input audio buffer with multiple channels
+    ///   - mode: The channel processing mode
+    /// - Returns: A mono-channel audio buffer
+    public static func convertToMono(_ buffer: AVAudioPCMBuffer, mode: ChannelMode) -> AVAudioPCMBuffer? {
+        let channelCount = Int(buffer.format.channelCount)
+        let frameLength = Int(buffer.frameLength)
+
+        if channelCount <= 1 {
+            // Early return; the audio is already mono
+            return buffer
+        }
+
+        guard let channelData = buffer.floatChannelData else {
+            Logging.error("Buffer did not contain floatChannelData.")
+            return nil
+        }
+
+        // Create a new single-channel buffer
+        guard let monoFormat = AVAudioFormat(
+            commonFormat: .pcmFormatFloat32,
+            sampleRate: buffer.format.sampleRate,
+            channels: 1,
+            interleaved: false
+        ) else {
+            Logging.error("Failed to create AVAudioFormat object.")
+            return nil
+        }
+
+        guard let monoBuffer = AVAudioPCMBuffer(
+            pcmFormat: monoFormat,
+            frameCapacity: buffer.frameCapacity
+        ) else {
+            Logging.error("Failed to create mono buffer.")
+            return nil
+        }
+
+        monoBuffer.frameLength = buffer.frameLength
+
+        // Make sure the mono buffer has channel data
+        guard let monoChannelData = monoBuffer.floatChannelData else { return nil }
+
+        // Clear the buffer so the sum starts from zeros
+        vDSP_vclr(monoChannelData[0], 1, vDSP_Length(frameLength))
+
+        switch mode {
+        case let .specificChannel(channelIndex):
+            // Copy the specified channel, defaulting to the first channel if out of range
+            let safeIndex = (channelIndex >= 0 && channelIndex < channelCount) ? channelIndex : 0
+            memcpy(monoChannelData[0], channelData[safeIndex], frameLength * MemoryLayout<Float>.size)
+
+        case let .sumChannels(channelIndices):
+            // Determine which channels to sum
+            let indicesToSum: [Int]
+
+            if let indices = channelIndices, !indices.isEmpty {
+                // Sum specific channels (filter out invalid indices)
+                indicesToSum = indices.filter { $0 >= 0 && $0 < channelCount }
+
+                // Handle the case where all specified indices are invalid
+                if indicesToSum.isEmpty {
+                    memcpy(monoChannelData[0], channelData[0], frameLength * MemoryLayout<Float>.size)
+                    Logging.debug("No valid channel indices provided, defaulting to first channel")
+                    return monoBuffer
+                }
+            } else {
+                // Sum all channels (nil or empty array provided)
+                indicesToSum = Array(0..<channelCount)
+            }
+
+            // First, find the maximum peak across the selected input channels
+            var maxOriginalPeak: Float = 0.0
+            for channelIndex in indicesToSum {
+                var channelPeak: Float = 0.0
+                vDSP_maxmgv(channelData[channelIndex], 1, &channelPeak, vDSP_Length(frameLength))
+                maxOriginalPeak = max(maxOriginalPeak, channelPeak)
+            }
+
+            // Sum the specified channels
+            for channelIndex in indicesToSum {
+                vDSP_vadd(
+                    monoChannelData[0], 1,
+                    channelData[channelIndex], 1,
+                    monoChannelData[0], 1,
+                    vDSP_Length(frameLength)
+                )
+            }
+
+            // Find the peak in the mono mix
+            var monoPeak: Float = 0.0
+            vDSP_maxmgv(monoChannelData[0], 1, &monoPeak, vDSP_Length(frameLength))
+
+            // Scale by the peak ratio (avoiding division by zero)
+            var scale = maxOriginalPeak / max(monoPeak, 0.0001)
+            vDSP_vsmul(
+                monoChannelData[0], 1,
+                &scale,
+                monoChannelData[0], 1,
+                vDSP_Length(frameLength)
+            )
+        }
+
+        return monoBuffer
+    }
+
     // MARK: - Utility

     /// Detect voice activity in the given buffer of relative energy values.
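
To see the peak-normalized summing concretely, here is a hedged sketch that mixes a synthetic stereo buffer down to mono; the force unwraps are for brevity only:

// Sketch: sum a two-channel buffer and confirm the mix keeps the loudest channel's peak.
import AVFoundation
import Accelerate

let format = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 16000, channels: 2, interleaved: false)!
let stereo = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: 4)!
stereo.frameLength = 4
stereo.floatChannelData![0].update(from: [0.8, -0.2, 0.1, 0.0], count: 4) // left peaks at 0.8
stereo.floatChannelData![1].update(from: [0.4, 0.4, -0.1, 0.0], count: 4) // right peaks at 0.4

if let mono = AudioProcessor.convertToMono(stereo, mode: .sumChannels(nil)) {
    var peak: Float = 0
    vDSP_maxmgv(mono.floatChannelData![0], 1, &peak, vDSP_Length(mono.frameLength))
    print(peak) // ≈0.8: the raw sum peaks at 1.2 and is rescaled to the loudest input channel
}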
@@ -584,7 +733,6 @@ public class AudioProcessor: NSObject, AudioProcessing {

         let frameLength = Int(buffer.frameLength)
         let startPointer = channelData[0]
-
         var result: [Float] = []
         result.reserveCapacity(frameLength) // Reserve the capacity to avoid multiple allocations