From 440c31ebfa02cf02fa560f37899ccaed7ed94458 Mon Sep 17 00:00:00 2001
From: Adam Nelson <anelson@users.noreply.github.com>
Date: Wed, 27 Nov 2024 22:09:22 +0100
Subject: [PATCH] Fix for whisper-microphone example failure if audio isn't
 chunk aligned

At least on my macOS Sequoia system (MBP 14" 2021, M1 Pro), when I run
the `whisper-microphone` example after it has gathered 10 seconds of
audio, it fails before the transcription:

```
Error: Insufficient buffer size 384 for input channel 0, expected 1024
```

At least for the audio device I'm using (Airpods Pro Max), there is no
guarantee that each audio buffer is a multiple of 1024 samples.  Thus at
the end of the 10 seconds, `buffered_pcm` can have some samples at the
end that do not form a complete 1024 sample chunk.

This fixes that by tracking when there is a partial chunk at the end of
the buffer, and leaving it in `buffered_pcm` to be processed on the next
loop iteration.

Note that, in the interest of keeping this PR as small as possible, I
didn't make any other changes to this example.
---
 .../examples/whisper-microphone/main.rs       | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/candle-examples/examples/whisper-microphone/main.rs b/candle-examples/examples/whisper-microphone/main.rs
index 5165da1c1e..373c40e2bb 100644
--- a/candle-examples/examples/whisper-microphone/main.rs
+++ b/candle-examples/examples/whisper-microphone/main.rs
@@ -624,13 +624,27 @@ pub fn main() -> Result<()> {
             continue;
         }
         let mut resampled_pcm = vec![];
-        for buffered_pcm in buffered_pcm.chunks(1024) {
+        // resample the audio, one chunk of 1024 samples at a time.
+        // in case the audio input failed to produce an exact multiple of 1024 samples,
+        // process the remainder on the next iteration of the loop.
+        let full_chunks = buffered_pcm.len() / 1024;
+        let remainder = buffered_pcm.len() % 1024;
+        for chunk in 0..full_chunks {
+            let buffered_pcm = &buffered_pcm[chunk * 1024..(chunk + 1) * 1024];
             let pcm = resampler.process(&[&buffered_pcm], None)?;
-            resampled_pcm.extend_from_slice(&pcm[0])
+            resampled_pcm.extend_from_slice(&pcm[0]);
         }
         let pcm = resampled_pcm;
         println!("{} {}", buffered_pcm.len(), pcm.len());
-        buffered_pcm.clear();
+        if remainder == 0 {
+            buffered_pcm.clear();
+        } else {
+            // efficiently copy the remainder to the beginning of the `buffered_pcm` buffer and
+            // truncate it.  That's more efficient then allocating a new vector and copying into it
+            println!("audio device produced partial chunk with {remainder} samples; processing the remainder on the next iteration of the loop");
+            buffered_pcm.copy_within(full_chunks * 1024.., 0);
+            buffered_pcm.truncate(remainder);
+        }
         let mel = audio::pcm_to_mel(&config, &pcm, &mel_filters);
         let mel_len = mel.len();
         let mel = Tensor::from_vec(