From a1d2229416dc6c073c19c9c927fb9c0bcda27276 Mon Sep 17 00:00:00 2001 From: Antoine Dufour Date: Sat, 18 Feb 2023 11:12:23 -0500 Subject: [PATCH] Improvement to transcription starting point with VAD --- whisperx/transcribe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index c272f18e..3e06dc1c 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -262,7 +262,6 @@ def merge_chunks(segments, chunk_size=CHUNK_LENGTH): TODO: Make sure VAD segment isn't too long, otherwise it will cause OOM when input to alignment model TODO: Or sliding window alignment model over long segment. """ - curr_start = 0 curr_end = 0 merged_segments = [] seg_idxs = [] @@ -275,7 +274,11 @@ def merge_chunks(segments, chunk_size=CHUNK_LENGTH): for speech_turn in segments.get_timeline(): segments_list.append(Segment(speech_turn.start, speech_turn.end, "UNKNOWN")) - for sdx, seg in enumerate(segments_list): + assert segments_list, "segments_list is empty." + # Make sure the starting point is the start of the segment. + curr_start = segments_list[0].start + + for seg in segments_list: if seg.end - curr_start > chunk_size and curr_end-curr_start > 0: merged_segments.append({ "start": curr_start,