Forced alignment for TTS audio (#418)

* add originalText as param
* save original text when added from TTS speech
* fix player in conversation sheet
* minor fix
ZuodaoTech · Mar 18, 2024 · de89ae7 · de89ae7
1 parent 94dfabf
commit de89ae7
Show file tree

Hide file tree

Showing 10 changed files with 123 additions and 61 deletions.
diff --git a/enjoy/src/main/db/handlers/audios-handler.ts b/enjoy/src/main/db/handlers/audios-handler.ts
@@ -72,6 +72,7 @@ class AudiosHandler {
     params: {
       name?: string;
       coverUrl?: string;
+      originalText?: string;
     } = {}
   ) {
     let file = uri;
@@ -95,19 +96,33 @@ class AudiosHandler {
       }
     }
 
-    return Audio.buildFromLocalFile(file, {
-      source,
-      ...params,
-    })
-      .then((audio) => {
-        return audio.toJSON();
-      })
-      .catch((err) => {
-        return event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.audio.failedToAdd", { error: err.message }),
+    try {
+      const audio = await Audio.buildFromLocalFile(file, {
+        source,
+        name: params.name,
+        coverUrl: params.coverUrl,
+      });
+
+      // create transcription if originalText is provided
+      const { originalText } = params;
+      if (originalText) {
+        await Transcription.create({
+          targetType: "Audio",
+          targetId: audio.id,
+          targetMd5: audio.md5,
+          result: {
+            originalText,
+          },
         });
+      }
+
+      return audio.toJSON();
+    } catch (err) {
+      return event.sender.send("on-notification", {
+        type: "error",
+        message: t("models.audio.failedToAdd", { error: err.message }),
       });
+    }
   }
 
   private async update(

diff --git a/enjoy/src/main/db/models/audio.ts b/enjoy/src/main/db/models/audio.ts
@@ -229,6 +229,12 @@ export class Audio extends Model<Audio> {
         targetType: "Audio",
       },
     });
+    Transcription.destroy({ // delete transcriptions that target this audio; NOTE(review): the returned promise is not awaited — confirm that is intended
+      where: {
+        targetId: audio.id,
+        targetType: "Audio",
+      },
+    });
 
     const webApi = new Client({
       baseUrl: process.env.WEB_API_URL || WEB_API_URL,

diff --git a/enjoy/src/renderer/components/audios/audio-player.tsx b/enjoy/src/renderer/components/audios/audio-player.tsx
@@ -26,6 +26,10 @@ export const AudioPlayer = (props: { id?: string; md5?: string }) => {
 
   useEffect(() => {
     setRef(ref);
+
+    return () => { // effect cleanup: reset the ref passed to setRef above back to null
+      setRef(null);
+    };
   }, [ref]);
 
   return (

diff --git a/enjoy/src/renderer/components/medias/media-current-recording.tsx b/enjoy/src/renderer/components/medias/media-current-recording.tsx
@@ -72,6 +72,7 @@ export const MediaCurrentRecording = (props: { height?: number }) => {
 
   const removeComparingPitchContour = () => {
     if (!wavesurfer) return;
+    if (!regions) return;
 
     regions
       .getRegions()

diff --git a/enjoy/src/renderer/components/medias/media-transcription.tsx b/enjoy/src/renderer/components/medias/media-transcription.tsx
@@ -72,7 +72,7 @@ export const MediaTranscription = () => {
       } as ScrollIntoViewOptions);
   }, [currentSegmentIndex, transcription, containerRef]);
 
-  if (!transcription?.result) {
+  if (!transcription?.result?.timeline) {
     return null;
   }
 

diff --git a/enjoy/src/renderer/components/messages/assistant-message.tsx b/enjoy/src/renderer/components/messages/assistant-message.tsx
@@ -104,6 +104,7 @@ export const AssistantMessageComponent = (props: {
           speech.text.length > 20
             ? speech.text.substring(0, 17).trim() + "..."
             : speech.text,
+        originalText: speech.text,
       });
       setResourcing(false);
     }
@@ -251,7 +252,7 @@ export const AssistantMessageComponent = (props: {
             </SheetClose>
           </SheetHeader>
 
-          {Boolean(speech) && <AudioPlayer md5={speech.md5} />}
+          {Boolean(speech) && shadowing && <AudioPlayer md5={speech.md5} />}
         </SheetContent>
       </Sheet>
     </div>

diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -117,7 +117,7 @@ export const MediaPlayerProvider = ({
   const initializeWavesurfer = async () => {
     if (!media) return;
     if (!mediaProvider) return;
-    if (!ref.current) return;
+    if (!ref?.current) return;
 
     const ws = WaveSurfer.create({
       container: ref.current,
@@ -299,22 +299,6 @@ export const MediaPlayerProvider = ({
     );
   };
 
-  useEffect(() => {
-    if (!media) return;
-
-    EnjoyApp.waveforms.find(media.md5).then((waveform) => {
-      setWaveForm(waveform);
-    });
-  }, [media]);
-
-  /*
-   * Initialize wavesurfer when container ref is available
-   * and mediaProvider is available
-   */
-  useEffect(() => {
-    initializeWavesurfer();
-  }, [media, ref, mediaProvider]);
-
   /*
    * When wavesurfer is decoded,
    * set up event listeners for wavesurfer
@@ -353,6 +337,7 @@ export const MediaPlayerProvider = ({
 
     return () => {
       subscriptions.forEach((unsub) => unsub());
+      wavesurfer?.destroy(); // destroy this wavesurfer instance when the effect is cleaned up, after its listeners are unsubscribed
     };
   }, [wavesurfer]);
 
@@ -372,6 +357,10 @@ export const MediaPlayerProvider = ({
     } else if (activeRegion.id.startsWith("word-region")) {
       setFitZoomRatio(containerWidth / 3 / duration / minPxPerSec);
     }
+
+    return () => {
+      setFitZoomRatio(1.0);
+    }
   }, [ref, wavesurfer, activeRegion]);
 
   /*
@@ -395,7 +384,7 @@ export const MediaPlayerProvider = ({
     if (!activeRegion) return;
 
     renderPitchContour(activeRegion);
-  }, [activeRegion]);
+  }, [wavesurfer, activeRegion]);
 
   /*
    * Update player styles
@@ -408,6 +397,22 @@ export const MediaPlayerProvider = ({
     scrollContainer.style.scrollbarWidth = "thin";
   }, [decoded, wavesurfer]);
 
+  useEffect(() => {
+    if (!media) return;
+
+    EnjoyApp.waveforms.find(media.md5).then((waveform) => {
+      setWaveForm(waveform);
+    });
+  }, [media]);
+
+  /*
+   * Initialize wavesurfer when container ref is available
+   * and mediaProvider is available
+   */
+  useEffect(() => {
+    initializeWavesurfer();
+  }, [media, ref, mediaProvider]);
+
   return (
     <MediaPlayerProviderContext.Provider
       value={{

diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -79,34 +79,43 @@ export const useTranscribe = () => {
     params?: {
       targetId?: string;
       targetType?: string;
+      originalText?: string;
     }
   ): Promise<{
     engine: string;
     model: string;
     alignmentResult: AlignmentResult;
+    originalText?: string;
   }> => {
     const blob = await transcode(mediaSrc);
+    const { targetId, targetType, originalText } = params || {};
 
     let result;
-    if (whisperConfig.service === "local") {
+    if (originalText) {
+      result = {
+        engine: "original",
+        model: "original",
+      };
+    } else if (whisperConfig.service === "local") {
       result = await transcribeByLocal(blob);
     } else if (whisperConfig.service === "cloudflare") {
       result = await transcribeByCloudflareAi(blob);
     } else if (whisperConfig.service === "openai") {
       result = await transcribeByOpenAi(blob);
     } else if (whisperConfig.service === "azure") {
-      result = await transcribeByAzureAi(blob, params);
+      result = await transcribeByAzureAi(blob, { targetId, targetType });
     } else {
       throw new Error(t("whisperServiceNotSupported"));
     }
 
     const alignmentResult = await EnjoyApp.echogarden.align(
       new Uint8Array(await blob.arrayBuffer()),
-      result.result.map((segment) => segment.text).join(" ")
+      originalText || result.result.map((segment) => segment.text).join(" ")
     );
 
     return {
       ...result,
+      originalText,
       alignmentResult,
     };
   };

diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -29,30 +29,41 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
       setTranscription(record);
     }
   };
-  const findOrCreateTranscription = async () => {
-    if (!media) return;
-    if (transcription) return;
-
-    return EnjoyApp.transcriptions
-      .findOrCreate({
-        targetId: media.id,
-        targetType: media.mediaType,
-      })
-      .then((t) => {
-        if (t.result && !t.result["transcript"]) {
-          t.result = null;
-        }
-        setTranscription(t);
-      })
-      .catch((err) => {
-        toast.error(err.message);
-      });
-  };
+  const findOrCreateTranscription =
+    async (): Promise<TranscriptionType | void> => {
+      if (!media) return;
+      if (transcription?.targetId === media.id) return; // record for this media is already loaded
+
+      return EnjoyApp.transcriptions
+        .findOrCreate({
+          targetId: media.id,
+          targetType: media.mediaType,
+        })
+        .then((t) => {
+          if (t.result && !t.result["timeline"]) { // no timeline yet: keep only the original text
+            t.result = {
+              originalText: t.result?.originalText,
+            };
+          }
+          setTranscription(t);
+          return t; // hand the record to callers that need it before state updates
+        })
+        .catch((err) => {
+          toast.error(err.message);
+        });
+    };
 
   const generateTranscription = async () => {
-    if (transcribing) return;
-    if (!transcription) {
-      await findOrCreateTranscription();
+    if (!media || transcribing) return; // nothing to transcribe, or a run is already in flight
+
+    let originalText: string;
+    if (transcription?.targetId === media.id) { // loaded record belongs to this media: reuse its text
+      originalText = transcription.result?.originalText;
+    } else {
+      const r = await findOrCreateTranscription();
+      if (r) {
+        originalText = r.result?.originalText;
+      }
     }
 
     setTranscribing(true);
@@ -61,6 +72,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
       const { engine, model, alignmentResult } = await transcribe(media.src, {
         targetId: media.id,
         targetType: media.mediaType,
+        originalText,
       });
 
       let timeline: TimelineEntry[] = [];
@@ -105,6 +117,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
         result: {
           timeline: timeline,
           transcript: alignmentResult.transcript,
+          originalText,
         },
         engine,
         model,
@@ -126,14 +139,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     });
 
     const transcript = (res?.transcriptions || []).filter((t) =>
-      ["base", "small", "medium", "large", "whisper-1"].includes(t.model)
+      ["base", "small", "medium", "large", "whisper-1", "original"].includes(
+        t.model
+      )
     )?.[0];
 
     if (!transcript) {
       return Promise.reject("Transcription not found");
     }
 
-    if (!transcript.result["transcript"]) {
+    if (!transcript.result["timeline"]) {
       return Promise.reject("Transcription not aligned");
     }
 
@@ -149,25 +164,31 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     try {
       await findTranscriptionFromWebApi();
     } catch (err) {
-      console.error(err);
+      console.warn(err);
       await generateTranscription();
     }
   };
 
+  /*
+   * find or create transcription
+   */
   useEffect(() => {
     if (!media) return;
 
     findOrCreateTranscription();
   }, [media]);
 
+  /*
+   * auto-generate transcription result
+   */
   useEffect(() => {
     if (!transcription) return;
 
     addDblistener(onTransactionUpdate);
 
     if (
       transcription.state == "pending" ||
-      !transcription.result?.["transcript"]
+      !transcription.result?.["timeline"]
     ) {
       findOrGenerateTranscription();
     }

diff --git a/enjoy/src/types/transcription.d.ts b/enjoy/src/types/transcription.d.ts
@@ -5,7 +5,7 @@ type TranscriptionType = {
   state: "pending" | "processing" | "finished";
   engine: string;
   model: string;
-  result: AlignmentResult;
+  result: AlignmentResult & { originalText?: string }; // text supplied when the media was added (e.g. TTS speech); aligned in place of a speech-recognition transcript
 };
 
 type TranscriptionResultSegmentType = {