Skip to content

Commit

Permalink
Forced alignment for TTS audio (#418)
Browse files Browse the repository at this point in the history
* add originalText as param

* save original text when added from tts speech

* fix player in conversation sheet

* minor fix
  • Loading branch information
an-lee authored Mar 18, 2024
1 parent 94dfabf commit de89ae7
Show file tree
Hide file tree
Showing 10 changed files with 123 additions and 61 deletions.
37 changes: 26 additions & 11 deletions enjoy/src/main/db/handlers/audios-handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class AudiosHandler {
params: {
name?: string;
coverUrl?: string;
originalText?: string;
} = {}
) {
let file = uri;
Expand All @@ -95,19 +96,33 @@ class AudiosHandler {
}
}

return Audio.buildFromLocalFile(file, {
source,
...params,
})
.then((audio) => {
return audio.toJSON();
})
.catch((err) => {
return event.sender.send("on-notification", {
type: "error",
message: t("models.audio.failedToAdd", { error: err.message }),
try {
const audio = await Audio.buildFromLocalFile(file, {
source,
name: params.name,
coverUrl: params.coverUrl,
});

// create transcription if originalText is provided
const { originalText } = params;
if (originalText) {
await Transcription.create({
targetType: "Audio",
targetId: audio.id,
targetMd5: audio.md5,
result: {
originalText,
},
});
}

return audio.toJSON();
} catch (err) {
return event.sender.send("on-notification", {
type: "error",
message: t("models.audio.failedToAdd", { error: err.message }),
});
}
}

private async update(
Expand Down
6 changes: 6 additions & 0 deletions enjoy/src/main/db/models/audio.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,12 @@ export class Audio extends Model<Audio> {
targetType: "Audio",
},
});
Transcription.destroy({
where: {
targetId: audio.id,
targetType: "Audio",
},
});

const webApi = new Client({
baseUrl: process.env.WEB_API_URL || WEB_API_URL,
Expand Down
4 changes: 4 additions & 0 deletions enjoy/src/renderer/components/audios/audio-player.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ export const AudioPlayer = (props: { id?: string; md5?: string }) => {

/*
 * Hand this player's ref to setRef whenever the ref changes,
 * and reset it to null in the effect cleanup.
 */
useEffect(() => {
setRef(ref);

// cleanup: clear the previously published ref
return () => {
setRef(null);
};
}, [ref]);

return (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ export const MediaCurrentRecording = (props: { height?: number }) => {

const removeComparingPitchContour = () => {
if (!wavesurfer) return;
if (!regions) return;

regions
.getRegions()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ export const MediaTranscription = () => {
} as ScrollIntoViewOptions);
}, [currentSegmentIndex, transcription, containerRef]);

if (!transcription?.result) {
if (!transcription?.result?.timeline) {
return null;
}

Expand Down
3 changes: 2 additions & 1 deletion enjoy/src/renderer/components/messages/assistant-message.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ export const AssistantMessageComponent = (props: {
speech.text.length > 20
? speech.text.substring(0, 17).trim() + "..."
: speech.text,
originalText: speech.text,
});
setResourcing(false);
}
Expand Down Expand Up @@ -251,7 +252,7 @@ export const AssistantMessageComponent = (props: {
</SheetClose>
</SheetHeader>

{Boolean(speech) && <AudioPlayer md5={speech.md5} />}
{Boolean(speech) && shadowing && <AudioPlayer md5={speech.md5} />}
</SheetContent>
</Sheet>
</div>
Expand Down
41 changes: 23 additions & 18 deletions enjoy/src/renderer/context/media-player-provider.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ export const MediaPlayerProvider = ({
const initializeWavesurfer = async () => {
if (!media) return;
if (!mediaProvider) return;
if (!ref.current) return;
if (!ref?.current) return;

const ws = WaveSurfer.create({
container: ref.current,
Expand Down Expand Up @@ -299,22 +299,6 @@ export const MediaPlayerProvider = ({
);
};

useEffect(() => {
if (!media) return;

EnjoyApp.waveforms.find(media.md5).then((waveform) => {
setWaveForm(waveform);
});
}, [media]);

/*
* Initialize wavesurfer when container ref is available
* and mediaProvider is available
*/
useEffect(() => {
initializeWavesurfer();
}, [media, ref, mediaProvider]);

/*
* When wavesurfer is decoded,
* set up event listeners for wavesurfer
Expand Down Expand Up @@ -353,6 +337,7 @@ export const MediaPlayerProvider = ({

return () => {
subscriptions.forEach((unsub) => unsub());
wavesurfer?.destroy();
};
}, [wavesurfer]);

Expand All @@ -372,6 +357,10 @@ export const MediaPlayerProvider = ({
} else if (activeRegion.id.startsWith("word-region")) {
setFitZoomRatio(containerWidth / 3 / duration / minPxPerSec);
}

return () => {
setFitZoomRatio(1.0);
}
}, [ref, wavesurfer, activeRegion]);

/*
Expand All @@ -395,7 +384,7 @@ export const MediaPlayerProvider = ({
if (!activeRegion) return;

renderPitchContour(activeRegion);
}, [activeRegion]);
}, [wavesurfer, activeRegion]);

/*
* Update player styles
Expand All @@ -408,6 +397,22 @@ export const MediaPlayerProvider = ({
scrollContainer.style.scrollbarWidth = "thin";
}, [decoded, wavesurfer]);

/*
 * Look up the waveform for the current media by its md5
 * and store it in state whenever media changes
 */
useEffect(() => {
if (!media) return;

EnjoyApp.waveforms.find(media.md5).then((waveform) => {
setWaveForm(waveform);
});
}, [media]);

/*
 * Initialize wavesurfer when container ref is available
 * and mediaProvider is available.
 * Re-runs when media, ref or mediaProvider changes.
 */
useEffect(() => {
initializeWavesurfer();
}, [media, ref, mediaProvider]);

return (
<MediaPlayerProviderContext.Provider
value={{
Expand Down
15 changes: 12 additions & 3 deletions enjoy/src/renderer/hooks/use-transcribe.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -79,34 +79,43 @@ export const useTranscribe = () => {
params?: {
targetId?: string;
targetType?: string;
originalText?: string;
}
): Promise<{
engine: string;
model: string;
alignmentResult: AlignmentResult;
originalText?: string;
}> => {
const blob = await transcode(mediaSrc);
const { targetId, targetType, originalText } = params || {};

let result;
if (whisperConfig.service === "local") {
if (originalText) {
result = {
engine: "original",
model: "original",
};
} else if (whisperConfig.service === "local") {
result = await transcribeByLocal(blob);
} else if (whisperConfig.service === "cloudflare") {
result = await transcribeByCloudflareAi(blob);
} else if (whisperConfig.service === "openai") {
result = await transcribeByOpenAi(blob);
} else if (whisperConfig.service === "azure") {
result = await transcribeByAzureAi(blob, params);
result = await transcribeByAzureAi(blob, { targetId, targetType });
} else {
throw new Error(t("whisperServiceNotSupported"));
}

const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
result.result.map((segment) => segment.text).join(" ")
originalText || result.result.map((segment) => segment.text).join(" ")
);

return {
...result,
originalText,
alignmentResult,
};
};
Expand Down
73 changes: 47 additions & 26 deletions enjoy/src/renderer/hooks/use-transcriptions.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,41 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
setTranscription(record);
}
};
const findOrCreateTranscription = async () => {
if (!media) return;
if (transcription) return;

return EnjoyApp.transcriptions
.findOrCreate({
targetId: media.id,
targetType: media.mediaType,
})
.then((t) => {
if (t.result && !t.result["transcript"]) {
t.result = null;
}
setTranscription(t);
})
.catch((err) => {
toast.error(err.message);
});
};
/*
 * Find or create the transcription record of the current media and
 * put it in state.
 *
 * Skipped when there is no media, or when the transcription in state
 * already belongs to this media. A stored result without a timeline is
 * reduced to its originalText only. Resolves to the record, or to
 * undefined when skipped or when the lookup failed (the error is
 * reported through a toast).
 */
const findOrCreateTranscription =
  async (): Promise<TranscriptionType | void> => {
    if (!media || transcription?.targetId === media.id) return;

    const query = {
      targetId: media.id,
      targetType: media.mediaType,
    };

    return EnjoyApp.transcriptions
      .findOrCreate(query)
      .then((record) => {
        const lacksTimeline = record.result && !record.result.timeline;
        if (lacksTimeline) {
          // keep only the original text of a result that has no timeline
          record.result = {
            originalText: record.result?.originalText,
          };
        }

        setTranscription(record);
        return record;
      })
      .catch((error) => {
        toast.error(error.message);
      });
  };

const generateTranscription = async () => {
if (transcribing) return;
if (!transcription) {
await findOrCreateTranscription();
if (transcription?.targetId === media.id) return;

let originalText: string;
if (transcription) {
originalText = transcription.result?.originalText;
} else {
const r = await findOrCreateTranscription();
if (r) {
originalText = r.result?.originalText;
}
}

setTranscribing(true);
Expand All @@ -61,6 +72,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
const { engine, model, alignmentResult } = await transcribe(media.src, {
targetId: media.id,
targetType: media.mediaType,
originalText,
});

let timeline: TimelineEntry[] = [];
Expand Down Expand Up @@ -105,6 +117,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
originalText,
},
engine,
model,
Expand All @@ -126,14 +139,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
});

const transcript = (res?.transcriptions || []).filter((t) =>
["base", "small", "medium", "large", "whisper-1"].includes(t.model)
["base", "small", "medium", "large", "whisper-1", "original"].includes(
t.model
)
)?.[0];

if (!transcript) {
return Promise.reject("Transcription not found");
}

if (!transcript.result["transcript"]) {
if (!transcript.result["timeline"]) {
return Promise.reject("Transcription not aligned");
}

Expand All @@ -149,25 +164,31 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
try {
await findTranscriptionFromWebApi();
} catch (err) {
console.error(err);
console.warn(err);
await generateTranscription();
}
};

/*
* find or create transcription
*/
useEffect(() => {
  // nothing to look up until a media is available
  if (media) {
    findOrCreateTranscription();
  }
}, [media]);

/*
* auto-generate transcription result
*/
useEffect(() => {
if (!transcription) return;

addDblistener(onTransactionUpdate);

if (
transcription.state == "pending" ||
!transcription.result?.["transcript"]
!transcription.result?.["timeline"]
) {
findOrGenerateTranscription();
}
Expand Down
2 changes: 1 addition & 1 deletion enjoy/src/types/transcription.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ type TranscriptionType = {
state: "pending" | "processing" | "finished";
engine: string;
model: string;
result: AlignmentResult;
// Alignment result, plus the optional original text the transcription was
// created with (e.g. the text of a TTS speech). The field is named
// `originalText` to match what the handlers and hooks read and write.
result: AlignmentResult & { originalText?: string };
};

type TranscriptionResultSegmentType = {
Expand Down

0 comments on commit de89ae7

Please sign in to comment.