Skip to content

Commit

Permalink
Finish the dictate tool. Integrated with the listening method
Browse files Browse the repository at this point in the history
  • Loading branch information
yorevs committed Dec 18, 2024
1 parent 7e15c22 commit 577205a
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 65 deletions.
2 changes: 0 additions & 2 deletions src/demo/components/recorder_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from clitt.core.term.cursor import cursor
from clitt.core.tui.line_input.line_input import line_input

from askai.core.askai_messages import msg
from askai.core.component.recorder import recorder
from utils import init_context

Expand All @@ -27,7 +26,6 @@
cursor.write()
match opt:
case "1":
cursor.write(msg.listening())
audio_path, stt_text = recorder.listen()
cursor.writeln(f"Audio path: {audio_path}")
cursor.writeln(f"Transcribed text: {stt_text}")
Expand Down
3 changes: 1 addition & 2 deletions src/main/askai/core/askai_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,7 @@ def _cb_mic_listening_event(self, ev: Event) -> None:
"""Callback to handle microphone listening events.
:param ev: The event object representing the microphone listening event.
"""
if ev.args.listening:
self._reply(AIReply.info(msg.listening()))
pass  # The CLI doesn't really care about this event

def _cb_device_changed_event(self, ev: Event) -> None:
"""Callback to handle audio input device change events.
Expand Down
4 changes: 2 additions & 2 deletions src/main/askai/core/askai_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ def welcome_back(self) -> str:
return "How may I further assist you ?"

def listening(self) -> str:
    """Return the status message shown while the microphone is actively listening.
    :return: The listening status message.
    """
    # The previous version had two consecutive returns (the second unreachable);
    # keep the intended post-change message only.
    return "Listening…"

def dictating(self) -> str:
    """Return the status message shown while dictation is in progress.
    :return: The dictating status message.
    """
    status_message: str = "Dictating…"
    return status_message

def transcribing(self) -> str:
    """Return the status message shown while recorded speech is being transcribed.
    :return: The transcribing status message.
    """
    # The previous version had two consecutive returns (the second unreachable);
    # keep the intended post-change message only.
    return "Processing your voice…"

def goodbye(self) -> str:
    """Return the farewell message displayed when the session ends.
    :return: The goodbye message.
    """
    farewell: str = "Goodbye, have a nice day !"
    return farewell
Expand Down
106 changes: 49 additions & 57 deletions src/main/askai/core/component/recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ def __device_watcher() -> None:
break
recorder.devices = new_list

@staticmethod
def countdown(sec: int, ev: threading.Event, counter_msg: str | None) -> None:
    """Emit a decreasing on-screen counter until it reaches zero or the event is set.
    :param sec: The number of seconds to count down from.
    :param ev: Event used to abort the countdown early.
    :param counter_msg: Optional message prefix shown next to the counter; when None, nothing is emitted.
    """
    remaining: int = sec
    if counter_msg:
        # Show the initial counter value immediately, before the first tick.
        events.reply.emit(reply=AIReply.mute(f"{counter_msg} {remaining}"))
    while not ev.is_set():
        remaining -= 1
        if remaining < 0:
            break
        # Slightly under one second per tick, matching the original cadence.
        pause.milliseconds(990)
        if counter_msg:
            events.reply.emit(reply=AIReply.mute(f"{counter_msg} {remaining}"), erase_last=True)

def __init__(self):
self._rec: Recognizer = Recognizer()
self._devices: list[InputDevice] = []
Expand Down Expand Up @@ -167,40 +177,59 @@ def listen(
self,
recognition_api: RecognitionApi = RecognitionApi.GOOGLE,
language: Language = Language.EN_US,
audio_path: Path | None = None,
raise_on_timeout: bool = True,
raise_on_intelligible: bool = True,
counter_msg: str = msg.listening(),
) -> tuple[Path, Optional[str]]:
"""Listen to the microphone, save the recorded audio as a WAV file, and transcribe the speech.
:param recognition_api: The API to use for recognizing the speech. Defaults to GOOGLE.
:param language: The spoken language. Defaults to EN_US.
:param audio_path: The audio file destination, generated by the listening process.
:param raise_on_timeout: If True, raises an error when no speech is detected within the timeout period.
:param raise_on_intelligible: If True, raises an error when the speech is unintelligible.
:param counter_msg: The message to display during the listening countdown.
:return: A tuple containing the path to the saved audio file and the transcribed text.
If transcription fails, the second element of the tuple will be None.
"""
stop_event: threading.Event = threading.Event()
timeout: float = seconds(configs.recorder_silence_timeout_millis)
limit: float = seconds(configs.recorder_phrase_limit_millis)

with Microphone(device_index=self._device_index) as mic_source:
try:
audio_path = Path(f"{REC_DIR}/askai-stt-{now_ms()}.wav")
stop_event.clear()
counter_thread = threading.Thread(target=self.countdown, args=(limit, stop_event, counter_msg))
audio_path = audio_path or Path(f"{REC_DIR}/askai-stt-{now_ms()}.wav")
self._detect_noise()
counter_thread.start()
events.listening.emit()
audio: AudioData = self._rec.listen(
mic_source,
phrase_time_limit=seconds(configs.recorder_phrase_limit_millis),
timeout=seconds(configs.recorder_silence_timeout_millis),
)
audio: AudioData = self._rec.listen(mic_source, phrase_time_limit=limit, timeout=timeout)
events.listening.emit(listening=False)
stop_event.set()
counter_thread.join()
events.reply.emit(reply=AIReply.mute(f"{counter_msg}  "), erase_last=True)
stt_text = self._write_audio_file(audio, audio_path, language, recognition_api)
except WaitTimeoutError as err:
err_msg: str = msg.timeout(f"waiting for a speech input => '{err}'")
log.warning("Timed out while waiting for a speech input!")
events.reply.emit(reply=AIReply.error(message=err_msg), erase_last=True)
if raise_on_timeout:
err_msg: str = msg.timeout(f"waiting for a speech input => '{err}'")
events.reply.emit(reply=AIReply.error(message=err_msg), erase_last=True)
stt_text = None
except UnknownValueError as err:
err_msg: str = msg.intelligible(err)
log.warning("Speech was not intelligible!")
events.reply.emit(reply=AIReply.error(message=err_msg), erase_last=True)
if raise_on_intelligible:
err_msg: str = msg.intelligible(err)
events.reply.emit(reply=AIReply.error(message=err_msg), erase_last=True)
stt_text = None
except AttributeError as err:
raise InvalidInputDevice(str(err)) from err
except RequestError as err:
raise InvalidRecognitionApiError(str(err)) from err
finally:
events.listening.emit(listening=False)
if not stop_event.is_set():
stop_event.set()
counter_thread.join()

return audio_path, stt_text

Expand All @@ -209,80 +238,43 @@ def dictate(
recognition_api: RecognitionApi = RecognitionApi.GOOGLE,
language: Language = Language.EN_US,
) -> Optional[str]:
"""Listen to the microphone, save the recorded audio as a WAV file, and transcribe the speech.
"""Captures dictated speech, processes it into text, and returns the transcribed text.
:param recognition_api: The API to use for recognizing the speech. Defaults to GOOGLE.
:param language: The spoken language. Defaults to EN_US.
:param language: The spoken language. Defaults to en_US.
:return: A string containing the dictated text. If transcription fails, None will be returned.
"""
phrase: str = ""
dictated_text: str = ""
limit: float = 10.0
noise_limit: float = 0.2
stop_event: threading.Event = threading.Event()
audio_path: Path = Path(f"{REC_DIR}/askai-dictate-{now_ms()}.wav")

def _countdown_(sec: int):
i = sec
sysout(msg.listening() + " ", end="")
sysout(f"{i}", end="")
while not stop_event.is_set() and (i := (i - 1)) >= 0:
pause.seconds(1)
sysout(f"%CUB({len(str(i + 1))})%{i}%EL0%", end="")
if not stop_event.is_set():
sysout(f"%CUB({len(str(i + 1))})%%EL0%", end="")

events.listening.emit()
while True:
with Microphone(device_index=self._device_index) as mic_source:
try:
stop_event.clear()
new_thread = threading.Thread(target=_countdown_, args=(limit,))
sysout(("…" if dictated_text else "") + phrase)
self._rec.adjust_for_ambient_noise(mic_source, duration=noise_limit)
new_thread.start()
audio: AudioData = self._rec.listen(mic_source, phrase_time_limit=limit)
sysout(f"%CUB({len(str(limit))})%%EL0%  ")
phrase = self._write_audio_file(audio, audio_path, language, recognition_api, True)
except (WaitTimeoutError, UnknownValueError):
phrase = ""
except AttributeError as err:
raise InvalidInputDevice(str(err)) from err
except RequestError as err:
raise InvalidRecognitionApiError(str(err)) from err
finally:
stop_event.set()
if not phrase or phrase in ["quit", "exit"]:
break
else:
dictated_text += (". " if dictated_text else "") + phrase.capitalize()

events.listening.emit(listening=False)
_, phrase = self.listen(recognition_api, language, audio_path, False, False, msg.dictating())
if not phrase or phrase in [msg.t("quit"), msg.t("exit"), msg.t("bye")]:
break
else:
sysout(f" {('…' if dictated_text else '') + phrase}  ")
dictated_text += (". " if dictated_text else "") + phrase.capitalize()

return ensure_endswith(dictated_text, "." + os.linesep)
return ensure_endswith(dictated_text, "." + os.linesep) if dictated_text else dictated_text

def _write_audio_file(
    self,
    audio: AudioData,
    audio_path: AnyPath,
    language: Language,
    recognition_api: RecognitionApi,
    mute: bool = False,
) -> Optional[str]:
    """Write the provided audio data to disk as a WAV file and transcribe the contents into text using the
    specified recognition API.
    :param audio: The audio data to be saved.
    :param audio_path: The path where the audio file will be saved.
    :param language: The language of the spoken content in the audio.
    :param recognition_api: The API used for speech recognition.
    :param mute: Whether or not to suppress the 'transcribing' reply; Defaults to False.
    :return: The transcribed text from the audio file. If transcription fails, returns None.
    """
    with open(str(audio_path), "wb") as f_rec:
        f_rec.write(audio.get_wav_data())
    log.debug("Voice recorded and saved as %s", audio_path)
    # Resolve the recognizer callable named by the API enum value on the Recognizer instance
    # (presumably a 'recognize_*' method — confirm against RecognitionApi definitions).
    if api := getattr(self._rec, recognition_api.value):
        if not mute:
            # Transient status reply; erase_last replaces the previous status line.
            events.reply.emit(reply=AIReply.debug(message=msg.transcribing()), erase_last=True)
        log.debug("Recognizing voice using %s", recognition_api)
        assert isinstance(api, Callable)
        return api(audio, language=language.language)
    # No matching recognizer attribute: falls through and returns None implicitly.
Expand Down
2 changes: 0 additions & 2 deletions src/main/askai/tui/askai_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,8 +390,6 @@ def _cb_mic_listening_event(self, ev: Event) -> None:
:param ev: The event object representing the microphone listening event.
"""
self.header.notifications.listening = ev.args.listening
if ev.args.listening:
self._reply(AIReply.info(msg.listening()))

def _cb_device_changed_event(self, ev: Event) -> None:
"""Callback to handle audio input device change events.
Expand Down

0 comments on commit 577205a

Please sign in to comment.