Addressing Whisper STT issues (#5929)

2024-10-01 01:26:03 -04:00 · 2024-06-29 06:32:54 +02:00 · 2024-06-29 06:32:54 +02:00 · cc825dd1f4
commit cc825dd1f4
parent 5c6b9c610d
2 changed files with 44 additions and 1 deletions
--- a/extensions/whisper_stt/script.js
+++ b/extensions/whisper_stt/script.js
@ -0,0 +1,25 @@
 // Adds a compact "Rec." button next to the element with id "Generate" that
 // forwards clicks to an existing "record-button" element, and moves the
 // matching "stop-button" element into the same spot while recording.
 // NOTE(review): presumably the "record-button"/"stop-button" elements belong to
 // the Gradio microphone widget - confirm against the rendered page.

 // Deep-clone the first "record-button" element found in the document.
 var recButton = document.getElementsByClassName("record-button")[0].cloneNode(true);
 var generate_button = document.getElementById("Generate");
 // Place the clone immediately after the Generate element.
 generate_button.insertAdjacentElement("afterend", recButton);
 recButton.style.setProperty("margin-left", "-10px");
 recButton.innerText = "Rec.";
 recButton.addEventListener("click", function() {
  // The clone keeps the "record-button" class, so two elements now match.
  // NOTE(review): index 1 is assumed to be the original button, i.e. the
  // original is expected to come after the clone in document order - confirm.
  var originalRecordButton = document.getElementsByClassName("record-button")[1];
  originalRecordButton.click();
  // getElementsByClassName returns a live collection; the lookups below run
  // right after the forwarded click.
  var stopRecordButtons = document.getElementsByClassName("stop-button");
  // When more than one stop button exists, drop the first one from the
  // Generate element's parent.
  // NOTE(review): presumably that first one is a stop button relocated by an
  // earlier click; removeChild throws if it is not a child of that parent.
  if (stopRecordButtons.length > 1) generate_button.parentElement.removeChild(stopRecordButtons[0]);
  var stopRecordButton = document.getElementsByClassName("stop-button")[0];
  // Move the stop button to sit right after the Generate element.
  generate_button.insertAdjacentElement("afterend", stopRecordButton);
  //stopRecordButton.style.setProperty("margin-left", "-10px");
  stopRecordButton.style.setProperty("padding-right", "10px");
  // Hide the "Rec." button until the stop button is clicked.
  recButton.style.display = "none";
  stopRecordButton.addEventListener("click", function() {
    // Show the "Rec." button again once the stop button has been clicked.
    recButton.style.display = "flex";
  });
 });
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@ -1,5 +1,8 @@
 from pathlib import Path
 import gradio as gr
 import speech_recognition as sr
 import numpy as np
 from modules import shared
@ -45,6 +48,11 @@ def do_stt(audio, whipser_model, whipser_language):
 def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
    if audio is None:
        return "", ""
    sample_rate, audio_data = audio
    if not isinstance(audio_data[0], np.ndarray):      # workaround for chrome audio. Mono?
        # Convert to 2 channels, so each sample s_i consists of the same value in both channels [val_i, val_i]
        audio_data = np.column_stack((audio_data, audio_data))
        audio = (sample_rate, audio_data)
    transcription = do_stt(audio, whipser_model, whipser_language)
    if auto_submit:
        input_hijack.update({"state": True, "value": [transcription, transcription]})
@ -55,7 +63,7 @@ def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
 def ui():
    with gr.Accordion("Whisper STT", open=True):
        with gr.Row():
-            audio = gr.Audio(source="microphone")
+            audio = gr.Audio(source="microphone", type="numpy")
        with gr.Row():
            with gr.Accordion("Settings", open=False):
                auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
@ -69,3 +77,13 @@ def ui():
    whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None)
    whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None)
    auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)
 def custom_js():
    """
    Returns custom javascript as a string. It is applied whenever the web UI is
    loaded.

    The script is read from the "script.js" file located in the same directory
    as this module.

    :return: the full text of script.js
    :raises OSError: if script.js is missing or cannot be read
    """
    js_path = Path(__file__).parent.resolve() / "script.js"
    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and may fail on, or garble, non-ASCII characters.
    return js_path.read_text(encoding="utf-8")