Commit

Speak to Khoj via Desktop, Web or Obsidian Client (#566)
- Create speech to text API endpoint
- Use OpenAI Whisper for ASR offline (by downloading Whisper model) or online (via OpenAI API)
- Add speech to text model configuration to Database
- Speak to Khoj from the Web, Desktop or Obsidian client
debanjum authored Nov 26, 2023
2 parents c18d52d + b249bbb commit ebeae54
Showing 24 changed files with 517 additions and 47 deletions.
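All three clients drive the same new endpoint: they record audio, POST it as a multipart 'file' field to /api/transcribe with a Bearer token, and append the transcription from the JSON response to the chat input. Below is a minimal sketch of that call in Python; the server address, API key, and file name are placeholders, and the 'text' response field is assumed from the client code further down.

# Sketch of calling the new transcribe endpoint, mirroring the clients below.
# The server URL, API key, and audio file are placeholders, not values from this commit.
import requests

KHOJ_URL = "http://localhost:42110"  # assumed default local Khoj server address
KHOJ_API_KEY = "replace-with-your-khoj-api-key"

with open("recording.webm", "rb") as audio:
    response = requests.post(
        f"{KHOJ_URL}/api/transcribe?client=web",
        headers={"Authorization": f"Bearer {KHOJ_API_KEY}"},
        files={"file": audio},  # multipart form field named "file", as the clients send it
    )
response.raise_for_status()
print(response.json()["text"])  # transcribed text returned by the server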
1 change: 1 addition & 0 deletions pyproject.toml
@@ -75,6 +75,7 @@ dependencies = [
"tzdata == 2023.3",
"rapidocr-onnxruntime == 1.3.8",
"stripe == 7.3.0",
"openai-whisper >= 20231117",
]
dynamic = ["version"]

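The new openai-whisper dependency backs the offline ASR path described in the commit message. For reference, here is a rough sketch of how that package is typically used on its own; the model size and audio path are illustrative, not values wired into the Khoj server.

# Illustrative offline transcription with the openai-whisper package.
# Requires ffmpeg on the PATH to decode audio formats such as webm.
import whisper

model = whisper.load_model("base")           # downloads the model weights on first use
result = model.transcribe("recording.webm")  # returns a dict with "text", language, segments
print(result["text"])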
1 change: 1 addition & 0 deletions src/interface/desktop/assets/icons/microphone-solid.svg
37 changes: 37 additions & 0 deletions src/interface/desktop/assets/icons/stop-solid.svg
89 changes: 78 additions & 11 deletions src/interface/desktop/chat.html
@@ -516,6 +516,18 @@
}
}

function flashStatusInChatInput(message) {
// Get chat input element and original placeholder
let chatInput = document.getElementById("chat-input");
let originalPlaceholder = chatInput.placeholder;
// Set placeholder to message
chatInput.placeholder = message;
// Reset placeholder after 2 seconds
setTimeout(() => {
chatInput.placeholder = originalPlaceholder;
}, 2000);
}

async function clearConversationHistory() {
let chatInput = document.getElementById("chat-input");
let originalPlaceholder = chatInput.placeholder;
@@ -530,17 +542,71 @@
.then(data => {
chatBody.innerHTML = "";
loadChat();
chatInput.placeholder = "Cleared conversation history";
flashStatusInChatInput("🗑 Cleared conversation history");
})
.catch(err => {
chatInput.placeholder = "Failed to clear conversation history";
flashStatusInChatInput("⛔️ Failed to clear conversation history");
})
- .finally(() => {
- setTimeout(() => {
- chatInput.placeholder = originalPlaceholder;
- }, 2000);
}

let mediaRecorder;
async function speechToText() {
const speakButtonImg = document.getElementById('speak-button-img');
const chatInput = document.getElementById('chat-input');

const hostURL = await window.hostURLAPI.getURL();
let url = `${hostURL}/api/transcribe?client=desktop`;
const khojToken = await window.tokenAPI.getToken();
const headers = { 'Authorization': `Bearer ${khojToken}` };

const sendToServer = (audioBlob) => {
const formData = new FormData();
formData.append('file', audioBlob);

fetch(url, { method: 'POST', body: formData, headers})
.then(response => response.ok ? response.json() : Promise.reject(response))
.then(data => { chatInput.value += data.text; })
.catch(err => {
err.status == 422
? flashStatusInChatInput("⛔️ Configure speech-to-text model on server.")
: flashStatusInChatInput("⛔️ Failed to transcribe audio")
});
};

const handleRecording = (stream) => {
const audioChunks = [];
const recordingConfig = { mimeType: 'audio/webm' };
mediaRecorder = new MediaRecorder(stream, recordingConfig);

mediaRecorder.addEventListener("dataavailable", function(event) {
if (event.data.size > 0) audioChunks.push(event.data);
});

mediaRecorder.addEventListener("stop", function() {
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
sendToServer(audioBlob);
});

mediaRecorder.start();
speakButtonImg.src = './assets/icons/stop-solid.svg';
speakButtonImg.alt = 'Stop Transcription';
};

// Toggle recording
if (!mediaRecorder || mediaRecorder.state === 'inactive') {
navigator.mediaDevices
.getUserMedia({ audio: true })
.then(handleRecording)
.catch((e) => {
flashStatusInChatInput("⛔️ Failed to access microphone");
});
} else if (mediaRecorder.state === 'recording') {
mediaRecorder.stop();
speakButtonImg.src = './assets/icons/microphone-solid.svg';
speakButtonImg.alt = 'Transcribe';
}
}

</script>
<body>
<div id="khoj-empty-container" class="khoj-empty-container">
@@ -569,8 +635,11 @@
<div id="chat-tooltip" style="display: none;"></div>
<div id="input-row">
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
<button class="input-row-button" onclick="clearConversationHistory()">
<img class="input-rown-button-img" src="./assets/icons/trash-solid.svg" alt="Clear Chat History"></img>
<button id="speak-button" class="input-row-button" onclick="speechToText()">
<img id="speak-button-img" class="input-row-button-img" src="./assets/icons/microphone-solid.svg" alt="Transcribe"></img>
</button>
<button id="clear-chat" class="input-row-button" onclick="clearConversationHistory()">
<img class="input-row-button-img" src="./assets/icons/trash-solid.svg" alt="Clear Chat History"></img>
</button>
</div>
</div>
@@ -620,7 +689,6 @@
.chat-message.you {
margin-right: auto;
text-align: right;
white-space: pre-line;
}
/* basic style chat message text */
.chat-message-text {
@@ -637,7 +705,6 @@
color: var(--primary-inverse);
background: var(--primary);
margin-left: auto;
white-space: pre-line;
}
/* Spinner symbol when the chat message is loading */
.spinner {
@@ -694,7 +761,7 @@
}
#input-row {
display: grid;
- grid-template-columns: auto 32px;
+ grid-template-columns: auto 32px 32px;
grid-column-gap: 10px;
grid-row-gap: 10px;
background: #f9fafc
107 changes: 98 additions & 9 deletions src/interface/obsidian/src/chat_modal.ts
@@ -1,4 +1,4 @@
- import { App, Modal, request, setIcon } from 'obsidian';
+ import { App, Modal, RequestUrlParam, request, requestUrl, setIcon } from 'obsidian';
import { KhojSetting } from 'src/settings';
import fetch from "node-fetch";

@@ -51,6 +51,16 @@ export class KhojChatModal extends Modal {
})
chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });

let transcribe = inputRow.createEl("button", {
text: "Transcribe",
attr: {
id: "khoj-transcribe",
class: "khoj-transcribe khoj-input-row-button",
},
})
transcribe.addEventListener('click', async (_) => { await this.speechToText() });
setIcon(transcribe, "mic");

let clearChat = inputRow.createEl("button", {
text: "Clear History",
attr: {
@@ -205,9 +215,19 @@ export class KhojChatModal extends Modal {
}
}

- async clearConversationHistory() {
flashStatusInChatInput(message: string) {
// Get chat input element and original placeholder
let chatInput = <HTMLInputElement>this.contentEl.getElementsByClassName("khoj-chat-input")[0];
let originalPlaceholder = chatInput.placeholder;
// Set placeholder to message
chatInput.placeholder = message;
// Reset placeholder after 2 seconds
setTimeout(() => {
chatInput.placeholder = originalPlaceholder;
}, 2000);
}

async clearConversationHistory() {
let chatBody = this.contentEl.getElementsByClassName("khoj-chat-body")[0];

let response = await request({
@@ -224,15 +244,84 @@
// If conversation history is cleared successfully, clear chat logs from modal
chatBody.innerHTML = "";
await this.getChatHistory();
- chatInput.placeholder = result.message;
+ this.flashStatusInChatInput(result.message);
}
} catch (err) {
chatInput.placeholder = "Failed to clear conversation history";
} finally {
// Reset to original placeholder text after some time
setTimeout(() => {
chatInput.placeholder = originalPlaceholder;
}, 2000);
this.flashStatusInChatInput("Failed to clear conversation history");
}
}

mediaRecorder: MediaRecorder | undefined;
async speechToText() {
const transcribeButton = <HTMLButtonElement>this.contentEl.getElementsByClassName("khoj-transcribe")[0];
const chatInput = <HTMLInputElement>this.contentEl.getElementsByClassName("khoj-chat-input")[0];

const generateRequestBody = async (audioBlob: Blob, boundary_string: string) => {
const boundary = `------${boundary_string}`;
const chunks: ArrayBuffer[] = [];

chunks.push(new TextEncoder().encode(`${boundary}\r\n`));
chunks.push(new TextEncoder().encode(`Content-Disposition: form-data; name="file"; filename="blob"\r\nContent-Type: "application/octet-stream"\r\n\r\n`));
chunks.push(await audioBlob.arrayBuffer());
chunks.push(new TextEncoder().encode('\r\n'));

await Promise.all(chunks);
chunks.push(new TextEncoder().encode(`${boundary}--\r\n`));
return await new Blob(chunks).arrayBuffer();
};

const sendToServer = async (audioBlob: Blob) => {
const boundary_string = `Boundary${Math.random().toString(36).slice(2)}`;
const requestBody = await generateRequestBody(audioBlob, boundary_string);

const response = await requestUrl({
url: `${this.setting.khojUrl}/api/transcribe?client=obsidian`,
method: 'POST',
headers: { "Authorization": `Bearer ${this.setting.khojApiKey}` },
contentType: `multipart/form-data; boundary=----${boundary_string}`,
body: requestBody,
});

// Parse response from Khoj backend
if (response.status === 200) {
console.log(response);
chatInput.value += response.json.text;
} else if (response.status === 422) {
throw new Error("⛔️ Configure speech-to-text model on server.");
} else {
throw new Error("⛔️ Failed to transcribe audio");
}
};

const handleRecording = (stream: MediaStream) => {
const audioChunks: Blob[] = [];
const recordingConfig = { mimeType: 'audio/webm' };
this.mediaRecorder = new MediaRecorder(stream, recordingConfig);

this.mediaRecorder.addEventListener("dataavailable", function(event) {
if (event.data.size > 0) audioChunks.push(event.data);
});

this.mediaRecorder.addEventListener("stop", async function() {
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
await sendToServer(audioBlob);
});

this.mediaRecorder.start();
setIcon(transcribeButton, "mic-off");
};

// Toggle recording
if (!this.mediaRecorder || this.mediaRecorder.state === 'inactive') {
navigator.mediaDevices
.getUserMedia({ audio: true })
.then(handleRecording)
.catch((e) => {
this.flashStatusInChatInput("⛔️ Failed to access microphone");
});
} else if (this.mediaRecorder.state === 'recording') {
this.mediaRecorder.stop();
setIcon(transcribeButton, "mic");
}
}
}
2 changes: 1 addition & 1 deletion src/interface/obsidian/styles.css
@@ -112,7 +112,7 @@ If your plugin does not need CSS, delete this file.
}
.khoj-input-row {
display: grid;
- grid-template-columns: auto 32px;
+ grid-template-columns: auto 32px 32px;
grid-column-gap: 10px;
grid-row-gap: 10px;
background: var(--background-primary);
5 changes: 5 additions & 0 deletions src/khoj/database/adapters/__init__.py
@@ -26,6 +26,7 @@
OfflineChatProcessorConversationConfig,
OpenAIProcessorConversationConfig,
SearchModelConfig,
SpeechToTextModelOptions,
Subscription,
UserConversationConfig,
OpenAIProcessorConversationConfig,
@@ -344,6 +345,10 @@ async def get_openai_chat():
async def get_openai_chat_config():
return await OpenAIProcessorConversationConfig.objects.filter().afirst()

@staticmethod
async def get_speech_to_text_config():
return await SpeechToTextModelOptions.objects.filter().afirst()

@staticmethod
async def aget_conversation_starters(user: KhojUser):
all_questions = []
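The get_speech_to_text_config adapter above is what lets the new transcribe endpoint pick between locally run and OpenAI-hosted Whisper. The route itself lives in files not shown on this page, so the sketch below is hypothetical; the config field names (model_type, model_name) and the function shape are assumptions for illustration.

# Hypothetical branch between hosted and offline Whisper based on the configured
# speech-to-text model. Field names and this function are assumptions for illustration;
# only the adapter that fetches the config appears in this diff.
import openai   # online path: OpenAI-hosted Whisper (openai 0.x SDK call shape)
import whisper  # offline path: openai-whisper package

def transcribe_audio(audio_path: str, model_type: str, model_name: str) -> str:
    if model_type == "openai":
        with open(audio_path, "rb") as audio_file:
            return openai.Audio.transcribe(model_name, audio_file)["text"]
    # Offline: load the locally downloaded Whisper model; requires ffmpeg on the PATH
    return whisper.load_model(model_name).transcribe(audio_path)["text"]

The clients in this commit map a 422 response from /api/transcribe to their "configure speech-to-text model on server" prompt, which corresponds to the unconfigured case above.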
2 changes: 2 additions & 0 deletions src/khoj/database/admin.py
@@ -9,13 +9,15 @@
OpenAIProcessorConversationConfig,
OfflineChatProcessorConversationConfig,
SearchModelConfig,
SpeechToTextModelOptions,
Subscription,
ReflectiveQuestion,
)

admin.site.register(KhojUser, UserAdmin)

admin.site.register(ChatModelOptions)
admin.site.register(SpeechToTextModelOptions)
admin.site.register(OpenAIProcessorConversationConfig)
admin.site.register(OfflineChatProcessorConversationConfig)
admin.site.register(SearchModelConfig)
(Diffs for the remaining changed files are not shown here.)
