Commit

Speak to Khoj via Desktop, Web or Obsidian Client (#566)
- Create speech to text API endpoint
- Use OpenAI Whisper for ASR offline (by downloading Whisper model) or online (via OpenAI API)
- Add speech to text model configuration to Database
- Speak to Khoj from the Web, Desktop or Obsidian client
debanjum authored Nov 26, 2023
2 parents c18d52d + b249bbb commit ebeae54
Showing 24 changed files with 517 additions and 47 deletions.
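All three clients drive the same new endpoint: they record audio, POST it as a multipart 'file' field to /api/transcribe with a Bearer token, and append the transcription from the JSON response to the chat input. Below is a minimal sketch of that call in Python; the server address, API key, and file name are placeholders, and the 'text' response field is assumed from the client code further down.

# Sketch of calling the new transcribe endpoint, mirroring the clients below.
# The server URL, API key, and audio file are placeholders, not values from this commit.
import requests

KHOJ_URL = "http://localhost:42110"  # assumed default local Khoj server address
KHOJ_API_KEY = "replace-with-your-khoj-api-key"

with open("recording.webm", "rb") as audio:
    response = requests.post(
        f"{KHOJ_URL}/api/transcribe?client=web",
        headers={"Authorization": f"Bearer {KHOJ_API_KEY}"},
        files={"file": audio},  # multipart form field named "file", as the clients send it
    )
response.raise_for_status()
print(response.json()["text"])  # transcribed text returned by the server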
1 change: 1 addition & 0 deletions pyproject.toml
@@ -75,6 +75,7 @@ dependencies = [
"tzdata == 2023.3",
"rapidocr-onnxruntime == 1.3.8",
"stripe == 7.3.0",
"openai-whisper >= 20231117",
]
dynamic = ["version"]

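The new openai-whisper dependency backs the offline ASR path described in the commit message. For reference, here is a rough sketch of how that package is typically used on its own; the model size and audio path are illustrative, not values wired into the Khoj server.

# Illustrative offline transcription with the openai-whisper package.
# Requires ffmpeg on the PATH to decode audio formats such as webm.
import whisper

model = whisper.load_model("base")           # downloads the model weights on first use
result = model.transcribe("recording.webm")  # returns a dict with "text", language, segments
print(result["text"])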
1 change: 1 addition & 0 deletions src/interface/desktop/assets/icons/microphone-solid.svg
37 changes: 37 additions & 0 deletions src/interface/desktop/assets/icons/stop-solid.svg
89 changes: 78 additions & 11 deletions src/interface/desktop/chat.html
@@ -516,6 +516,18 @@
}
}

function flashStatusInChatInput(message) {
// Get chat input element and original placeholder
let chatInput = document.getElementById("chat-input");
let originalPlaceholder = chatInput.placeholder;
// Set placeholder to message
chatInput.placeholder = message;
// Reset placeholder after 2 seconds
setTimeout(() => {
chatInput.placeholder = originalPlaceholder;
}, 2000);
}

async function clearConversationHistory() {
let chatInput = document.getElementById("chat-input");
let originalPlaceholder = chatInput.placeholder;
@@ -530,17 +542,71 @@
.then(data => {
chatBody.innerHTML = "";
loadChat();
chatInput.placeholder = "Cleared conversation history";
flashStatusInChatInput("🗑 Cleared conversation history");
})
.catch(err => {
chatInput.placeholder = "Failed to clear conversation history";
flashStatusInChatInput("⛔️ Failed to clear conversation history");
})
- .finally(() => {
- setTimeout(() => {
- chatInput.placeholder = originalPlaceholder;
- }, 2000);
}

let mediaRecorder;
async function speechToText() {
const speakButtonImg = document.getElementById('speak-button-img');
const chatInput = document.getElementById('chat-input');

const hostURL = await window.hostURLAPI.getURL();
let url = `${hostURL}/api/transcribe?client=desktop`;
const khojToken = await window.tokenAPI.getToken();
const headers = { 'Authorization': `Bearer ${khojToken}` };

const sendToServer = (audioBlob) => {
const formData = new FormData();
formData.append('file', audioBlob);

fetch(url, { method: 'POST', body: formData, headers})
.then(response => response.ok ? response.json() : Promise.reject(response))
.then(data => { chatInput.value += data.text; })
.catch(err => {
err.status == 422
? flashStatusInChatInput("⛔️ Configure speech-to-text model on server.")
: flashStatusInChatInput("⛔️ Failed to transcribe audio")
});
};

const handleRecording = (stream) => {
const audioChunks = [];
const recordingConfig = { mimeType: 'audio/webm' };
mediaRecorder = new MediaRecorder(stream, recordingConfig);

mediaRecorder.addEventListener("dataavailable", function(event) {
if (event.data.size > 0) audioChunks.push(event.data);
});

mediaRecorder.addEventListener("stop", function() {
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
sendToServer(audioBlob);
});

mediaRecorder.start();
speakButtonImg.src = './assets/icons/stop-solid.svg';
speakButtonImg.alt = 'Stop Transcription';
};

// Toggle recording
if (!mediaRecorder || mediaRecorder.state === 'inactive') {
navigator.mediaDevices
.getUserMedia({ audio: true })
.then(handleRecording)
.catch((e) => {
flashStatusInChatInput("⛔️ Failed to access microphone");
});
} else if (mediaRecorder.state === 'recording') {
mediaRecorder.stop();
speakButtonImg.src = './assets/icons/microphone-solid.svg';
speakButtonImg.alt = 'Transcribe';
}
}

</script>
<body>
<div id="khoj-empty-container" class="khoj-empty-container">
@@ -569,8 +635,11 @@
<div id="chat-tooltip" style="display: none;"></div>
<div id="input-row">
<textarea id="chat-input" class="option" oninput="onChatInput()" onkeydown=incrementalChat(event) autofocus="autofocus" placeholder="Type / to see a list of commands, or just type your questions and hit enter."></textarea>
<button class="input-row-button" onclick="clearConversationHistory()">
<img class="input-rown-button-img" src="./assets/icons/trash-solid.svg" alt="Clear Chat History"></img>
<button id="speak-button" class="input-row-button" onclick="speechToText()">
<img id="speak-button-img" class="input-row-button-img" src="./assets/icons/microphone-solid.svg" alt="Transcribe"></img>
</button>
<button id="clear-chat" class="input-row-button" onclick="clearConversationHistory()">
<img class="input-row-button-img" src="./assets/icons/trash-solid.svg" alt="Clear Chat History"></img>
</button>
</div>
</div>
@@ -620,7 +689,6 @@
.chat-message.you {
margin-right: auto;
text-align: right;
white-space: pre-line;
}
/* basic style chat message text */
.chat-message-text {
@@ -637,7 +705,6 @@
color: var(--primary-inverse);
background: var(--primary);
margin-left: auto;
white-space: pre-line;
}
/* Spinner symbol when the chat message is loading */
.spinner {
@@ -694,7 +761,7 @@
}
#input-row {
display: grid;
- grid-template-columns: auto 32px;
+ grid-template-columns: auto 32px 32px;
grid-column-gap: 10px;
grid-row-gap: 10px;
background: #f9fafc
107 changes: 98 additions & 9 deletions src/interface/obsidian/src/chat_modal.ts
@@ -1,4 +1,4 @@
- import { App, Modal, request, setIcon } from 'obsidian';
+ import { App, Modal, RequestUrlParam, request, requestUrl, setIcon } from 'obsidian';
import { KhojSetting } from 'src/settings';
import fetch from "node-fetch";

@@ -51,6 +51,16 @@ export class KhojChatModal extends Modal {
})
chatInput.addEventListener('change', (event) => { this.result = (<HTMLInputElement>event.target).value });

let transcribe = inputRow.createEl("button", {
text: "Transcribe",
attr: {
id: "khoj-transcribe",
class: "khoj-transcribe khoj-input-row-button",
},
})
transcribe.addEventListener('click', async (_) => { await this.speechToText() });
setIcon(transcribe, "mic");

let clearChat = inputRow.createEl("button", {
text: "Clear History",
attr: {
@@ -205,9 +215,19 @@ export class KhojChatModal extends Modal {
}
}

- async clearConversationHistory() {
flashStatusInChatInput(message: string) {
// Get chat input element and original placeholder
let chatInput = <HTMLInputElement>this.contentEl.getElementsByClassName("khoj-chat-input")[0];
let originalPlaceholder = chatInput.placeholder;
// Set placeholder to message
chatInput.placeholder = message;
// Reset placeholder after 2 seconds
setTimeout(() => {
chatInput.placeholder = originalPlaceholder;
}, 2000);
}

async clearConversationHistory() {
let chatBody = this.contentEl.getElementsByClassName("khoj-chat-body")[0];

let response = await request({
@@ -224,15 +244,84 @@
// If conversation history is cleared successfully, clear chat logs from modal
chatBody.innerHTML = "";
await this.getChatHistory();
- chatInput.placeholder = result.message;
+ this.flashStatusInChatInput(result.message);
}
} catch (err) {
chatInput.placeholder = "Failed to clear conversation history";
} finally {
// Reset to original placeholder text after some time
setTimeout(() => {
chatInput.placeholder = originalPlaceholder;
}, 2000);
this.flashStatusInChatInput("Failed to clear conversation history");
}
}

mediaRecorder: MediaRecorder | undefined;
async speechToText() {
const transcribeButton = <HTMLButtonElement>this.contentEl.getElementsByClassName("khoj-transcribe")[0];
const chatInput = <HTMLInputElement>this.contentEl.getElementsByClassName("khoj-chat-input")[0];

const generateRequestBody = async (audioBlob: Blob, boundary_string: string) => {
const boundary = `------${boundary_string}`;
const chunks: ArrayBuffer[] = [];

chunks.push(new TextEncoder().encode(`${boundary}\r\n`));
chunks.push(new TextEncoder().encode(`Content-Disposition: form-data; name="file"; filename="blob"\r\nContent-Type: "application/octet-stream"\r\n\r\n`));
chunks.push(await audioBlob.arrayBuffer());
chunks.push(new TextEncoder().encode('\r\n'));

await Promise.all(chunks);
chunks.push(new TextEncoder().encode(`${boundary}--\r\n`));
return await new Blob(chunks).arrayBuffer();
};

const sendToServer = async (audioBlob: Blob) => {
const boundary_string = `Boundary${Math.random().toString(36).slice(2)}`;
const requestBody = await generateRequestBody(audioBlob, boundary_string);

const response = await requestUrl({
url: `${this.setting.khojUrl}/api/transcribe?client=obsidian`,
method: 'POST',
headers: { "Authorization": `Bearer ${this.setting.khojApiKey}` },
contentType: `multipart/form-data; boundary=----${boundary_string}`,
body: requestBody,
});

// Parse response from Khoj backend
if (response.status === 200) {
console.log(response);
chatInput.value += response.json.text;
} else if (response.status === 422) {
throw new Error("⛔️ Configure speech-to-text model on server.");
} else {
throw new Error("⛔️ Failed to transcribe audio");
}
};

const handleRecording = (stream: MediaStream) => {
const audioChunks: Blob[] = [];
const recordingConfig = { mimeType: 'audio/webm' };
this.mediaRecorder = new MediaRecorder(stream, recordingConfig);

this.mediaRecorder.addEventListener("dataavailable", function(event) {
if (event.data.size > 0) audioChunks.push(event.data);
});

this.mediaRecorder.addEventListener("stop", async function() {
const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
await sendToServer(audioBlob);
});

this.mediaRecorder.start();
setIcon(transcribeButton, "mic-off");
};

// Toggle recording
if (!this.mediaRecorder || this.mediaRecorder.state === 'inactive') {
navigator.mediaDevices
.getUserMedia({ audio: true })
.then(handleRecording)
.catch((e) => {
this.flashStatusInChatInput("⛔️ Failed to access microphone");
});
} else if (this.mediaRecorder.state === 'recording') {
this.mediaRecorder.stop();
setIcon(transcribeButton, "mic");
}
}
}
2 changes: 1 addition & 1 deletion src/interface/obsidian/styles.css
@@ -112,7 +112,7 @@ If your plugin does not need CSS, delete this file.
}
.khoj-input-row {
display: grid;
- grid-template-columns: auto 32px;
+ grid-template-columns: auto 32px 32px;
grid-column-gap: 10px;
grid-row-gap: 10px;
background: var(--background-primary);
5 changes: 5 additions & 0 deletions src/khoj/database/adapters/__init__.py
@@ -26,6 +26,7 @@
OfflineChatProcessorConversationConfig,
OpenAIProcessorConversationConfig,
SearchModelConfig,
SpeechToTextModelOptions,
Subscription,
UserConversationConfig,
OpenAIProcessorConversationConfig,
@@ -344,6 +345,10 @@ async def get_openai_chat():
async def get_openai_chat_config():
return await OpenAIProcessorConversationConfig.objects.filter().afirst()

@staticmethod
async def get_speech_to_text_config():
return await SpeechToTextModelOptions.objects.filter().afirst()

@staticmethod
async def aget_conversation_starters(user: KhojUser):
all_questions = []
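The get_speech_to_text_config adapter above is what lets the new transcribe endpoint pick between locally run and OpenAI-hosted Whisper. The route itself lives in files not shown on this page, so the sketch below is hypothetical; the config field names (model_type, model_name) and the function shape are assumptions for illustration.

# Hypothetical branch between hosted and offline Whisper based on the configured
# speech-to-text model. Field names and this function are assumptions for illustration;
# only the adapter that fetches the config appears in this diff.
import openai   # online path: OpenAI-hosted Whisper (openai 0.x SDK call shape)
import whisper  # offline path: openai-whisper package

def transcribe_audio(audio_path: str, model_type: str, model_name: str) -> str:
    if model_type == "openai":
        with open(audio_path, "rb") as audio_file:
            return openai.Audio.transcribe(model_name, audio_file)["text"]
    # Offline: load the locally downloaded Whisper model; requires ffmpeg on the PATH
    return whisper.load_model(model_name).transcribe(audio_path)["text"]

The clients in this commit map a 422 response from /api/transcribe to their "configure speech-to-text model on server" prompt, which corresponds to the unconfigured case above.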
2 changes: 2 additions & 0 deletions src/khoj/database/admin.py
@@ -9,13 +9,15 @@
OpenAIProcessorConversationConfig,
OfflineChatProcessorConversationConfig,
SearchModelConfig,
SpeechToTextModelOptions,
Subscription,
ReflectiveQuestion,
)

admin.site.register(KhojUser, UserAdmin)

admin.site.register(ChatModelOptions)
admin.site.register(SpeechToTextModelOptions)
admin.site.register(OpenAIProcessorConversationConfig)
admin.site.register(OfflineChatProcessorConversationConfig)
admin.site.register(SearchModelConfig)
(Diffs for the remaining changed files are not shown here.)
