✨ feat: Add openai tts

lobehub · Nov 7, 2023 · 3dfcf6b · 3dfcf6b
1 parent a115c3b
commit 3dfcf6b
Show file tree

Hide file tree

Showing 30 changed files with 301 additions and 36 deletions.
diff --git a/.eslintignore b/.eslintignore
@@ -24,7 +24,6 @@ __test__
 # production
 dist
 es
-lib
 logs
 
 # misc

diff --git a/.prettierignore b/.prettierignore
@@ -35,7 +35,6 @@ __snapshots__
 # production
 dist
 es
-lib
 logs
 
 # umi

diff --git a/api/index.ts b/api/index.ts
@@ -1,8 +1,8 @@
 import qs from 'query-string';
 
 import cors from '../lib/cors';
+import { fetchMicrosoftSpeech } from '../lib/fetchMicrosoftSpeech';
 import { SsmlOptions } from '../lib/genSSML';
-import { postMicrosoftSpeech } from '../lib/postMicrosoftSpeech';
 
 export const config = {
  runtime: 'edge',
@@ -11,7 +11,7 @@ export const config = {
 export default async (req: Request) => {
  const { text, ...options }: SsmlOptions & { text: string } = qs.parseUrl(req.url).query as any;
 
- const res = await fetch(...postMicrosoftSpeech(text, options));
+ const res = await fetchMicrosoftSpeech(text, options);
 
  const newResponse = new Response(res.body, res);
 

diff --git a/lib/postMicrosoftSpeech.ts → lib/fetchMicrosoftSpeech.ts b/lib/postMicrosoftSpeech.ts → lib/fetchMicrosoftSpeech.ts
@@ -5,7 +5,7 @@ import { type SsmlOptions, genSSML } from './genSSML';
 const API =
  'https://southeastasia.api.speech.microsoft.com/accfreetrial/texttospeech/acc/v3.0-beta1/vcg/speak';
 
-export const postMicrosoftSpeech = (text: string, options: SsmlOptions): [any, any] => {
+export const fetchMicrosoftSpeech = async (text: string, options: SsmlOptions) => {
  const data = JSON.stringify({
  offsetInPlainText: 0,
  properties: {
@@ -15,7 +15,7 @@ export const postMicrosoftSpeech = (text: string, options: SsmlOptions): [any, a
  ttsAudioFormat: 'audio-24khz-160kbitrate-mono-mp3',
  });
 
- const DEFAULT_HEADERS = {
+ const DEFAULT_HEADERS = new Headers({
  'accept': '*/*',
  'accept-language': 'zh-CN,zh;q=0.9',
  'authority': 'southeastasia.api.speech.microsoft.com',
@@ -30,15 +30,13 @@ export const postMicrosoftSpeech = (text: string, options: SsmlOptions): [any, a
  'sec-fetch-site': 'same-site',
  'user-agent':
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
- };
+ });
 
- return [
- API,
- {
- body: data,
- headers: DEFAULT_HEADERS,
- method: 'POST',
- responseType: 'arraybuffer',
- },
- ];
+ return await fetch(API, {
+ body: data,
+ headers: DEFAULT_HEADERS,
+ method: 'POST',
+ // @ts-ignore
+ responseType: 'arraybuffer',
+ });
 };
diff --git a/package.json b/package.json
@@ -27,7 +27,7 @@
  "docs:dev": "dumi dev",
  "doctor": "father doctor",
  "postinstall": "npm run setup",
- "lint": "eslint \"{src,tests}/**/*.{js,jsx,ts,tsx}\" --fix",
+ "lint": "eslint \"{src,api,lib}/**/*.{js,jsx,ts,tsx}\" --fix",
  "lint:md": "remark . --quiet --frail --output",
  "prepare": "husky install",
  "prepublishOnly": "npm run build",
@@ -65,6 +65,7 @@
  "query-string": "^8",
  "ssml-document": "^1",
  "swr": "^2",
+ "url-join": "^5.0.0",
  "uuid": "^9"
  },
  "devDependencies": {

diff --git a/src/const/api.ts b/src/const/api.ts
@@ -0,0 +1,22 @@
+import urlJoin from 'url-join';
+
+// Environment-provided defaults; each falls back to '' (or the public OpenAI
+// endpoint) when the variable is unset or empty.
+export const MICROSOFT_SPEECH_PROXY_URL = process.env.MICROSOFT_SPEECH_PROXY_URL || '';
+export const AZURE_SPEECH_KEY = process.env.AZURE_SPEECH_KEY || '';
+export const AZURE_SPEECH_REGION = process.env.AZURE_SPEECH_REGION || '';
+export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
+export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
+
+/**
+ * Builds the OpenAI text-to-speech endpoint URL.
+ * @param api - Optional base URL; when omitted or empty, `OPENAI_PROXY_URL` is used.
+ */
+export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');
+
+/**
+ * Builds the OpenAI speech-to-text (transcription) endpoint URL.
+ * @param api - Optional base URL; when omitted or empty, `OPENAI_PROXY_URL` is used.
+ */
+export const OPENAI_STT_URL = (api?: string) =>
+  urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions');
diff --git a/src/data/openaiVoiceList.json b/src/data/openaiVoiceList.json
@@ -0,0 +1 @@
+["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
diff --git a/src/index.ts b/src/index.ts
@@ -1,11 +1,18 @@
+export { fetchAzureSpeech } from './services/fetchAzureSpeech';
+export { fetchEdgeSpeech } from './services/fetchEdgeSpeech';
+export { fetchMicrosoftSpeech } from './services/fetchMicrosoftSpeech';
+export { fetchOpenaiSTT } from './services/fetchOpenaiSTT';
+export { fetchOpenaiTTS } from './services/fetchOpenaiTTS';
 export { useAzureSpeech } from './useAzureSpeech';
 export { useEdgeSpeech } from './useEdgeSpeech';
 export { useMicrosoftSpeech } from './useMicrosoftSpeech';
+export { useOpenaiTTS } from './useOpenaiTTS';
 export { usePersistedSpeechRecognition } from './useSpeechRecognition/usePersistedSpeechRecognition';
 export { useSpeechRecognition } from './useSpeechRecognition/useSpeechRecognition';
 export { useSpeechSynthes } from './useSpeechSynthes';
 export {
  getAzureVoiceList,
  getEdgeVoiceList,
+ getOpenaiVoiceList,
  getSpeechSynthesVoiceList,
 } from './utils/getVoiceList';
diff --git a/src/services/postAzureSpeech.ts → src/services/fetchAzureSpeech.ts b/src/services/postAzureSpeech.ts → src/services/fetchAzureSpeech.ts
@@ -8,6 +8,8 @@ import {
  SpeechSynthesizer,
 } from 'microsoft-cognitiveservices-speech-sdk';
 
+import { AZURE_SPEECH_KEY, AZURE_SPEECH_REGION } from '@/const/api';
+
 import { type SsmlOptions, genSSML } from '../utils/genSSML';
 
 export interface AzureSpeechOptions extends SsmlOptions {
@@ -18,12 +20,12 @@ export interface AzureSpeechOptions extends SsmlOptions {
 }
 
 // 纯文本生成语音
-export const postAzureSpeech = async (
+export const fetchAzureSpeech = async (
  text: string,
  { api, ...options }: AzureSpeechOptions,
 ): Promise<AudioBufferSourceNode> => {
- const key = api.key || process.env.AZURE_SPEECH_KEY || '';
- const region = api.key || process.env.AZURE_SPEECH_REGION || '';
+ const key = api.key || AZURE_SPEECH_KEY;
+ const region = api.key || AZURE_SPEECH_REGION;
  const speechConfig = SpeechConfig.fromSubscription(key, region);
  speechConfig.setProperty(PropertyId.SpeechServiceResponse_RequestSentenceBoundary, 'true');
  speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Webm24Khz16BitMonoOpus;

diff --git a/src/services/postEdgeSpeech.ts → src/services/fetchEdgeSpeech.ts b/src/services/postEdgeSpeech.ts → src/services/fetchEdgeSpeech.ts
@@ -8,7 +8,7 @@ import { getHeadersAndData } from '../utils/getHeadersAndData';
 const API = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1';
 const TOKEN = '6A5AA1D4EAFF4E9FB37E23D68491D6F4';
 
-export const postEdgeSpeech = async (text: string, options: SsmlOptions) => {
+export const fetchEdgeSpeech = async (text: string, options: SsmlOptions) => {
  const connectId = uuidv4().replaceAll('-', '');
  const date = new Date().toString();
  const audioContext = new AudioContext();

diff --git a/src/services/postMicrosoftSpeech.ts → src/services/fetchMicrosoftSpeech.ts b/src/services/postMicrosoftSpeech.ts → src/services/fetchMicrosoftSpeech.ts
@@ -1,22 +1,28 @@
 import qs from 'query-string';
 
+import { MICROSOFT_SPEECH_PROXY_URL } from '@/const/api';
+
 import { type SsmlOptions } from '../utils/genSSML';
 
 export interface MicrosoftSpeechOptions extends SsmlOptions {
  api?: string;
 }
 
-export const postMicrosoftSpeech = async (
+export const fetchMicrosoftSpeech = async (
  text: string,
  { api, ...options }: MicrosoftSpeechOptions,
 ): Promise<AudioBufferSourceNode> => {
  const response: Response = await fetch(
  qs.stringifyUrl({
  query: { text, ...options },
- url: api || process.env.MICROSOFT_SPEECH_PROXY_URL || '',
+ url: api || MICROSOFT_SPEECH_PROXY_URL,
  }),
  );
 
+ if (!response.ok) {
+ throw new Error('Network response was not ok');
+ }
+
  const audioData = await response.arrayBuffer();
  const audioContext = new AudioContext();
  const audioBufferSource = audioContext.createBufferSource();

diff --git a/src/services/fetchOpenaiSTT.ts b/src/services/fetchOpenaiSTT.ts
@@ -0,0 +1,57 @@
+import { v4 as uuidv4 } from 'uuid';
+
+import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';
+
+/** Options accepted by {@link fetchOpenaiSTT}. */
+export interface OpenaiSttOptions {
+  api: {
+    /** API key; when empty, `OPENAI_API_KEY` is used instead. */
+    key: string;
+    /** Base URL passed to `OPENAI_STT_URL` to build the request URL. */
+    proxy: string;
+  };
+  model?: 'whisper-1';
+}
+
+/**
+ * @deprecated Misnamed copy of the TTS options type; use {@link OpenaiSttOptions}.
+ * Kept so existing imports from this module keep compiling.
+ */
+export type OpenaiTtsOptions = OpenaiSttOptions;
+
+/**
+ * Transcribes recorded speech to text by POSTing it to the URL built by `OPENAI_STT_URL`.
+ *
+ * @param speech - Audio blob to transcribe; uploaded under a generated `.webm` file name.
+ * @param options - API key/base URL and the model to use (defaults to `whisper-1`).
+ * @returns The `text` field of the JSON response.
+ * @throws Error when the HTTP response status is not OK.
+ */
+export const fetchOpenaiSTT = async (
+  speech: Blob,
+  { api, model = 'whisper-1' }: OpenaiSttOptions,
+): Promise<string> => {
+  const key = api.key || OPENAI_API_KEY;
+  const url = OPENAI_STT_URL(api.proxy);
+
+  // Do not set Content-Type by hand: fetch derives `multipart/form-data` together
+  // with the required boundary parameter from the FormData body. A hand-written
+  // header has no boundary, so the server cannot split the parts.
+  const headers = new Headers({
+    Authorization: `Bearer ${key}`,
+  });
+
+  const body = new FormData();
+  body.append('file', speech, `${uuidv4()}.webm`);
+  body.append('model', model);
+
+  const response: Response = await fetch(url, { body, headers, method: 'POST' });
+
+  if (!response.ok) {
+    throw new Error('Network response was not ok');
+  }
+
+  const json = await response.json();
+
+  return json?.text;
+};
diff --git a/src/services/fetchOpenaiTTS.ts b/src/services/fetchOpenaiTTS.ts
@@ -0,0 +1,54 @@
+import { OPENAI_API_KEY, OPENAI_TTS_URL } from '@/const/api';
+
+import { type SsmlOptions } from '../utils/genSSML';
+
+/** Voice names accepted in the `name` option of {@link fetchOpenaiTTS}. */
+export type OpenaiVoice = 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer';
+
+/** Options accepted by {@link fetchOpenaiTTS}. */
+export interface OpenaiTtsOptions extends SsmlOptions {
+  api: {
+    key: string;
+    proxy: string;
+  };
+  model?: 'tts-1' | 'tts-1-hd';
+  name: OpenaiVoice;
+}
+
+/**
+ * Generates speech audio from plain text by POSTing to the URL built by `OPENAI_TTS_URL`.
+ *
+ * @param text - Text sent as the request's `input`.
+ * @param options - API key/base URL, model (defaults to `tts-1`) and voice `name`.
+ * @returns A buffer source holding the decoded audio, already connected to its
+ *   context's destination; this function does not start playback.
+ * @throws Error when the HTTP response status is not OK.
+ */
+export const fetchOpenaiTTS = async (
+  text: string,
+  { api, model = 'tts-1', ...options }: OpenaiTtsOptions,
+): Promise<AudioBufferSourceNode> => {
+  const payload = JSON.stringify({ input: text, model, voice: options.name });
+
+  // An empty `api.key` falls back to the OPENAI_API_KEY constant.
+  const response: Response = await fetch(OPENAI_TTS_URL(api.proxy), {
+    body: payload,
+    headers: new Headers({
+      'Authorization': `Bearer ${api.key || OPENAI_API_KEY}`,
+      'Content-Type': 'application/json',
+    }),
+    method: 'POST',
+  });
+
+  if (!response.ok) {
+    throw new Error('Network response was not ok');
+  }
+
+  // Read the whole body first, then decode it into a node wired to the output.
+  const encoded = await response.arrayBuffer();
+  const context = new AudioContext();
+  const source = context.createBufferSource();
+  source.buffer = await context.decodeAudioData(encoded);
+  source.connect(context.destination);
+  return source;
+};
diff --git a/src/useAzureSpeech/demos/AzureSpeech.tsx → src/useAzureSpeech/demos/index.tsx b/src/useAzureSpeech/demos/AzureSpeech.tsx → src/useAzureSpeech/demos/index.tsx
diff --git a/src/useAzureSpeech/index.md b/src/useAzureSpeech/index.md
@@ -8,4 +8,4 @@ title: useAzureSpeech
 
 - ENV: `AZURE_SPEECH_KEY` `AZURE_SPEECH_REGION`
 
-<code src="./demos/AzureSpeech.tsx" nopadding></code>
+<code src="./demos/index.tsx" nopadding></code>
diff --git a/src/useAzureSpeech/index.ts b/src/useAzureSpeech/index.ts
@@ -1,7 +1,7 @@
 import { useState } from 'react';
 import useSWR from 'swr';
 
-import { AzureSpeechOptions, postAzureSpeech } from '../services/postAzureSpeech';
+import { AzureSpeechOptions, fetchAzureSpeech } from '../services/fetchAzureSpeech';
 
 export const useAzureSpeech = (defaultText: string, options: AzureSpeechOptions) => {
  const [data, setDate] = useState<AudioBufferSourceNode>();
@@ -11,7 +11,7 @@ export const useAzureSpeech = (defaultText: string, options: AzureSpeechOptions)
 
  const { isLoading } = useSWR(
  shouldFetch ? [options.name, text].join('-') : null,
- () => postAzureSpeech(text, options),
+ () => fetchAzureSpeech(text, options),
  {
  onError: () => setShouldFetch(false),
  onSuccess: (audioBufferSource) => {

diff --git a/src/useEdgeSpeech/demos/EdgeSpeech.tsx → src/useEdgeSpeech/demos/index.tsx b/src/useEdgeSpeech/demos/EdgeSpeech.tsx → src/useEdgeSpeech/demos/index.tsx
diff --git a/src/useEdgeSpeech/index.md b/src/useEdgeSpeech/index.md
@@ -6,4 +6,4 @@ title: useEdgeSpeech
 
 ## hooks
 
-<code src="./demos/EdgeSpeech.tsx" nopadding></code>
+<code src="./demos/index.tsx" nopadding></code>
diff --git a/src/useEdgeSpeech/index.ts b/src/useEdgeSpeech/index.ts
@@ -1,7 +1,7 @@
 import { useState } from 'react';
 import useSWR from 'swr';
 
-import { postEdgeSpeech } from '../services/postEdgeSpeech';
+import { fetchEdgeSpeech } from '../services/fetchEdgeSpeech';
 import { SsmlOptions } from '../utils/genSSML';
 
 export const useEdgeSpeech = (defaultText: string, options: SsmlOptions) => {
@@ -12,7 +12,7 @@ export const useEdgeSpeech = (defaultText: string, options: SsmlOptions) => {
 
  const { isLoading } = useSWR(
  shouldFetch ? [options.name, text].join('-') : null,
- () => postEdgeSpeech(text, options),
+ () => fetchEdgeSpeech(text, options),
  {
  onError: () => setShouldFetch(false),
  onSuccess: (audioBufferSource) => {

diff --git a/...MicrosoftSpeech/demos/MicrosoftSpeech.tsx → src/useMicrosoftSpeech/demos/index.tsx b/...MicrosoftSpeech/demos/MicrosoftSpeech.tsx → src/useMicrosoftSpeech/demos/index.tsx
@@ -6,14 +6,13 @@ import { Flexbox } from 'react-layout-kit';
 
 const defaultText = '这是一段使用 Microsoft Speech 的语音演示';
 
-const API = 'https://lobe-tts-preview.vercel.app/api/index';
 export default () => {
  const store = useCreateStore();
  const options: any = useControls(
  {
  api: {
  label: 'MICROSOFT_SPEECH_PROXY_URL',
- value: API,
+ value: '',
  },
  name: {
  options: getEdgeVoiceList(),

diff --git a/src/useMicrosoftSpeech/index.md b/src/useMicrosoftSpeech/index.md
@@ -11,4 +11,4 @@ title: useMicrosoftSpeech
 
 > ex: <https://lobe-tts-preview.vercel.app/api/index>
 
-<code src="./demos/MicrosoftSpeech.tsx" nopadding></code>
+<code src="./demos/index.tsx" nopadding></code>
diff --git a/src/useMicrosoftSpeech/index.ts b/src/useMicrosoftSpeech/index.ts
@@ -1,7 +1,10 @@
 import { useState } from 'react';
 import useSWR from 'swr';
 
-import { type MicrosoftSpeechOptions, postMicrosoftSpeech } from '../services/postMicrosoftSpeech';
+import {
+ type MicrosoftSpeechOptions,
+ fetchMicrosoftSpeech,
+} from '../services/fetchMicrosoftSpeech';
 
 export const useMicrosoftSpeech = (defaultText: string, options: MicrosoftSpeechOptions) => {
  const [data, setDate] = useState<AudioBufferSourceNode>();
@@ -11,7 +14,7 @@ export const useMicrosoftSpeech = (defaultText: string, options: MicrosoftSpeech
 
  const { isLoading } = useSWR(
  shouldFetch ? [options.name, text].join('-') : null,
- () => postMicrosoftSpeech(text, options),
+ () => fetchMicrosoftSpeech(text, options),
  {
  onError: () => setShouldFetch(false),
  onSuccess: (audioBufferSource) => {