Skip to content

Commit af1e85a

Browse files
committed
openai transcribe now supports byte buffers and input streams for the
audio data
1 parent f26e09b commit af1e85a

1 file changed

Lines changed: 63 additions & 18 deletions

File tree

src/main/resources/com/github/jlangch/venice/openai-java.venice

Lines changed: 63 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,7 @@
696696

697697
| [![width: 15%]] | [![width: 85%]] |
698698
| client | An OpenAI client |
699-
| data | A data `bytebuf` |
699+
| data | The data. A `bytebuf` or a Java `:InputStream`|
700700
| filename | A filename. E.g.: "planning.pdf" |
701701
| purpose | A purpose: `:USER_DATA`, `:VISION`, `:BATCH`, \
702702
`:ASSISTANTS`, `:FINE_TUNE` |
@@ -729,7 +729,7 @@
729729

730730
([client data filename purpose expires-after-seconds]
731731
{ :pre [(instance-of? :OpenAIClient client)
732-
(bytebuf? data)
732+
(or (bytebuf? data) (io/in-stream? data))
733733
(string? filename)
734734
(keyword? purpose)
735735
(long? expires-after-seconds)] }
@@ -1482,9 +1482,10 @@
14821482

14831483
(defn
14841484
^{ :arglists '(
1485-
"(transcribe client model file & options)" )
1485+
"(transcribe client model audio & options)" )
14861486
:doc """
1487-
Transcribes audio from the input file.
1487+
Transcribes audio data. The audio data can be supplied as an
1488+
`io/file`, a `bytebuf`, or a Java `:InputStream`.
14881489

14891490
Returns a transcription object in json, diarized_json, or verbose_json
14901491
format, or a stream of transcript events.
@@ -1496,14 +1497,20 @@
14961497
| :model m | The model to use for transcription. One of \
14971498
`:GPT_4O_TRANSCRIBE`, `:GPT_4O_MINI_TRANSCRIBE`, \
14981499
`:WHISPER_1`, or `:GPT_4O_TRANSCRIBE_DIARIZE` |
1499-
| :file f | The audio file object (not file name) to transcribe, in one \
1500+
| :audio a | The audio data to transcribe, in one \
15001501
of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, \
1501-
or webm. |
1502+
or webm.¶\
1503+
The audio data can be supplied as an `io/file`, a `bytebuf`, \
1504+
or a Java `:InputStream`. |
15021505

15031506

15041507
¶**Parameter «options»**
15051508

15061509
| [![width: 15%]] | [![width: 85%]] |
1510+
| :filename f | If the passed audio is a `bytebuf` or a Java `:InputStream`, \
1511+
an explicit file name must be passed. E.g.: 'audio.wav'. \
1512+
The filename's extension must match the audio type \
1513+
(`wav`, `mp3`, ...). |
15071514
| :language l | The language of the input audio. Supplying the input \
15081515
language in ISO-639-1 (e.g. "en", "de", "fr", "it", ...) \
15091516
format will improve accuracy and latency.|
@@ -1552,34 +1559,72 @@
15521559
:response-format :TEXT)]
15531560
(println "Transcription:")
15541561
(println (openai-java/transcription-text response)))))
1562+
""",
1563+
"""
1564+
(do
1565+
(load-module :openai-java)
1566+
(let [client (openai-java/client)
1567+
is (openai-java/create-speech
1568+
client
1569+
:GPT_4O_MINI_TTS
1570+
"Today is a wonderful day to build something people love!"
1571+
:format :WAV
1572+
:voice "cedar")]
1573+
1574+
(println "Transcribing..." )
1575+
(let [response (openai-java/transcribe client
1576+
:GPT_4O_TRANSCRIBE
1577+
is
1578+
:filename "audio.wav"
1579+
:language "en"
1580+
:temperature 0.1
1581+
:response-format :TEXT)]
1582+
(println "Transcription:")
1583+
(println (openai-java/transcription-text response)))))
15551584
""")
15561585
:see-also '(
15571586
"openai-java/transcription-text"
15581587
"openai-java/usage"
15591588
"openai-java/client") }
15601589

1561-
transcribe [client model file & options]
1590+
transcribe [client model audio & options]
15621591

15631592
{ :pre [(instance-of? :OpenAIClient client)
15641593
(keyword? model)
1565-
(or (io/file? file) (bytebuf? file) (io/in-stream? file))] }
1594+
(or (io/file? audio) (bytebuf? audio) (io/in-stream? audio))] }
15661595

15671596
(let [opts (apply hash-map options)
1597+
filename (:filename opts)
15681598
chunking-strategy (:chunking-strategy opts)
15691599
language (:language opts)
15701600
prompt (:prompt opts)
15711601
response-format (:response-format opts)
15721602
temperature (:temperature opts)]
1573-
(let [params (. :TranscriptionCreateParams :builder)]
1574-
(. params :model (. :AudioModel model))
1575-
(. params :file (io/->path file))
1576-
(when language (. params :language language))
1577-
(when prompt (. params :prompt prompt))
1578-
(when response-format (. params :responseFormat (. :AudioResponseFormat response-format)))
1579-
(when temperature (. params :temperature temperature))
1580-
(-> (. client :audio)
1581-
(. :transcriptions)
1582-
(. :create (. params :build))))))
1603+
(when (and (not (io/file? audio)) (nil? filename))
1604+
(throw (ex :VncException
1605+
"""
1606+
If the passed audio data is a `bytebuf` or a Java
1607+
`:InputStream` an explicit 'filename' option must be passed!
1608+
The filename's extensions must match the audio
1609+
type `wav`, `mp3`, ...
1610+
E.g.: `:filename "audio.wav"`.
1611+
""")))
1612+
;; The :TranscriptionCreateParams :file method throws an exception
1613+
;; when passing a byte array or an input stream.
1614+
;; The :TemporaryFile is a workaround until OpenAI fixes the API
1615+
(try-with [audio-file (if (io/file? audio)
1616+
(. :TemporaryFile :of audio)
1617+
(. :TemporaryFile :of audio filename))]
1618+
(let [params (. :TranscriptionCreateParams :builder)]
1619+
(. params :model (. :AudioModel model))
1620+
(. params :file (. audio-file :getPath))
1621+
(when language (. params :language language))
1622+
(when prompt (. params :prompt prompt))
1623+
(when response-format (. params :responseFormat (. :AudioResponseFormat response-format)))
1624+
(when temperature (. params :temperature temperature))
1625+
(-> (. client :audio)
1626+
(. :transcriptions)
1627+
(. :create (. params :build)))))))
15831628

15841629

15851630
(defn

0 commit comments

Comments
 (0)