ml-explore · anthonywu · Oct 3, 2024 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024
diff --git a/whisper/README.md b/whisper/README.md
@@ -25,8 +25,8 @@ pip install mlx-whisper
 
 At its simplest:
 
-```
-mlx_whisper audio_file.mp3
+```sh
+mlx_whisper audio_file.mp3 # output name will re-use basename of audio file path
 ```
 
 This will make a text file `audio_file.txt` with the results.
@@ -35,6 +35,20 @@ Use `-f` to specify the output format and `--model` to specify the model. There
 are many other supported command line options. To see them all, run
 `mlx_whisper -h`.
 
+Alternatively, pass `-` as the audio argument to read audio content from stdin,
+which lets `mlx_whisper` act as a composable command line utility.
+
+```sh
+# hypothetical demo of audio content via stdin (`-` means "read from stdin")
+# default output file name will be content.*
+some-process | mlx_whisper -
+
+# hypothetical demo of media content via stdin
+# use --output-name to name your output artifacts
+some-downloader https://some.url/media?id=lecture42 | mlx_whisper - --output-name mlx-demo
+```
+
+
 #### API
 
 Transcribe audio with:
@@ -103,7 +117,7 @@ python convert.py --help
 ```
 
 By default, the conversion script will make the directory `mlx_models`
-and save the converted `weights.npz` and `config.json` there. 
+and save the converted `weights.npz` and `config.json` there.
 
 Each time it is run, `convert.py` will overwrite any model in the provided
 path. To save different models, make sure to set `--mlx-path` to a unique

diff --git a/whisper/mlx_whisper/audio.py b/whisper/mlx_whisper/audio.py
@@ -3,7 +3,7 @@
 import os
 from functools import lru_cache
 from subprocess import CalledProcessError, run
-from typing import Union
+from typing import Optional, Union
 
 import mlx.core as mx
 import numpy as np
@@ -21,7 +21,7 @@
 TOKENS_PER_SECOND = SAMPLE_RATE // N_SAMPLES_PER_TOKEN # 20ms per audio token
 
 
-def load_audio(file: str, sr: int = SAMPLE_RATE):
+def load_audio(file: str = Optional[str], sr: int = SAMPLE_RATE, from_stdin=False):
  """
  Open an audio file and read as mono waveform, resampling as necessary
 
@@ -40,18 +40,20 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
 
  # This launches a subprocess to decode audio while down-mixing
  # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+ if from_stdin:
+ cmd = ["ffmpeg", "-i", "pipe:0"]
+ else:
+ cmd = ["ffmpeg", "-nostdin", "-i", file]
+
  # fmt: off
- cmd = [
- "ffmpeg",
- "-nostdin",
+ cmd.extend([
  "-threads", "0",
- "-i", file,
  "-f", "s16le",
  "-ac", "1",
  "-acodec", "pcm_s16le",
  "-ar", str(sr),
  "-"
- ]
+ ])
  # fmt: on
  try:
  out = run(cmd, capture_output=True, check=True).stdout

diff --git a/whisper/mlx_whisper/cli.py b/whisper/mlx_whisper/cli.py
@@ -5,6 +5,7 @@
 import traceback
 import warnings
 
+from . import audio
 from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE
 from .transcribe import transcribe
 from .writers import get_writer
@@ -27,15 +28,21 @@ def str2bool(string):
  parser = argparse.ArgumentParser(
  formatter_class=argparse.ArgumentDefaultsHelpFormatter
  )
- parser.add_argument(
-  "audio", nargs="+", type=str, help="Audio file(s) to transcribe"
- )
+
+ parser.add_argument("audio", nargs="+", help="Audio file(s) to transcribe")
+
  parser.add_argument(
  "--model",
  default="mlx-community/whisper-tiny",
  type=str,
  help="The model directory or hugging face repo",
  )
+ parser.add_argument(
+ "--output-name",
+ type=str,
+ default="{basename}",
+ help="logical name of transcription/translation output files, before --output-format extensions",
+ )
  parser.add_argument(
  "--output-dir",
  "-o",
@@ -200,9 +207,10 @@ def main():
  path_or_hf_repo: str = args.pop("model")
  output_dir: str = args.pop("output_dir")
  output_format: str = args.pop("output_format")
+ output_name_template: str = args.pop("output_name")
  os.makedirs(output_dir, exist_ok=True)
 
- writer = get_writer(output_format, output_dir)
+ writer = get_writer(output_format, output_dir, output_name_template)
  word_options = [
  "highlight_words",
  "max_line_count",
@@ -219,17 +227,22 @@ def main():
  warnings.warn("--max-line-count has no effect without --max-line-width")
  if writer_args["max_words_per_line"] and writer_args["max_line_width"]:
  warnings.warn("--max-words-per-line has no effect with --max-line-width")
- for audio_path in args.pop("audio"):
+
+ for audio_obj in args.pop("audio"):
+ if audio_obj == "-":
+ # receive the contents from stdin rather than read a file
+ audio_obj = audio.load_audio(from_stdin=True)
+
  try:
  result = transcribe(
- audio_path,
+ audio_obj,
  path_or_hf_repo=path_or_hf_repo,
  **args,
  )
- writer(result, audio_path, **writer_args)
+ writer(result, audio_obj, **writer_args)
  except Exception as e:
  traceback.print_exc()
- print(f"Skipping {audio_path} due to {type(e).__name__}: {str(e)}")
+ print(f"Skipping {audio_obj} due to {type(e).__name__}: {str(e)}")
 
 
 if __name__ == "__main__":

diff --git a/whisper/mlx_whisper/writers.py b/whisper/mlx_whisper/writers.py
@@ -1,10 +1,8 @@
 # Copyright © 2024 Apple Inc.
 
 import json
-import os
+import pathlib
 import re
-import sys
-import zlib
 from typing import Callable, List, Optional, TextIO
 
 
@@ -39,19 +37,26 @@ def get_start(segments: List[dict]) -> Optional[float]:
 class ResultWriter:
  extension: str
 
- def __init__(self, output_dir: str):
+ def __init__(self, output_dir: str, output_name_template: str):
  self.output_dir = output_dir
+ self.output_name_template = output_name_template
 
  def __call__(
- self, result: dict, audio_path: str, options: Optional[dict] = None, **kwargs
+ self, result: dict, audio_obj: str, options: Optional[dict] = None, **kwargs
  ):
- audio_basename = os.path.basename(audio_path)
- audio_basename = os.path.splitext(audio_basename)[0]
- output_path = os.path.join(
- self.output_dir, audio_basename + "." + self.extension
+ if isinstance(audio_obj, (str, pathlib.Path)):
+ basename = pathlib.Path(audio_obj).stem
+ else:
+ # mx.array, np.ndarray, etc
+ basename = "content"
+
+ output_basename = self.output_name_template.format(basename=basename)
+
+ output_path = (pathlib.Path(self.output_dir) / output_basename).with_suffix(
+ f".{self.extension}"
  )
 
- with open(output_path, "w", encoding="utf-8") as f:
+ with output_path.open("wt", encoding="utf-8") as f:
  self.write_result(result, file=f, options=options, **kwargs)
 
  def write_result(
@@ -248,7 +253,7 @@ def write_result(
 
 
 def get_writer(
- output_format: str, output_dir: str
+ output_format: str, output_dir: str, output_name_template: str
 ) -> Callable[[dict, TextIO, dict], None]:
  writers = {
  "txt": WriteTXT,
@@ -259,7 +264,9 @@ def get_writer(
  }
 
  if output_format == "all":
- all_writers = [writer(output_dir) for writer in writers.values()]
+ all_writers = [
+ writer(output_dir, output_name_template) for writer in writers.values()
+ ]
 
  def write_all(
  result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
@@ -269,4 +276,4 @@ def write_all(
 
  return write_all
 
- return writers[output_format](output_dir)
+ return writers[output_format](output_dir, output_name_template)
diff --git a/whisper/test_cli.sh b/whisper/test_cli.sh
@@ -0,0 +1,69 @@
+#!/bin/zsh -e
+
+set -o err_exit
+
+TEST_AUDIO="mlx_whisper/assets/ls_test.flac"
+TEST_OUTPUT_DIR=$(mktemp -d -t mlx_whisper_cli_test)
+
+# the control output - cli called with the audio file as a positional arg
+# expected output files are ls_test_transcribed.{json,srt,tsv,txt,vtt}
+TEST_OUTPUT_NAME_FOR_ALL="--output-name arg is used for all output formats"
+mlx_whisper "$TEST_AUDIO" \
+ --output-dir "$TEST_OUTPUT_DIR" \
+ --output-format all \
+ --output-name '{basename}_transcribed' \
+ --temperature 0 \
+ --verbose=False
+if /bin/ls ${TEST_OUTPUT_DIR}/ls_test_transcribed.{json,srt,tsv,txt,vtt} > /dev/null; then
+ echo "[PASS] $TEST_OUTPUT_NAME_FOR_ALL"
+else
+ echo "[FAIL] $TEST_OUTPUT_NAME_FOR_ALL"
+fi
+
+
+TEST_OUTPUT_NAME_TEMPLATE="testing the output name template usage scenario"
+for test_val in $(seq 10 10 60); do
+ mlx_whisper "$TEST_AUDIO" \
+ --output-name "{basename}_mwpl_${test_val}" \
+ --output-dir "$TEST_OUTPUT_DIR" \
+ --output-format srt \
+ --max-words-per-line $test_val \
+ --word-timestamps True \
+ --verbose=False
+ TEST_DESC="testing output name template while varying --max-words-per-line=${test_val}"
+ if /bin/ls $TEST_OUTPUT_DIR/ls_test_mwpl_${test_val}.srt > /dev/null; then
+ echo "[PASS] $TEST_DESC"
+ else
+ echo "[FAIL] $TEST_DESC"
+ fi
+done
+
+
+TEST_STDIN_1="mlx_whisper produces identical output whether provided audio arg or stdin of same content"
+/bin/cat "$TEST_AUDIO" | mlx_whisper - \
+ --output-dir "$TEST_OUTPUT_DIR" \
+ --output-format json \
+ --temperature 0 \
+ --verbose=False
+if diff "${TEST_OUTPUT_DIR}/content.json" "${TEST_OUTPUT_DIR}/ls_test_transcribed.json"; then
+ echo "[PASS] $TEST_STDIN_1"
+else
+ echo "[FAIL] $TEST_STDIN_1"
+ echo "Check unexpected output in ${TEST_OUTPUT_DIR}"
+fi
+
+TEST_STDIN_2="mlx_whisper produces identical output when stdin comes via: cmd < file"
+mlx_whisper - \
+ --output-name '{basename}_transcribed' \
+ --output-dir "$TEST_OUTPUT_DIR" \
+ --output-format tsv \
+ --temperature 0 \
+ --verbose=False < "$TEST_AUDIO"
+if diff "${TEST_OUTPUT_DIR}/content_transcribed.tsv" "${TEST_OUTPUT_DIR}/ls_test_transcribed.tsv"; then
+ echo "[PASS] $TEST_STDIN_2"
+else
+ echo "[FAIL] $TEST_STDIN_2"
+ echo "Check unexpected output in ${TEST_OUTPUT_DIR}"
+fi
+
+echo "Outputs can be verified in ${TEST_OUTPUT_DIR}"