Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 168 additions & 0 deletions tools/whisperx/download_models_3.4.2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import argparse
import os
import urllib.request
from pathlib import Path

import huggingface_hub

# Language code -> torchaudio pipeline bundle name for the alignment models
# that ship as torchaudio checkpoints (en, fr, de, es, it).
# Every bundle name used here has a checkpoint entry in TORCHAUDIO_MODEL_FILES.
# NOTE(review): presumably mirrors the defaults of whisperx 3.4.2 -- confirm
# against that release when bumping the version.
DEFAULT_ALIGN_MODELS_TORCH = {
    "en": "WAV2VEC2_ASR_BASE_960H",
    "fr": "VOXPOPULI_ASR_BASE_10K_FR",
    "de": "VOXPOPULI_ASR_BASE_10K_DE",
    "es": "VOXPOPULI_ASR_BASE_10K_ES",
    "it": "VOXPOPULI_ASR_BASE_10K_IT",
}

# Language code -> Hugging Face repo id of the wav2vec2 alignment model for
# that language. download_hf_alignment_models() snapshots every repo listed
# here, in this order.
DEFAULT_ALIGN_MODELS_HF = {
    "ja": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
    "zh": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
    "nl": "jonatasgrosman/wav2vec2-large-xlsr-53-dutch",
    "uk": "Yehor/wav2vec2-xls-r-300m-uk-with-small-lm",
    "pt": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
    "ar": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
    "cs": "comodoro/wav2vec2-xls-r-300m-cs-250",
    "ru": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    "pl": "jonatasgrosman/wav2vec2-large-xlsr-53-polish",
    "hu": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
    "fi": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish",
    "fa": "jonatasgrosman/wav2vec2-large-xlsr-53-persian",
    "el": "jonatasgrosman/wav2vec2-large-xlsr-53-greek",
    "tr": "mpoyraz/wav2vec2-xls-r-300m-cv7-turkish",
    "da": "saattrupdan/wav2vec2-xls-r-300m-ftspeech",
    "he": "imvladikon/wav2vec2-xls-r-300m-hebrew",
    "vi": "nguyenvulebinh/wav2vec2-base-vi",
    "ko": "kresnik/wav2vec2-large-xlsr-korean",
    "ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
    "te": "anuragshas/wav2vec2-large-xlsr-53-telugu",
    "hi": "theainerd/Wav2Vec2-large-xlsr-hindi",
    "ca": "softcatala/wav2vec2-large-xlsr-catala",
    "ml": "gvs/wav2vec2-large-xlsr-malayalam",
    "no": "NbAiLab/nb-wav2vec2-1b-bokmaal-v2",
    "nn": "NbAiLab/nb-wav2vec2-1b-nynorsk",
    "sk": "comodoro/wav2vec2-xls-r-300m-sk-cv8",
    "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian",
    "hr": "classla/wav2vec2-xls-r-parlaspeech-hr",
    "ro": "gigant/romanian-wav2vec2",
    "eu": "stefan-it/wav2vec2-large-xlsr-53-basque",
    "gl": "ifrz/wav2vec2-large-xlsr-galician",
    "ka": "xsway/wav2vec2-large-xlsr-georgian",
    "lv": "jimregan/wav2vec2-large-xlsr-latvian-cv",
    "tl": "Khalsuu/filipino-wav2vec2-l-xls-r-300m-official",
    "sv": "KBLab/wav2vec2-large-voxrex-swedish",
}

# Whisper size key -> Hugging Face repo id of the faster-whisper model that
# download_asr_models() fetches for that size.
_MODELS = {
    "tiny": "Systran/faster-whisper-tiny",
    "base": "Systran/faster-whisper-base",
    "small": "Systran/faster-whisper-small",
    "medium": "Systran/faster-whisper-medium",
    "large": "Systran/faster-whisper-large-v3",
    "turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
}

# torchaudio pipeline bundle name -> checkpoint filename. The filename is
# appended to TORCHAUDIO_BASE_URL to form the download URL and is also the
# name the file is stored under in <model_dir>/hub/checkpoints.
TORCHAUDIO_MODEL_FILES = {
    "WAV2VEC2_ASR_BASE_960H": "wav2vec2_fairseq_base_ls960_asr_ls960.pth",
    "VOXPOPULI_ASR_BASE_10K_FR": "wav2vec2_voxpopuli_base_10k_asr_fr.pt",
    "VOXPOPULI_ASR_BASE_10K_DE": "wav2vec2_voxpopuli_base_10k_asr_de.pt",
    "VOXPOPULI_ASR_BASE_10K_ES": "wav2vec2_voxpopuli_base_10k_asr_es.pt",
    "VOXPOPULI_ASR_BASE_10K_IT": "wav2vec2_voxpopuli_base_10k_asr_it.pt",
}

# URL prefix for the checkpoint files; must end with "/" because filenames
# are concatenated onto it directly.
TORCHAUDIO_BASE_URL = "https://download.pytorch.org/torchaudio/models/"


def download_torchaudio_models(model_dir, model_files=None, base_url=None):
    """Download torchaudio checkpoints into ``<model_dir>/hub/checkpoints``.

    A checkpoint whose final file already exists is skipped. Every other
    checkpoint is first written to a ``<filename>.part`` sibling and renamed
    to its final name only after the transfer completed without raising, so
    an interrupted or failed download can never be mistaken for a finished
    file by the "already exists" check on a later run.

    Args:
        model_dir: Root model directory; ``hub/checkpoints`` is created
            beneath it if missing.
        model_files: Optional mapping of pipeline name -> checkpoint
            filename. Defaults to ``TORCHAUDIO_MODEL_FILES``.
        base_url: Optional URL prefix the filename is appended to.
            Defaults to ``TORCHAUDIO_BASE_URL``.

    Returns:
        list: Pipeline names whose download failed, in iteration order;
        empty when everything was downloaded or already present.
    """
    if model_files is None:
        model_files = TORCHAUDIO_MODEL_FILES
    if base_url is None:
        base_url = TORCHAUDIO_BASE_URL

    checkpoints_dir = Path(model_dir) / "hub" / "checkpoints"
    checkpoints_dir.mkdir(parents=True, exist_ok=True)

    failed = []
    for pipeline_name, filename in model_files.items():
        dest = checkpoints_dir / filename
        if dest.exists():
            print(f"Already exists, skipping: {filename}")
            continue

        url = base_url + filename
        # Staging file: lives next to the destination so the final rename
        # stays within one directory.
        partial = dest.with_name(dest.name + ".part")
        print(f"Downloading {pipeline_name} from {url} ...")
        try:
            urllib.request.urlretrieve(url, partial)
            # Publish under the real name only once the transfer succeeded.
            partial.replace(dest)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Failed to download {pipeline_name}: {e}")
            failed.append(pipeline_name)
            try:
                partial.unlink()
            except OSError:
                # Nothing was written, or the leftover cannot be removed.
                # Either way a stray ".part" file is never treated as a
                # finished checkpoint and is overwritten on the next run.
                pass
    return failed


def download_hf_alignment_models(model_dir, token):
    """Snapshot every repo in ``DEFAULT_ALIGN_MODELS_HF`` into ``<model_dir>/hub``.

    A failing repo does not stop the run: the error is printed and the
    remaining repos are still attempted.

    Args:
        model_dir: Root model directory; ``<model_dir>/hub`` is passed to
            ``huggingface_hub.snapshot_download`` as ``cache_dir``.
        token: Hugging Face access token, forwarded unchanged (may be None).

    Returns:
        list: Repo ids whose download raised, in iteration order; empty when
        every repo was fetched.
    """
    cache_dir = str(Path(model_dir) / "hub")
    failed = []
    for lang, repo_id in DEFAULT_ALIGN_MODELS_HF.items():
        print(f"Downloading HF alignment model [{lang}]: {repo_id} ...")
        try:
            huggingface_hub.snapshot_download(repo_id, cache_dir=cache_dir, token=token)
        except Exception as e:
            # Best effort: report, remember, and move on to the next repo.
            print(f"Failed to download {repo_id}: {e}")
            failed.append(repo_id)
        else:
            print(f"Downloaded: {repo_id}")
    return failed


def download_asr_models(model_dir, token):
    """Snapshot every ASR repo in ``_MODELS`` into ``<model_dir>/hub``.

    Only the files matching the ``allow_patterns`` list below are requested
    from each repo. A failing repo does not stop the run: the error is
    printed and the remaining repos are still attempted.

    Args:
        model_dir: Root model directory; ``<model_dir>/hub`` is passed to
            ``huggingface_hub.snapshot_download`` as ``cache_dir``.
        token: Hugging Face access token, forwarded unchanged (may be None).

    Returns:
        list: Model size keys (``_MODELS`` keys) whose download raised, in
        iteration order; empty when every model was fetched.
    """
    cache_dir = str(Path(model_dir) / "hub")
    failed = []
    for model_name, repo_id in _MODELS.items():
        print(f"Downloading ASR model [{model_name}]: {repo_id} ...")
        try:
            huggingface_hub.snapshot_download(
                repo_id,
                cache_dir=cache_dir,
                token=token,
                allow_patterns=[
                    "config.json",
                    "preprocessor_config.json",
                    "model.bin",
                    "tokenizer.json",
                    "vocabulary.*",
                ],
            )
        except Exception as e:
            # Best effort: report, remember, and move on to the next model.
            print(f"Failed to download {model_name}: {e}")
            failed.append(model_name)
        else:
            print(f"Downloaded: {model_name}")
    return failed


# Hugging Face repo ids snapshotted by download_pyannote_model(), in order.
PYANNOTE_REPOS = [
    "pyannote/speaker-diarization-3.1",
    "pyannote/segmentation-3.0",
    "pyannote/wespeaker-voxceleb-resnet34-LM",
]


def download_pyannote_model(model_dir, token):
    """Snapshot every repo in ``PYANNOTE_REPOS`` into ``<model_dir>/hub``.

    A failing repo does not stop the run: the error is printed and the
    remaining repos are still attempted.

    Args:
        model_dir: Root model directory; ``<model_dir>/hub`` is passed to
            ``huggingface_hub.snapshot_download`` as ``cache_dir``.
        token: Hugging Face access token, forwarded unchanged (may be None).
            NOTE(review): presumably required here because these repos are
            gated -- confirm against the model cards.

    Returns:
        list: Repo ids whose download raised, in iteration order; empty when
        every repo was fetched.
    """
    cache_dir = str(Path(model_dir) / "hub")
    failed = []
    for repo_id in PYANNOTE_REPOS:
        print(f"Downloading {repo_id} ...")
        try:
            huggingface_hub.snapshot_download(repo_id, cache_dir=cache_dir, token=token)
        except Exception as e:
            # Best effort: report, remember, and move on to the next repo.
            print(f"Failed to download {repo_id}: {e}")
            failed.append(repo_id)
        else:
            print(f"Downloaded: {repo_id}")
    return failed


def main(argv=None):
    """Parse the command line and run every download step.

    All four download steps are always attempted, even if an earlier one
    reported failures.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        int: 1 if any download step reported failed items, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description="Download WhisperX models (lightweight, no torch required)"
    )
    parser.add_argument(
        "--model-dir",
        default="whisperx_models",
        help="Directory to store downloaded models",
    )
    parser.add_argument(
        "--hf-token",
        default=os.getenv("HF_AUTH_TOKEN"),
        help="HuggingFace token for gated models",
    )
    args = parser.parse_args(argv)

    model_dir = args.model_dir
    # An empty --hf-token / HF_AUTH_TOKEN is normalised to None so it is
    # handled as "no token supplied" instead of being passed on as an empty
    # credential.
    token = args.hf_token or None

    # A download step that reports failures returns a list of the items it
    # could not fetch; a step returning None is treated as reporting nothing.
    failures = []
    failures.extend(download_torchaudio_models(model_dir) or [])
    failures.extend(download_hf_alignment_models(model_dir, token) or [])
    failures.extend(download_asr_models(model_dir, token) or [])
    failures.extend(download_pyannote_model(model_dir, token) or [])

    if failures:
        names = ", ".join(str(item) for item in failures)
        print(f"{len(failures)} download(s) failed: {names}")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so calling scripts can detect failed downloads.
    raise SystemExit(main())
78 changes: 78 additions & 0 deletions tools/whisperx/tool-data/huggingface.loc.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# WhisperX model table — covers ASR and alignment models.
#
# This file uses the shared huggingface data table schema.
# The pipeline_tag column distinguishes model types: asr | alignment.
# The free_tag column is set to "whisperx" for all rows so the XML can
# filter specifically for WhisperX rows in the shared table.
#
# To configure:
# 1. Copy this file to huggingface.loc (removing .sample), or append to an
# existing huggingface.loc shared with other tools (e.g. Flux)
# 2. Replace /data/db/whisperx_models with the actual root directory
# created by download_models_3.4.2.py (the directory passed to its
# --model-dir option; default: whisperx_models)
# 3. Remove rows for models you have not downloaded
#
# Columns:
# value : unique row identifier
# name : display label shown in Galaxy UI
# pipeline_tag : asr | alignment
# domain : whisper size key (asr) or BCP-47 language code (alignment)
# free_tag : whisperx (used by the tool XML to filter rows)
# version : tool version this row belongs to
# path : root model directory (containing hub/, nltk_data/, etc.)
#
# Row format (tab-separated): value <TAB> name <TAB> pipeline_tag <TAB> domain <TAB> free_tag <TAB> version <TAB> path
#
# --- version 3.4.2 ---
#
# ── ASR models ──────────────────────────────────────────────────────────────────────
whisperx_3.4.2_asr_turbo Turbo (~8x faster than large) asr turbo whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_asr_tiny Tiny (~10x faster than large) asr tiny whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_asr_base Base (~7x faster than large) asr base whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_asr_small Small (~4x faster than large) asr small whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_asr_medium Medium (~2x faster than large) asr medium whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_asr_large Large asr large whisperx 3.4.2 /data/db/whisperx_models
#
# ── Alignment models — torchaudio (en, fr, de, es, it) ──────────────────────────────
whisperx_3.4.2_align_en English alignment en whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_fr French alignment fr whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_de German alignment de whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_es Spanish alignment es whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_it Italian alignment it whisperx 3.4.2 /data/db/whisperx_models
#
# ── Alignment models — HuggingFace ──────────────────────────────────────────────────────
whisperx_3.4.2_align_ja Japanese alignment ja whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_zh Chinese alignment zh whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_nl Dutch alignment nl whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_uk Ukrainian alignment uk whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_pt Portuguese alignment pt whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ar Arabic alignment ar whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_cs Czech alignment cs whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ru Russian alignment ru whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_pl Polish alignment pl whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_hu Hungarian alignment hu whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_fi Finnish alignment fi whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_fa Persian alignment fa whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_el Greek alignment el whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_tr Turkish alignment tr whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_da Danish alignment da whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_he Hebrew alignment he whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_vi Vietnamese alignment vi whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ko Korean alignment ko whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ur Urdu alignment ur whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_te Telugu alignment te whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_hi Hindi alignment hi whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ca Catalan alignment ca whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ml Malayalam alignment ml whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_no Norwegian alignment no whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_nn Nynorsk alignment nn whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_sk Slovak alignment sk whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_sl Slovenian alignment sl whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_hr Croatian alignment hr whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ro Romanian alignment ro whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_eu Basque alignment eu whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_gl Galician alignment gl whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_ka Georgian alignment ka whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_lv Latvian alignment lv whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_tl Tagalog alignment tl whisperx 3.4.2 /data/db/whisperx_models
whisperx_3.4.2_align_sv Swedish alignment sv whisperx 3.4.2 /data/db/whisperx_models
13 changes: 13 additions & 0 deletions tools/whisperx/tool_data_table_conf.xml.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!-- Paths are relative to the value of `tool_data_path` in galaxy.ini -->
<!-- WhisperX uses the shared huggingface data table.
ASR rows have pipeline_tag=asr; alignment rows have pipeline_tag=alignment.
Column layout (7 columns):
value, name, pipeline_tag, domain, free_tag, version, path
domain is the whisper size key (asr) or BCP-47 language code (alignment).
path is the root model directory (containing hub/, nltk_data/, etc.). -->
<tables>
<table name="huggingface" comment_char="#" allow_duplicate_entries="False">
<columns>value, name, pipeline_tag, domain, free_tag, version, path</columns>
<file path="tool-data/huggingface.loc" />
</table>
</tables>
Loading