Skip to content

Commit

Permalink
Prepend prefix tokens with the initial timestamp token
Browse files Browse the repository at this point in the history
  • Loading branch information
guillaumekln committed Jul 18, 2023
1 parent 3b4a6aa commit a9dc4cb
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 1 deletion.
2 changes: 2 additions & 0 deletions faster_whisper/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,8 @@ def get_prompt(

if prefix:
prefix_tokens = tokenizer.encode(" " + prefix.strip())
if not without_timestamps:
prefix_tokens.insert(0, tokenizer.timestamp_begin)
if len(prefix_tokens) >= self.max_length // 2:
prefix_tokens = prefix_tokens[: self.max_length // 2 - 1]
prompt.extend(prefix_tokens)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
av==10.*
ctranslate2>=3.10,<4
ctranslate2>=3.17,<4
huggingface_hub>=0.13
tokenizers==0.13.*
onnxruntime>=1.14,<2
18 changes: 18 additions & 0 deletions tests/test_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,24 @@ def test_transcribe(jfk_path):
assert segment.end == segment.words[-1].end


def test_prefix_with_timestamps(jfk_path):
    """Transcribe the JFK sample with a text prefix and check the result.

    The transcription is requested with ``prefix="And so my fellow Americans"``
    and default options otherwise. The test expects exactly one segment whose
    text begins with the prefix, whose start is 0, and whose end lies strictly
    between 10 and 11.
    """
    expected_text = (
        " And so my fellow Americans ask not what your country can do for you "
        "ask what you can do for your country."
    )

    whisper = WhisperModel("tiny")
    segment_iter, _info = whisper.transcribe(
        jfk_path,
        prefix="And so my fellow Americans",
    )
    # Materialize the returned iterable so it can be measured and indexed.
    results = list(segment_iter)

    assert len(results) == 1

    only_segment = results[0]

    assert only_segment.text == expected_text

    # The segment must start at the very beginning and end inside (10, 11).
    assert only_segment.start == 0
    assert 10 < only_segment.end < 11


def test_vad(jfk_path):
model = WhisperModel("tiny")
segments, info = model.transcribe(
Expand Down

0 comments on commit a9dc4cb

Please sign in to comment.