From 17c6e1cbf4fa6e4c6c6317492ecc20760c1feb41 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 26 Jun 2023 17:44:35 +0100 Subject: [PATCH 01/24] from private repo --- chapters/en/_toctree.yml | 42 +- chapters/en/chapter5/asr_models.mdx | 380 +++++++++++++ chapters/en/chapter5/choosing_dataset.mdx | 127 +++++ chapters/en/chapter5/evaluation.mdx | 310 +++++++++++ chapters/en/chapter5/fine-tuning.mdx | 526 ++++++++++++++++++ .../{unpublished => en}/chapter5/hands_on.mdx | 0 chapters/en/chapter5/introduction.mdx | 40 ++ .../{unpublished => en}/chapter5/quiz.mdx | 0 .../chapter5/supplemental_reading.mdx | 0 chapters/unpublished/chapter5/asr_models.mdx | 1 - .../unpublished/chapter5/choosing_dataset.mdx | 1 - chapters/unpublished/chapter5/evaluation.mdx | 1 - chapters/unpublished/chapter5/fine-tuning.mdx | 1 - .../unpublished/chapter5/introduction.mdx | 1 - .../chapter5/preprocessing_data.mdx | 1 - .../chapter5/speaker_diarization.mdx | 1 - 16 files changed, 1404 insertions(+), 28 deletions(-) create mode 100644 chapters/en/chapter5/asr_models.mdx create mode 100644 chapters/en/chapter5/choosing_dataset.mdx create mode 100644 chapters/en/chapter5/evaluation.mdx create mode 100644 chapters/en/chapter5/fine-tuning.mdx rename chapters/{unpublished => en}/chapter5/hands_on.mdx (100%) create mode 100644 chapters/en/chapter5/introduction.mdx rename chapters/{unpublished => en}/chapter5/quiz.mdx (100%) rename chapters/{unpublished => en}/chapter5/supplemental_reading.mdx (100%) delete mode 100644 chapters/unpublished/chapter5/asr_models.mdx delete mode 100644 chapters/unpublished/chapter5/choosing_dataset.mdx delete mode 100644 chapters/unpublished/chapter5/evaluation.mdx delete mode 100644 chapters/unpublished/chapter5/fine-tuning.mdx delete mode 100644 chapters/unpublished/chapter5/introduction.mdx delete mode 100644 chapters/unpublished/chapter5/preprocessing_data.mdx delete mode 100644 chapters/unpublished/chapter5/speaker_diarization.mdx diff --git a/chapters/en/_toctree.yml b/chapters/en/_toctree.yml index 2c6a9a20..f8288581 100644 --- a/chapters/en/_toctree.yml +++ b/chapters/en/_toctree.yml @@ -65,29 +65,29 @@ - local: chapter4/hands_on title: Hands-on exercise -#- title: Unit 5. Transcribe a meeting recording -# sections: -# - local: chapter5/introduction -# title: What you'll learn and what you'll build -# - local: chapter5/choosing_dataset -# title: Choosing a dataset -# - local: chapter5/asr_models -# title: Pre-trained models for automatic speech recognition -# - local: chapter5/preprocessing_data -# title: Loading and preprocessing data -# - local: chapter5/evaluation -# title: Evaluation metrics for ASR -# - local: chapter5/fine-tuning -# title: Fine-tuning the ASR model +- title: Unit 5. 
Automatic Speech Recognition + sections: + - local: chapter5/introduction + title: What you'll learn and what you'll build + - local: chapter5/asr_models + title: Pre-trained models for speech recognition + - local: chapter5/choosing_dataset + title: Choosing a dataset + - local: chapter5/evaluation + title: Evaluation and metrics for speech recognition + - local: chapter5/fine-tuning + title: How to fine-tune an ASR system with the Trainer API # - local: chapter5/speaker_diarization # title: Automatic speech recognition with speaker diarization -# - local: chapter5/quiz -# title: Quiz -# quiz: 5 -# - local: chapter5/hands_on -# title: Hands-on exercise -# - local: chapter5/supplemental_reading -# title: Supplemental reading and resources + - local: chapter4/demo + title: Building a demo + - local: chapter5/quiz + title: Quiz + quiz: 5 + - local: chapter5/hands_on + title: Hands-on exercise + - local: chapter5/supplemental_reading + title: Supplemental reading and resources # #- title: Unit 6. From text to speech # sections: diff --git a/chapters/en/chapter5/asr_models.mdx b/chapters/en/chapter5/asr_models.mdx new file mode 100644 index 00000000..28403a62 --- /dev/null +++ b/chapters/en/chapter5/asr_models.mdx @@ -0,0 +1,380 @@ +# Pre-trained models for automatic speech recognition + +In this section, we'll cover how to use the `pipeline()` to leverage pre-trained models for speech recognition. In [Unit 2](../chapter2/asr_pipeline.mdx), +we introduced the `pipeline()` as an easy way of running speech recognition tasks, with all pre- and post-processing handled under-the-hood +and the flexibility to quickly experiment with any pre-trained checkpoint on the Hugging Face Hub. In this Unit, we'll go a +level deeper and explore the different attributes of speech recognition models and how we can use them to tackle a range +of different tasks. + +As detailed in Unit 3, speech recognition model broadly fall into one of two categories: + +1. Connectionist Temporal Classification (CTC): _encoder-only_ models with a linear classification (CTC) head on top +2. Sequence-to-sequence (Seq2Seq): _encoder-decoder_ models, with a cross-attention mechanism between the encoder and decoder + +Prior to 2022, CTC was the more popular of the two architectures, with encoder-only models such as Wav2Vec2, HuBERT and XLSR achieving +breakthoughs in the pre-training / fine-tuning paradigm for speech. Big corporations, such as Meta and Microsoft, pre-trained +the encoder on vast amounts of unlabelled audio data for many days or weeks. Users could then take a pre-trained checkpoint, and +fine-tune it with a CTC head on as little as **10 minutes** of labelled speech data to achieve strong performance on a downstream +speech recognition task. + +However, CTC models have their shortcomings. Appending a simple linear layer to an encoder gives a small, fast overall model, but can +be prone to phonetic spelling errors. We'll demonstrate this for the Wav2Vec2 model below. 
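To build an intuition for why this happens before we probe a real checkpoint, here is a toy, illustrative sketch of CTC greedy decoding: we take the most likely class for each audio frame, collapse consecutive repeats, and drop the special blank token. The vocabulary and logits below are made-up stand-ins for a real checkpoint's character set and encoder outputs - the point is simply that no language model is involved anywhere in this process:

```python
import torch

# made-up character vocabulary: index 0 is the special CTC "blank" token
vocab = ["<blank>", "c", "a", "t", " "]

# fake per-frame logits standing in for the encoder + linear CTC head output
# shape: (num_frames, vocab_size); random, so the decoded text is meaningless -
# what matters is the decoding procedure itself
logits = torch.randn(10, len(vocab))

# 1. greedy decoding: pick the most likely class for every frame
frame_ids = torch.argmax(logits, dim=-1).tolist()

# 2. collapse consecutive repeats, then remove the blank token
collapsed = [idx for i, idx in enumerate(frame_ids) if i == 0 or idx != frame_ids[i - 1]]
transcription = "".join(vocab[idx] for idx in collapsed if idx != 0)

print(transcription)
```

Since each output character is derived almost entirely from the acoustics, with no language model to veto implausible letter sequences, a CTC model will happily produce strings that are not valid words - exactly the failure mode we'll see next.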
## Probing CTC Models

Let's load a small excerpt of the [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) dataset to demonstrate
Wav2Vec2's speech transcription capabilities:

```python
from datasets import load_dataset

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
dataset
```

**Output:**
```
Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
    num_rows: 73
})
```

We can pick one of the 73 audio samples and inspect the audio as well as its transcription:

```python
from IPython.display import Audio

sample = dataset[2]

print(sample["text"])
Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])
```
**Output:**
```
HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND
```

Alright! Christmas and roast beef, sounds great! 🎄 Having chosen a data sample, we now load a fine-tuned checkpoint into
the `pipeline()`. For this, we'll use the official [Wav2Vec2 base](https://huggingface.co/facebook/wav2vec2-base-100h) checkpoint fine-tuned on
100 hours of LibriSpeech data:

```python
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-100h")
```

Next, we'll take an example from the dataset and pass its raw audio data to the pipeline:

```python
pipe(sample["audio"].copy())
```
**Output:**
```
{'text': 'HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAUS AND ROSE BEEF LOOMING BEFORE US SIMALYIS DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND'}
```

We can see that the Wav2Vec2 model does a pretty good job at transcribing this sample - at first glance it looks generally correct.
Let's put the target and prediction side-by-side and highlight the differences:

```
Target:     HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND
Prediction: HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH **CHRISTMAUS** AND **ROSE** BEEF LOOMING BEFORE US **SIMALYIS** DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND
```

Comparing the target text to the predicted transcription, we can see that all the words _sound_ correct, but some are not spelled accurately. For example:

* _CHRISTMAUS_ vs. _CHRISTMAS_
* _ROSE_ vs. _ROAST_
* _SIMALYIS_ vs. _SIMILES_

This highlights the shortcoming of a CTC model. A CTC model is essentially an 'acoustic-only' model: it consists of an encoder,
which forms hidden-state representations from the audio inputs, and a linear layer, which maps the hidden-states to characters.

This means that the system bases its prediction almost entirely on the acoustic input it was given (the phonetic sounds of the audio),
and so has a tendency to transcribe the audio in a phonetic way (e.g. _CHRISTMAUS_). It gives less importance to the
language modelling context of previous and successive letters, and so is prone to phonetic spelling errors. A more intelligent model
would identify that _CHRISTMAUS_ is not a valid word in the English vocabulary and correct it to _CHRISTMAS_ when making
its predictions. We're also missing two big features from the prediction - casing and punctuation - which limits the usefulness of
the model's transcriptions for real-world applications.
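Incidentally, if you want to reproduce this kind of side-by-side comparison programmatically, Python's built-in `difflib` module is enough to flag the mismatching words. The snippet below is just a convenience helper (with the transcripts shortened for brevity), not part of any 🤗 library:

```python
import difflib

target = "WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING"
prediction = "WITH CHRISTMAUS AND ROSE BEEF LOOMING BEFORE US SIMALYIS DRAWN FROM EATING"

# compare the two transcripts word-by-word and print only the mismatches
for token in difflib.ndiff(target.split(), prediction.split()):
    if token.startswith("- "):
        print(f"target only:     {token[2:]}")
    elif token.startswith("+ "):
        print(f"prediction only: {token[2:]}")
```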
## Graduation to Seq2Seq

Cue Seq2Seq models! As outlined in Unit 3, Seq2Seq models are formed of an encoder and a decoder linked via a cross-attention
mechanism. The encoder plays the same role as before, computing hidden-state representations of the audio inputs, while the decoder
plays the role of a **language model**. The decoder processes the entire sequence of hidden-state representations
from the encoder and generates the corresponding text transcription. With global context of the audio input, the decoder
is able to use language modelling context as it makes its predictions, correcting spelling mistakes on-the-fly and thus
circumventing the issue of phonetic predictions.

There are two downsides to Seq2Seq models:
1. They are inherently slower at decoding, since the decoding process happens one step at a time, rather than all at once
2. They are more data hungry, requiring significantly more training data to reach convergence

In particular, the need for large amounts of training data has been a bottleneck in the advancement of Seq2Seq architectures for
speech. Labelled speech data is difficult to come by, with the largest annotated datasets at the time clocking in at just
10,000 hours. This all changed in 2022 with the release of **Whisper**. Whisper is a pre-trained model for speech recognition
published in [September 2022](https://openai.com/blog/whisper/) by Alec Radford et al. from OpenAI. Unlike
its CTC predecessors, which were pre-trained entirely on **un-labelled** audio data, Whisper is pre-trained on a vast quantity of
**labelled** audio-transcription data, 680,000 hours to be precise.

This is an order of magnitude more data than the un-labelled audio data used to train Wav2Vec 2.0 (60,000 hours). What is
more, 117,000 hours of this pre-training data is multilingual (or "non-English") data. This results in checkpoints that can be applied to
over 96 languages, many of which are considered _low-resource_, meaning the language lacks a large corpus of data suitable for training.

When scaled to 680,000 hours of labelled pre-training data, Whisper models demonstrate a strong ability to generalise to
many datasets and domains. The pre-trained checkpoints achieve results competitive with state-of-the-art ASR systems, with
near 3% word error rate (WER) on the test-clean subset of LibriSpeech ASR and a new state-of-the-art on TED-LIUM with
4.7% WER (_c.f._ Table 8 of the [Whisper paper](https://cdn.openai.com/papers/whisper.pdf)).

Of particular importance is Whisper's ability to handle long-form audio samples, its robustness to input noise and its ability
to predict cased and punctuated transcriptions. This makes it a viable candidate for real-world speech recognition systems.

The remainder of this section will show you how to use the pre-trained Whisper models for speech recognition with 🤗
Transformers. In many situations, the pre-trained Whisper checkpoints are extremely performant and give great results,
so we encourage you to try the pre-trained checkpoints as a first step to solving any speech recognition problem.
Through fine-tuning, the pre-trained checkpoints can be adapted for specific datasets and languages to further improve
upon these results. We'll demonstrate how to do this in the upcoming subsection on [fine-tuning](fine-tuning.mdx).

The Whisper checkpoints come in five configurations of varying model sizes. The smallest four are trained on either
English-only or multilingual data.
The largest checkpoint is multilingual only. All nine of the pre-trained checkpoints +are available on the [Hugging Face Hub](https://huggingface.co/models?search=openai/whisper). The checkpoints are +summarised in the following table with links to the models on the Hub. "VRAM" denotes the required GPU memory to run the +model with the minimum batch size of 1. "Rel Speed" is the relative speed of a checkpoint compared to the largest model. +Based on this information, you can select a checkpoint that is best suited to your hardware. + +| Size | Parameters | VRAM / GB | Rel Speed | English-only | Multilingual | +|--------|------------|-----------|-----------|------------------------------------------------------|-----------------------------------------------------| +| tiny | 39 M | 1.4 | 32 | [✓](https://huggingface.co/openai/whisper-tiny.en) | [✓](https://huggingface.co/openai/whisper-tiny) | +| base | 74 M | 1.5 | 16 | [✓](https://huggingface.co/openai/whisper-base.en) | [✓](https://huggingface.co/openai/whisper-base) | +| small | 244 M | 2.3 | 6 | [✓](https://huggingface.co/openai/whisper-small.en) | [✓](https://huggingface.co/openai/whisper-small) | +| medium | 769 M | 4.2 | 2 | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium) | +| large | 1550 M | 7.5 | 1 | x | [✓](https://huggingface.co/openai/whisper-large-v2) | + +Let's load the [Whisper Base](https://huggingface.co/openai/whisper-base) checkpoint, which is of comparable size to the +Wav2Vec2 checkpoint we used previously. Preempting our move to multilingual speech recognition, we'll load the multilingual +variant of the base checkpoint. We'll also load the model on the GPU if available, or CPU otherwise. The `pipeline()` will +subsequently take care of moving all inputs / outputs from the CPU to the GPU as required: + +```python +import torch +from transformers import pipeline + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device) +``` + +Great! Now let's transcribe the audio as before. The only change we make is passing an extra argument, `max_new_tokens`, +which tells the model the maximum number of tokens to generate when making its prediction: + +```python +pipe(sample["audio"], max_new_tokens=256) +``` +**Output:** +``` +{'text': ' He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind.'} +``` + +Easy enough! The first thing you'll notice is the presence of both casing and punctuation. Immediately this makes the +transcription easier to read compared to the un-cased and un-punctuated transcription from Wav2Vec2. Let's put the transcription +side-by-side with the target: + +``` +Target: HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND +Prediction: He tells us that at this festive season of the year, with **Christmas** and **roast** beef looming before us, **similarly** is drawn from eating and its results occur most readily to the mind. +``` + +Whisper has done a great job at correcting the phonetic errors we saw from Wav2Vec2 - both _Christmas_ and _roast_ are +spelled correctly. 
We see that the model still struggles with _SIMILES_, being incorrectly transcribed as _similarly_, but +this time the prediction is a valid word from the English vocabulary. Using a larger Whisper checkpoint can help further +reduce transcription errors, at the expense of requiring more compute and a longer transcription time. + +We've been promised a model that can handle 96 languages, so lets leave English speech recognition for now and go global 🌎! +The [Multilingual LibriSpeech](https://huggingface.co/datasets/facebook/multilingual_librispeech) (MLS) dataset is +the multilingual equivalent of the LibriSpeech dataset, with labelled audio data in six languages. We'll load one sample +from the Spanish split of the MLS dataset, making use of _streaming_ mode so that we don't have to download the entire dataset: + +```python +dataset = load_dataset("facebook/multilingual_librispeech", "spanish", split="validation", streaming=True) +sample = next(iter(dataset)) +``` + +Again, we'll inspect the text transcription and take a listen to the audio segment: + +```python +print(sample["text"]) +Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"]) +``` +**Output:** +``` +entonces te delelitarás en jehová y yo te haré subir sobre las alturas de la tierra y te daré á comer la heredad de jacob tu padre porque la boca de jehová lo ha hablado +``` + +This is the target text that we're aiming for with our Whisper transcription. Although we now know that we can +probably do better this, since our model is also going to predict punctuation and casing, neither of which are present in the +reference. Let's forward the audio sample to the pipeline to get our text prediction. One thing to note is that the +pipeline _consumes_ the dictionary of audio inputs that we input, meaning the dictionary can't be re-used. To circumvent +this, we'll pass a _copy_ of the audio sample, so that we can re-use the same audio sample in the proceeding code examples: + +```python +pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task": "transcribe"}) +``` +**Output:** +``` +{'text': ' Entonces te deleitarás en Jehová y yo te haré subir sobre las alturas de la tierra y te daré a comer la heredad de Jacob tu padre porque la boca de Jehová lo ha hablado.'} +``` + +Great - this looks very similar to our reference text (arguably better since it has punctuation and casing!). You'll notice +that we forwarded the `"task"` as a _generate key-word argument_ (generate kwarg). Setting the `"task"` to `"transcribe"` +forces Whisper to perform the task of _speech recognition_, where the audio is transcribed in the same language that the +speech was spoken in. Whisper is also capable of performing the closely related task of _speech translation_, where the +audio in Spanish can be translated to text in English. To achieve this, we set the `"task"` to `"translate"`: + +```python +pipe(sample["audio"], max_new_tokens=256, generate_kwargs={"task": "translate"}) +``` +**Output:** +``` +{'text': ' So you will choose in Jehovah and I will raise you on the heights of the earth and I will give you the honor of Jacob to your father because the voice of Jehovah has spoken to you.'} +``` + +Now that we know we can toggle between speech recognition and speech translation, we can pick our task depending on our +needs. Either we recognise from audio in language X to text in the same language X (e.g. Spanish audio to Spanish text), +or we translate from audio in any language X to text in English (e.g. 
Spanish audio to English text). + +To read more about how the `"task"` argument is used to control the properties of the generated text, refer to the +[model card](https://huggingface.co/openai/whisper-base#usage) for the Whisper base model. + +## Long-Form Transcription and Timestamps + +So far, we've focussed on transcribing short audio samples of less than 30 seconds. We mentioned that one of the appeals +of Whisper was its ability to work on long audio samples. We'll tackle this task here! + +Let's create a long audio file by concatenating sequential samples from the MLS dataset. Since the MLS dataset is +curated by splitting long audiobook recordings into shorter segments, concatenating samples is one way of reconstructing +longer audiobook passages. Consequently, the resulting audio should be coherent across the entire sample. + +We'll set our target audio length to 5 minutes, and stop concatenating samples once we hit this value: + +```python +import numpy as np + +target_length_in_m = 5 + +# convert from minutes to seconds (* 60) to num samples (* sampling rate) +sampling_rate = pipe.feature_extractor.sampling_rate +target_length_in_samples = target_length_in_m * 60 * sampling_rate + +# iterate over our streaming dataset, concatenating samples until we hit our target +long_audio = [] +for sample in dataset: + long_audio.extend(sample["audio"]["array"]) + if len(long_audio) > target_length_in_samples: + break + +long_audio = np.asarray(long_audio) + +# how did we do? +seconds = len(long_audio) / 16000 +minutes, seconds = divmod(seconds, 60) +print(f"Length of audio sample is {minutes} minutes {seconds:.2f} seconds") +``` +**Output:** +``` +Length of audio sample is 5.0 minutes 17.22 seconds +``` + +Alright! 5 minutes and 17 seconds of audio to transcribe. There are two problems with forwarding this long audio sample +directly to the model: +1. Whisper is inherently designed to work with 30 second samples: anything shorter than 30s is padded to 30s with silence, anything longer than 30s is truncated to 30s by cutting of the extra audio, so if we pass our audio directly we'll only get the transcription for the first 30s +2. Memory in a transformer network scales with the sequence length squared: doubling the input length quadruples the memory requirement, so passing super long audio files is bound to lead to an out-of-memory (OOM) error + +The way long-form transcription works in 🤗 Transformers is by _chunking_ the input audio into smaller, more manageable segments. +Each segment has a small amount of overlap with the previous one. This allows us to accurately stitch the segments back together +at the boundaries, since we can find the overlap between segments and merge the transcriptions accordingly: + + + +The advantage of chunking the samples is that we don't need the result of chunk \\( i \\) to transcribe the subsequent +chunk \\( i + 1 \\). The stitching is done after we have transcribed all the chunks at the chunk boundaries, so it doesn't +matter which order we transcribe chunks in. The algorithm is entirely **stateless**, so we can even do chunk \\( i + 1 \\) +at the same time as chunk \\( i \\)! This allows us to _batch_ the chunks and run them through the model in parallel, +providing a large computational speed-up compared to transcribing them sequentially. To read more about chunking in 🤗 Transformers, +you can refer to this [blog post](https://huggingface.co/blog/asr-chunking). 
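To make the chunking idea concrete, here is a simplified, purely illustrative sketch of how a long waveform could be carved into 30-second windows that share a small overlap. The variable names are ours, and the pipeline implements its own, more sophisticated version of this internally (including merging the overlapping transcriptions), so you never need to write this yourself:

```python
import numpy as np

sampling_rate = 16000
chunk_length_s = 30.0  # length of each chunk in seconds
overlap_s = 5.0        # audio shared between neighbouring chunks

chunk_len = int(chunk_length_s * sampling_rate)
step = chunk_len - int(overlap_s * sampling_rate)

# a fake 5-minute mono waveform standing in for `long_audio`
long_audio = np.zeros(5 * 60 * sampling_rate)

# slide a 30s window over the audio, stepping forward by (30s - overlap) each time
chunks = [long_audio[start : start + chunk_len] for start in range(0, len(long_audio), step)]

print(f"Split {len(long_audio) / sampling_rate:.0f}s of audio into {len(chunks)} overlapping chunks")
```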
+ +To activate long-form transcriptions, we have to add one additional argument when we call the pipeline. This argument, +`chunk_length_s`, controls the length of the chunked segments in seconds. For Whisper, 30 second chunks are optimal, +since this matches the input length Whisper expects. + +To activate batching, we need to pass the argument `batch_size` to the pipeline. Putting it all together, we can transcribe the +long audio sample with chunking and batching as follows: + +```python +pipe(long_audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"}, chunk_length_s=30, batch_size=8) +``` +**Output:** +``` +{'text': ' Entonces te deleitarás en Jehová, y yo te haré subir sobre las alturas de la tierra, y te daré a comer la +heredad de Jacob tu padre, porque la boca de Jehová lo ha hablado. nosotros curados. Todos nosotros nos descarriamos +como bejas, cada cual se apartó por su camino, mas Jehová cargó en él el pecado de todos nosotros... +``` + +We won't print the entire output here since it's pretty long (312 words total)! On a 16GB V100 GPU, you can expect the above +line to take approximately 3.45 seconds to run, which is pretty good for a 317 second audio sample. On a CPU, expect +closer to 30 seconds. + +Whisper is also able to predict segment-level _timestamps_ for the audio data. These timestamps indicate the start and end +time for a short passage of audio, and are particularly useful for aligning a transcription with the input audio. Suppose +we want to provide closed captions for a video - we need these timestamps to know which part of the transcription corresponds +to a certain segment of video, in order to display the correct transcription for that time. + +Activating timestamp prediction is straightforward, we just need to set the argument `return_timestamps=True`. Timestamps +are compatible with both the chunking and batching methods we used previously, so we can simply append the timestamp +argument to our previous call: + +```python +pipe(long_audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"}, chunk_length_s=30, batch_size=8, return_timestamps=True)["chunks"] +``` +**Output:** +``` +[{'timestamp': (0.0, 26.4), + 'text': ' Entonces te deleitarás en Jehová, y yo te haré subir sobre las alturas de la tierra, y te daré a comer la heredad de Jacob tu padre, porque la boca de Jehová lo ha hablado. nosotros curados. Todos nosotros nos descarriamos como bejas, cada cual se apartó por su camino,'}, + {'timestamp': (26.4, 32.48), + 'text': ' mas Jehová cargó en él el pecado de todos nosotros. No es que partas tu pan con el'}, + {'timestamp': (32.48, 38.4), + 'text': ' hambriento y a los hombres herrantes metas en casa, que cuando vieres al desnudo lo cubras y no'}, + ... +``` + +And voila! We have our predicted text as well as corresponding timestamps. + +## Summary + +Whisper is a strong pre-trained model for speech recognition and translation. Compared to Wav2Vec2, it has higher +transcription accuracy, with outputs that contain punctuation and casing. It can be used to transcribe speech in English +as well as 96 other languages, both on short audio segments and longer ones through _chunking_. These attributes make it +a viable model for many speech recognition and translation tasks without the need for fine-tuning. The `pipeline()` method +provides an easy way of running inference in one-line API calls with control over the generated predictions. 
+ +While the Whisper model performs extremely well on many high-resource languages, it has lower transcription and translation +accuracy on low-resource languages, i.e. those with less readily available training data. There is also varying performance +across different accents and dialects of certain languages, including lower accuracy for speakers of different genders, +races, ages or other demographic criteria (_c.f._ [Whisper paper](https://arxiv.org/pdf/2212.04356.pdf)). + +To boost the performance on low-resource languages, accents or dialects, we can take the pre-trained Whisper model and +train it on a small corpus of appropriately selected data, in a process called _fine-tuning_. We'll show that with +as little of 10 hours of additional data, we can improve the performance of the Whisper model by X% on a low-resource +language. TODO(SG): update with results when available. In the next section, we'll cover the process behind selecting a +dataset for fine-tuning. diff --git a/chapters/en/chapter5/choosing_dataset.mdx b/chapters/en/chapter5/choosing_dataset.mdx new file mode 100644 index 00000000..b2230724 --- /dev/null +++ b/chapters/en/chapter5/choosing_dataset.mdx @@ -0,0 +1,127 @@ +# Choosing a dataset + +As with any machine learning problem, our model is only as good as the data that we train it on. Speech recognition +datasets vary considerably in how they are curated and the domains that they cover. To pick the right dataset, we need +to match our criteria with the features that a dataset offers. + +Before we pick a dataset, we first need to understand the key defining features. + +### 1. Number of hours +Simply put, the number of training hours indicates how large the dataset is. It’s analogous to the number of training +examples in an NLP dataset. However, bigger datasets aren’t necessarily better. If we want a model that generalises well, +we want a **diverse** dataset with lots of different speakers, domains and speaking styles. + +### 2. Domain +The domain entails where the data was sourced from, whether it be audiobooks, podcasts, YouTube or financial meetings. +Each domain has a different distribution of data. For example, audiobooks are recorded in high-quality studio conditions +(with no background noise) and text that is taken from written literature. Whereas for YouTube, the audio likely contains +more background noise and a more informal style of speech. + +We need to match our domain to the conditions we anticipate at inference time. For instance, if we train our model on +audiobooks, we can’t expect it to perform well in noisy environments. + +### 3. Speaking style +The speaking style falls into one of two categories: + +* Narrated: read from a script +* Spontaneous: un-scripted, conversational speech + +The audio and text data reflect the style of speaking. Since narrated text is scripted, it tends to be spoken articulately +and without any errors: + +``` +“Consider the task of training a model on a speech recognition dataset” +``` + +Whereas for spontaneous speech, we can expect a more colloquial style of speech, with the inclusion of repetitions, +hesitations and false-starts: + +``` +“Let’s uhh let's take a look at how you'd go about training a model on uhm a sp- speech recognition dataset” +``` + +### 4. Transcription style +The transcription style refers to whether the target text has punctuation, casing or both. 
If we want a system to generate +fully formatted text that could be used for a publication or meeting transcription, we require training data with punctuation +and casing. If we just require the spoken words in an un-formatted structure, neither punctuation nor casing are necessary. +In this case, we can either pick a dataset without punctuation or casing, or pick one that has punctuation and casing and +then subsequently remove them from the target text through pre-processing. + +## A summary of datasets on the Hub + +Here is a summary of the most popular English speech recognition datasets on the Hugging Face Hub: + +| Dataset | Train Hours | Domain | Speaking Style | Casing | Punctuation | License | Recommended Use | +|-----------------------------------------------------------------------------------------|-------------|-----------------------------|-----------------------|--------|-------------|-----------------|----------------------------------| +| [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | 960 | Audiobook | Narrated | ❌ | ❌ | CC-BY-4.0 | Academic benchmarks | +| [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) | 3000 | Wikipedia | Narrated | ✅ | ✅ | CC0-1.0 | Non-native speakers | +| [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | 540 | European Parliament | Oratory | ❌ | ✅ | CC0 | Non-native speakers | +| [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | 450 | TED talks | Oratory | ❌ | ❌ | CC-BY-NC-ND 3.0 | Technical topics | +| [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | 10000 | Audiobook, podcast, YouTube | Narrated, spontaneous | ❌ | ✅ | apache-2.0 | Robustness over multiple domains | +| [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech) | 5000 | Financial meetings | Oratory, spontaneous | ✅ | ✅ | User Agreement | Fully formatted transcriptions | +| [Earnings-22](https://huggingface.co/datasets/revdotcom/earnings22) | 119 | Financial meetings | Oratory, spontaneous | ✅ | ✅ | CC-BY-SA-4.0 | Diversity of accents | +| [AMI](https://huggingface.co/datasets/edinburghcstr/ami) | 100 | Meetings | Spontaneous | ✅ | ✅ | CC-BY-4.0 | Noisy speech conditions | + +This table serves as a reference for selecting a dataset based on your criterion. Below is an equivalent table for +multilingual speech recognition. 
Note that we omit the train hours column, since this varies depending on the language +for each dataset, and replace it with the number of languages per dataset: + +| Dataset | Languages | Domain | Speaking Style | Casing | Punctuation | License | Recommended Usage | +|-----------------------------------------------------------------------------------------------|-----------|---------------------------------------|----------------|--------|-------------|-----------|-------------------------| +| [Multilingual LibriSpeech](https://huggingface.co/datasets/facebook/multilingual_librispeech) | 6 | Audiobooks | Narrated | ❌ | ❌ | CC-BY-4.0 | Academic benchmarks | +| [Common Voice 13](https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0) | 108 | Wikipedia text & crowd-sourced speech | Narrated | ✅ | ✅ | CC0-1.0 | Diverse speaker set | +| [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | 15 | European Parliament recordings | Spontaneous | ❌ | ✅ | CC0 | European languages | +| [FLEURS](https://huggingface.co/datasets/google/fleurs) | 101 | European Parliament recordings | Spontaneous | ❌ | ❌ | CC-BY-4.0 | Multilingual evaluation | + +For a detailed breakdown of the audio datasets covered in both tables, refer to the blog post [A Complete Guide to Audio Datasets](https://huggingface.co/blog/audio-datasets#a-tour-of-audio-datasets-on-the-hub). +While there are over 180 speech recognition datasets on the Hub, it may be possible that there isn't a dataset that matches +your needs. In this case, it's also possible to use your own audio data with 🤗 Datasets. To create a custom audio dataset, +refer to the guide [Create an audio dataset](https://huggingface.co/docs/datasets/audio_dataset). When creating a custom +audio dataset, consider sharing the final dataset on the Hub so that others in the community can benefit from your +efforts - the audio community is inclusive and wide-ranging, and others will appreciate your work as you do theirs. + +Alright! Now that we've gone through all the criterion for selecting an ASR dataset, let's pick one for the purpose of this tutorial. +We know that Whisper already does a pretty good job at transcribing data in high-resource languages (such as English and Spanish), so +we'll focus ourselves on low-resource multilingual transcription. We want to retain Whisper's ability to predict punctuation and casing, +so it seems from the second table that Common Voice 13 is a great candidate dataset! + +## Common Voice 13 + +Common Voice 13 is a crowd-sourced dataset where speakers record text from Wikipedia in various languages. It forms part of +the Common Voice series, a collection of Common Voice datasets released by Mozilla Foundation. At the time of writing, +Common Voice 13 is the latest edition of the dataset, with the most languages and hours per language out of any release to date. + +We can get the full list of languages for the Common Voice 13 dataset by checking-out the dataset page on the Hub: +[mozilla-foundation/common_voice_13_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0). +The first time you view this page, you'll be asked to accept the terms of use. After that, you'll be given full access to the dataset. + +Once we've provided authentication to use the dataset, we'll be presented with the dataset preview. The dataset preview +shows us the first 100 samples of the dataset for each language. What's more, it's loaded up with audio samples ready for us +to listen to in real time. 
For this Unit, we'll select [_Dhivehi_](https://en.wikipedia.org/wiki/Maldivian_language) +(or _Maldivian_), an Indo-Aryan language spoken in the South Asian island country of the Maldives. While we're selecting +Dhivehi for this tutorial, the steps covered here apply to any one of the 108 languages in the Common Voice 13 dataset, and +more generally to any one of the 180+ audio datasets on the Hugging Face Hub, so there's no restriction on language or dialect. + +We can select the Dhivehi subset of Common Voice 13 by setting the subset to `dv` using the dropdown menu (`dv` being the language +identifier code for Dhivehi): + + + +If we hit the play button on the first sample, we can listen to the audio and see the corresponding text. Have a scroll +through the samples for the train and test sets to get a better feel for the audio and text data that we're dealing with. +You can tell from the intonation and style that the recordings are taken from narrated speech. You'll also likely notice +the large variation in speakers and recording quality, a common trait of crowd-sourced data. + +The Dataset Preview is a brilliant way of experiencing audio datasets before committing to using them. You can pick any +dataset on the Hub, scroll through the samples and listen to the audio for the different subsets and splits, gauging whether +it's the right dataset for your needs. Once you've selected a dataset, it's trivial to load the data so that you can +start using it. + +Now, I personally don't speak Dhivehi, and expect the vast majority of readers not to either! To know if our fine-tuned model +is any good, we'll need a rigorous way of _evaluating_ it on unseen data and measuring its transcription accuracy. +We'll cover exactly this in the next section! diff --git a/chapters/en/chapter5/evaluation.mdx b/chapters/en/chapter5/evaluation.mdx new file mode 100644 index 00000000..fb26f069 --- /dev/null +++ b/chapters/en/chapter5/evaluation.mdx @@ -0,0 +1,310 @@ +# Evaluation metrics for ASR + +If you're familiar with the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) from NLP, the +metrics for assessing speech recognition systems will be familiar! Don't worry if you're not, we'll go through the +explanations start-to-finish to make sure you know the different metrics and understand what they mean. + +When assessing speech recognition systems, we compare the system's predictions to the target text transcriptions, +annotating any errors that are present. We categorise these errors into one of three categories: +1. Substitutions (S): where we transcribe the **wrong word** in our prediction ("sit" instead of "sat") +2. Insertions (I): where we add an **extra word** in our prediction +3. Deletions (D): where we **remove a word** in our prediction + +These error categories are the same for all speech recognition metrics. What differs is the level at which we compute +these errors: we can either compute them on the _word level_ or on the _character level_. + +We'll use a running example for each of the metric definitions. Here, we have a _ground truth_ or _reference_ text sequence: + +```python +reference = "the cat sat on the mat" +``` + +And a predicted sequence from the speech recognition system that we're trying to assess: + +```python +prediction = "the cat sit on the" +``` + +We can see that the prediction is pretty close! But some words are not quite right. 
We'll evaluate this prediction +against the reference for the three most popular speech recognition metrics and see what sort of numbers we get for each. + +## Word Error Rate +The _word error rate (WER)_ metric is the 'de facto' metric for speech recognition. It calculates substitutions, +insertions and deletions on the _word level_. This means errors are annotated on a word-by-word basis. Take our example: + + +| Reference: | the | cat | sat | on | the | mat | +|-------------|-----|-----|---------|-----|-----|-----| +| Prediction: | the | cat | **sit** | on | the | | | +| Label: | ✅ | ✅ | S | ✅ | ✅ | D | + +Here, we have: +* 1 substitution ("sit" instead of "sat") +* 0 insertions +* 1 deletion ("mat" is missing) + +This gives 2 errors in total. To get our error rate, we divide the number of errors by the total number of words in our +reference (N), which for this example is 6: + +\begin{aligned} +WER &= \frac{S + I + D}{N} \\ +&= \frac{1 + 0 + 1}{6} \\ +&= 0.333 +\end{aligned} + +Alright! So we have a WER of 0.333, or 33.3%. Notice how the word "sit" only has one character that is wrong, but the +entire word is marked incorrect! This is a defining feature of the WER: spelling errors are penalised heavily, no matter +how minor they are. + +The WER is defined such that _lower is better_: a lower WER means there are fewer errors in our prediction, so a perfect +speech recognition system would have a WER of zero (no errors). + +Let's see how we can compute the WER using 🤗 Evaluate. We'll need two packages to compute our WER metric: 🤗 Evaluate +for the API interface, and JIWER to do the heavy lifting of running the calculation: +``` +pip install --upgrade evaluate jiwer +``` + +Great! We can now load up the WER metric and compute the figure for our example: + +```python +from evaluate import load + +wer_metric = load("wer") + +wer = wer_metric.compute(references=[reference], predictions=[prediction]) + +print(wer) +``` +**Print Output:** +``` +0.3333333333333333 +``` + +0.33, or 33.3%, as expected! We now know what's going on under-the-hood with this WER calculation. + +Now, here's something that's quite confusing! What do you think the upper limit of the WER is? You would expect it to be +1 or 100% right? Nuh uh! Since the WER is the ratio of errors to number of words (N), there is no upper limit on the WER! +Let's take an example were we predict 10 words and the target only has 2 words. If all of our predictions were wrong (10 errors), +we'd have a WER of 10 / 2 = 5, or 500%! This is something to bear in mind if you train an ASR system and see a WER of over +100%. Although if you're seeing this, something has likely gone wrong... 😅 + +## Word Accuracy + +We can flip the WER around to give us a metric where _higher is better_. Rather than measuring the word error rate, +we can measure the _word accuracy (WAcc)_ of our system: + +\begin{equation} +WAcc = 1 - WER \nonumber +\end{equation} + +The WAcc is also measured on the word-level, it's just the WER reformulated as an accuracy metric rather than an error +metric. The WAcc is very infrequently quoted in the speech literature - we think of our system predictions in terms of +word errors, and so prefer error rate metrics that are more associated with these error type annotations. + +## Character Error Rate +It seems a bit unfair that we marked the entire word for "sit" wrong when in fact only one letter was incorrect! +That's because we were evaluating our system on the word level, thereby annotating errors on a word-by-word basis. 
+The _character error rate (CER)_ assesses systems on the _character level_. This means we divide up our words into their +individual characters, and annotate errors on a character-by-character basis: + +| Reference: | t | h | e | | c | a | t | | s | a | t | | o | n | | t | h | e | | m | a | t | +|-------------|-----|-----|-----|-----|-----|-----|-----|-----|-----|-------|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----| +| Prediction: | t | h | e | | c | a | t | | s | **i** | t | | o | n | | t | h | e | | | | | +| Label: | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | S | ✅ | | ✅ | ✅ | | ✅ | ✅ | ✅ | | D | D | D | + +We can see now that for the word "sit", the "s" and "t" are marked as correct! It's only the "i" which is labelled as a +substitution error (S). Thus, we reward our system for the partially correct prediction 🤝 + +In our example, we have 1 character substitution, 0 insertions, and 3 deletions. In total, we have 14 characters. So, our CER is: + +\begin{aligned} +CER &= \frac{S + I + D}{N} \\ +&= \frac{1 + 0 + 3}{14} \\ +&= 0.286 +\end{aligned} + +Right! We have a CER of 0.286, or 28.6%! Notice how this is lower than our WER - we penalised the spelling error much less. + + + +```python +cer_metric = load("cer") + +cer = cer_metric.compute(references=[reference], predictions=[prediction]) + +print(cer) +``` +**Print Output:** +``` +TODO(SG) +``` + +## Which metric should I use? + +In general, the WER is used far more than the CER for assessing speech systems. This is because the WER requires systems +to have greater understanding of the context of the predictions. In our example, "sit" is in the wrong tense. A system +that understands the relationship between the verb and tense of the sentence would have predicted the correct verb tense +of "sat". We want to encourage this level of understanding from our speech systems. So although the WER is less forgiving than +the CER, it's also more conducive to the kinds of intelligible systems we want to develop. Therefore, we typically use +the WER and would encourage you to as well! However, there are circumstances where it is not possible to use the WER. +Certain languages, such as Mandarin and Japanese, have no notion of 'words', and so the WER is meaningless. Here, we revert +to using the CER. + +In our example, we only used one sentence when computing the WER. We would typically use an entire test set consisting +of several thousand sentences when evaluating a real system. When evaluating over multiple sentences, we aggregate S, I, D +and N across all sentences, and then compute the WER according to the formula defined above. This gives a better estimate +of the WER for unseen data. + +## Normalisation + +If we train an ASR model on data with punctuation and casing, it will learn to predict casing and punctuation in its +transcriptions. This is great when we want to use out model for actual speech recognition applications, such as +transcribing meetings or dictation, since the predicted transcriptions will be fully formatted with casing and punctuation, +a style referred to as _orthographic_. + +However, we also have the option of _normalising_ the dataset to remove any casing and punctuation. Normalising the +dataset makes the speech recognition task easier: the model no longer needs to distinguish between upper and lower case +characters, or have to predict punctuation from the audio data alone (e.g. what sound does a semi-colon make?). 
+Because of this, the word error rates are naturally lower (meaning the results are better). The Whisper paper demonstrates +the drastic effect that normalising transcriptions can have on WER results (_c.f._ Section 4.4 of the [Whisper paper](https://cdn.openai.com/papers/whisper.pdf)). +While we get lower WERs, the model isn't necessarily better for production. The lack of casing and punctuation makes the predicted +text from the model significantly harder to read. Take the example from the [previous section](#asr_models), where we ran +Wav2Vec2 and Whisper on the same audio sample from the LibriSpeech dataset. The Wav2Vec2 model predicts neither punctuation +nor casing, whereas Whisper predicts both. Comparing the transcriptions side-by-side, we see that the Whisper transcription +is far easier to read: + +``` +Wav2Vec2: HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAUS AND ROSE BEEF LOOMING BEFORE US SIMALYIS DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND +Whisper: He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind. +``` + +The Whisper transcription is orthographic and thus ready to go - it's formatted as we'd expect for a meeting transcription +or dictation script with both punctuation and casing. On the contrary, we would need to use additional post-processing +to restore punctuation and casing in our Wav2Vec2 predictions if we wanted to use it for downstream applications. + +There is a happy medium between normalising and not normalising: we can train our systems on orthographic transcriptions, +and then normalise the predictions and targets before computing the WER. This way, we train our systems to predict fully +formatted text, but also benefit from the WER improvements we get by normalising the transcriptions. + +The Whisper model was released with a normaliser that effectively handles the normalisation of casing, punctuation and +number formatting among others. Let's apply the normaliser to the Whisper transcriptions to demonstrate how we can +normalise them: + +```python +from transformers.models.whisper.english_normalizer import BasicTextNormalizer + +normalizer = BasicTextNormalizer() + +prediction = " He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind." +normalized_prediction = normalizer(prediction) + +normalized_prediction +``` +**Output:** +``` +' he tells us that at this festive season of the year with christmas and roast beef looming before us similarly is drawn from eating and its results occur most readily to the mind ' +``` + +Great! We can see that the text has been fully lower-cased and all punctuation removed. Let's now define the reference +transcription and then compute the normalised WER between the reference and target: + +```python +reference = "HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND" +normalized_referece = normalizer(reference) + +wer = wer_metric.compute(references=[normalized_referece], predictions=[normalized_prediction]) +wer +``` +**Output:** +``` +0.0625 +``` + +6.25% - that's about what we'd expect for the Whisper base model on the LibriSpeech validation set. 
As we see here, we've predicted an orthographic transcription, but benefited from the WER boost obtained by normalising the reference and
prediction prior to computing the WER.

The choice of how you normalise the transcriptions is ultimately down to your needs. We recommend training on
orthographic text and evaluating on normalised text to get the best of both worlds.

## Putting it all together

Alright! We've covered three topics so far in this Unit: pre-trained models, dataset selection and evaluation.
Let's have some fun and put them together in one end-to-end example 🚀 We're going to set ourselves up for the next
section on fine-tuning by evaluating the pre-trained Whisper model on the Common Voice 13 Dhivehi test set. We'll use
the WER number we get as a _baseline_ for our fine-tuning run, or a target number that we'll try and beat 🥊

First, we'll load the pre-trained Whisper model using the `pipeline()` class. This process will be extremely familiar by now!

```python
from transformers import pipeline
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
```

Next, we'll load the Dhivehi test split of Common Voice 13:

```python
from datasets import load_dataset

common_voice_test = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="test")
```

Evaluating over an entire dataset can be done in much the same way as over a single example - all we have to do is **loop**
over the predictions, rather than inferring just the first one. We wrap the `"audio"` column in a `KeyDataset` so that the
pipeline receives just the audio inputs, and not the other dataset columns:

```python
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

all_predictions = []
for prediction in tqdm(
    pipe(KeyDataset(common_voice_test, "audio"), max_new_tokens=256, generate_kwargs={"task": "transcribe"}),
    total=len(common_voice_test),
):
    all_predictions.append(prediction["text"])
```

And finally, we can compute the WER. Let's first compute the orthographic WER, i.e. the WER without any post-processing:

```python
from evaluate import load

wer_metric = load("wer")

all_references = common_voice_test["sentence"]
wer_ortho = 100 * wer_metric.compute(references=all_references, predictions=all_predictions)
wer_ortho
```
**Output:**
```
TODO(SG)
```

Next, we'll evaluate the normalised WER, i.e. the WER with normalisation post-processing. We have to filter out samples
that would be empty after normalisation, as otherwise the total number of words in the reference (N) would be zero, which
would give a divide-by-zero error in our calculation:

```python
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

normalizer = BasicTextNormalizer()

# compute the normalised WER
all_predictions_norm = [normalizer(pred) for pred in all_predictions]
all_references_norm = [normalizer(label) for label in all_references]

# filtering step: only evaluate the samples that correspond to non-empty references
all_predictions_norm = [all_predictions_norm[i] for i in range(len(all_predictions_norm)) if len(all_references_norm[i]) > 0]
all_references_norm = [all_references_norm[i] for i in range(len(all_references_norm)) if len(all_references_norm[i]) > 0]

wer = 100 * wer_metric.compute(references=all_references_norm, predictions=all_predictions_norm)

wer
```
**Output:**
```
TODO(SG)
```

Great! So the baseline model achieves an orthographic test WER of %, and a normalised WER of %. These are the numbers that
we want to try and beat when we fine-tune the model, in order to improve the Whisper model for Dhivehi.
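Since we'll want to repeat exactly this comparison after fine-tuning, it can be handy to bundle the two computations into a small helper function. This is just a convenience sketch that re-uses the ideas above - the function name is ours, not part of 🤗 Evaluate - so feel free to adapt it or skip it entirely:

```python
from evaluate import load
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

wer_metric = load("wer")
normalizer = BasicTextNormalizer()


def orthographic_and_normalised_wer(predictions, references):
    """Return (orthographic WER, normalised WER) as percentages."""
    wer_ortho = 100 * wer_metric.compute(references=references, predictions=predictions)

    # normalise both sides, then drop samples whose reference becomes empty
    # to avoid a divide-by-zero in the WER calculation
    pred_norm = [normalizer(pred) for pred in predictions]
    ref_norm = [normalizer(ref) for ref in references]
    kept = [(p, r) for p, r in zip(pred_norm, ref_norm) if len(r) > 0]
    pred_norm = [p for p, _ in kept]
    ref_norm = [r for _, r in kept]

    wer_norm = 100 * wer_metric.compute(references=ref_norm, predictions=pred_norm)
    return wer_ortho, wer_norm
```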
\ No newline at end of file diff --git a/chapters/en/chapter5/fine-tuning.mdx b/chapters/en/chapter5/fine-tuning.mdx new file mode 100644 index 00000000..274af1d6 --- /dev/null +++ b/chapters/en/chapter5/fine-tuning.mdx @@ -0,0 +1,526 @@ +# Fine-tuning the ASR model + +In this section, we'll cover a step-by-step guide on fine-tuning Whisper for speech recognition on the Common Voice 13 +dataset. We'll use the 'small' version of the model and a relatively lightweight dataset, enabling you to run fine-tuning +fairly quickly on any 16GB+ GPU with low disk space requirements, such as the 16GB T4 GPU provided in the Google Colab free +tier. + +Should you have a smaller GPU or encounter memory issues during training, you can follow the suggestions provided for +reducing memory usage. Conversely, should you have access to a larger GPU, you can amend the training arguments to maximise +your throughput. Thus, this guide is accessible regardless of your GPU specifications! + +Likewise, this guide outlines how to fine-tune the Whisper model for the Dhivehi language. However, the steps covered here +generalise to any language in the Common Voice dataset, and more generally to any ASR dataset on the Hugging Face Hub. +You can tweak the code to quickly switch to a language of your choice and fine-tune a Whisper model in your native tongue 🌍 + +Right! Now that's out the way, let's get started and kick-off our fine-tuning pipeline! + +## Prepare Environment + +We strongly advise you to upload model checkpoints directly the [Hugging Face Hub](https://huggingface.co/) while training. +The Hub provides: +- Integrated version control: you can be sure that no model checkpoint is lost during training. +- Tensorboard logs: track important metrics over the course of training. +- Model cards: document what a model does and its intended use cases. +- Community: an easy way to share and collaborate with the community! 🤗 + +Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted. +Find your Hub authentication token [here](https://huggingface.co/settings/tokens): + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +**Output:** +```bash +Login successful +Your token has been saved to /root/.huggingface/token +``` + +## Load Dataset + +Common Voice 13 contains approximately ten hours of labelled Dhivehi data, three of which is held-out test data. This is +extremely little data for fine-tuning, so we'll be relying on leveraging the extensive multilingual ASR knowledge acquired +by Whisper during pre-training for the low-resource Dhivehi language. + +Using 🤗 Datasets, downloading and preparing data is extremely simple. We can download and prepare the Common Voice 13 +splits in just one line of code. Since Dhivehi is very low-resource, we'll combine the `train` and `validation` splits +to give approximately 7 hours of training data. 
We'll use the 3 hours of `test` data as our held-out test set: + +```python +from datasets import load_dataset, DatasetDict + +common_voice = DatasetDict() + +common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="train+validation") +common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="test") + +print(common_voice) +``` + +**Output:** +``` +DatasetDict({ + train: Dataset({ + features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'], + num_rows: 4904 + }) + test: Dataset({ + features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'], + num_rows: 2212 + }) +}) +``` + + + You can change the language identifier from `"dv"` to a language identifier of your choice. To see all possible languages + in Common Voice 13, check out the dataset card on the Hugging Face Hub: [mozilla-foundation/common_voice_13_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0) + + +Most ASR datasets only provide input audio samples (`audio`) and the corresponding transcribed text (`sentence`). +Common Voice contains additional metadata information, such as `accent` and `locale`, which we can disregard for ASR. +Keeping the notebook as general as possible, we only consider the input audio and transcribed text for fine-tuning, +discarding the additional metadata information: + +```python +common_voice = common_voice.select_columns(["audio", "sentence"]) +``` + +## Feature Extractor, Tokenizer and Processor + +The ASR pipeline can be de-composed into three stages: + +1. The feature extractor which pre-processes the raw audio-inputs to log-mel spectrograms +2. The model which performs the sequence-to-sequence mapping +3. The tokenizer which post-processes the predicted tokens to text + +In 🤗 Transformers, the Whisper model has an associated feature extractor and tokenizer, called [WhisperFeatureExtractor](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperFeatureExtractor) and [WhisperTokenizer](https://huggingface.co/docs/transformers/main/model_doc/whisper#transformers.WhisperTokenizer) +respectively. To make our lives simple, these two objects are wrapped under a single class, called the [WhisperProcessor](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperProcessor). +We can call the WhisperProcessor to perform both the audio pre-processing and the text token post-processing. In doing +so, we only need to keep track of two objects during training: the processor and the model. + +When performing multilingual fine-tuning, we need to set the `"language"` and `"task"` when instantiating the processor. +The `"language"` should be set to the source audio language, and the task to `"transcribe"` for speech recognition or +`"translate"` for speech translation. These arguments modify the behaviour of the tokenizer, and should be set correctly +to ensure the target labels are encoded properly. + +We can see all possible languages supported by Whisper by importing the list of languages: + +``` +from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE + +TO_LANGUAGE_CODE +``` + +If you scroll through this list, you'll notice that many languages are present, but Dhivehi is one of few that is not! +This means that Whisper was not pre-trained on Dhivehi. 
However, this doesn't mean that we can't fine tune Whisper on it. +In doing so, we'll be teaching Whisper a new language, one that the pre-trained checkpoint does not support. That's pretty +cool, right! + +When you fine-tune it on a new language, Whisper does a good job at leveraging its knowledge of the other 96 languages +it’s pre-trained on. Largely speaking, all modern languages will be linguistically similar to at least one of the +96 languages Whisper already knows, so we'll fall under this paradigm of cross-lingual knowledge representation. + +What we need to do to fine-tune Whisper on a new language is find the language **most similar** that Whisper was +pre-trained on. The Wikipedia article for Dhivehi states that Dhivehi is closely related to the Sinhalese language of Sri Lanka. +If we check the language codes again, we can see that Sinhalese is present in the Whisper language set, +so we can safely set our language argument to `"sinhalese"`. + +Right! We'll load our processor from the pre-trained checkpoint, setting the language to `"sinhalese"` and task to `"transcribe"` +as explained above: + +```python +from transformers import WhisperProcessor + +processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="sinhalese", task="transcribe") +``` + +It's worth reiterating that in most circumstances, you'll find that the language you want to fine-tune on is in the set of +pre-training languages, in which case you can simply set the language directly as your source audio language! Note that +both of these arguments should be omitted for English-only fine-tuning, where there is only one option for the language +(`"English"`) and task (`"transcribe"`). + +## Pre-Process the Data + +Let's have a look at the dataset features. Pay particular attention to the `"audio"` column - this details the sampling +rate of our audio inputs: + +```python +common_voice["train"].features +``` +**Output:** +``` +{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None), + 'sentence': Value(dtype='string', id=None)} +``` +Since our input audio is sampled at 48kHz, we need to _downsample_ it to 16kHz prior to passing it to the Whisper feature +extractor, 16kHz being the sampling rate expected by the Whisper model. + +We'll set the audio inputs to the correct sampling rate using dataset's [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column) +method. This operation does not change the audio in-place, but rather signals to datasets to resample audio samples +on-the-fly when they are loaded: + +```python +from datasets import Audio + +sampling_rate = processor.feature_extractor.sampling_rate +common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate)) +``` + +Now we can write a function to prepare our data ready for the model: + +1. We load and resample the audio data on a sample-by-sample basis by calling `sample["audio"]`. As explained above, 🤗 Datasets performs any necessary resampling operations on the fly. +2. We use the feature extractor to compute the log-mel spectrogram input features from our 1-dimensional audio array. +3. We encode the transcriptions to label ids through the use of the tokenizer. 
+ +```python +def prepare_dataset(example): + audio = example["audio"] + + example = processor( + audio=audio["array"], + sampling_rate=audio["sampling_rate"], + text=example["sentence"], + ) + + # compute input length of audio sample in seconds + example["input_length"] = len(audio["array"]) / audio["sampling_rate"] + + return example +``` + +We can apply the data preparation function to all of our training examples using 🤗 Datasets' `.map` method. We'll +remove the columns from the raw training data (the audio and text), leaving just the columns returned by the +`prepare_dataset` function: + +```python +common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2) +``` + +Finally, we filter any training data with audio samples longer than 30s. These samples would otherwise be truncated by +the Whisper feature-extractor which could affect the stability of training. We define a function that returns `True` for +samples that are less than 30s, and `False` for those that are longer: + +```python +max_input_length = 30.0 + +def is_audio_in_length_range(length): + return length < max_input_length +``` + +We apply our filter function to all samples of our training dataset through 🤗 Datasets' `.filter` method: + +```python +common_voice["train"] = common_voice["train"].filter( + is_audio_in_length_range, + input_columns=["input_length"], +) +``` + +Let's check how much training data we removed through this filtering step: + +```python +common_voice["train"] +``` +**Output** +``` +Dataset({ + features: ['input_features', 'labels', 'input_length'], + num_rows: 4904 +}) +``` + +Alright! In this case we actually have the same number of samples as before, so there were no samples longer than 30s. +This might not be the case if you switch languages, so it's best to keep this filter step in-place for robustness. With +that, we have our data fully prepared for training! Let's continue and take a look at how we can use this data to fine-tune +Whisper. + +## Training and Evaluation + +Now that we've prepared our data, we're ready to dive into the training pipeline. +The [🤗 Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer) +will do much of the heavy lifting for us. All we have to do is: + +- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model. + +- Evaluation metrics: during evaluation, we want to evaluate the model using the word error rate (WER) metric. We need to define a `compute_metrics` function that handles this computation. + +- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training. + +- Define the training arguments: these will be used by the 🤗 Trainer in constructing the training schedule. + +Once we've fine-tuned the model, we will evaluate it on the test data to verify that we have correctly trained it +to transcribe speech in Dhivehi. + +### Define a Data Collator + +The data collator for a sequence-to-sequence speech model is unique in the sense that it treats the `input_features` +and `labels` independently: the `input_features` must be handled by the feature extractor and the `labels` by the tokenizer. + +The `input_features` are already padded to 30s and converted to a log-Mel spectrogram of fixed dimension, so all we +have to do is convert them to batched PyTorch tensors. We do this using the feature extractor's `.pad` method with +`return_tensors=pt`. 
Note that no additional padding is applied here since the inputs are of fixed dimension, the +`input_features` are simply converted to PyTorch tensors. + +On the other hand, the `labels` are un-padded. We first pad the sequences to the maximum length in the batch using +the tokenizer's `.pad` method. The padding tokens are then replaced by `-100` so that these tokens are **not** taken +into account when computing the loss. We then cut the start of transcript token from the beginning of the label sequence +as we append it later during training. + +We can leverage the `WhisperProcessor` we defined earlier to perform both the feature extractor and the tokenizer operations: + +```python +import torch + +from dataclasses import dataclass +from typing import Any, Dict, List, Union + +@dataclass +class DataCollatorSpeechSeq2SeqWithPadding: + processor: Any + + def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: + # split inputs and labels since they have to be of different lengths and need different padding methods + # first treat the audio inputs by simply returning torch tensors + input_features = [{"input_features": feature["input_features"][0]} for feature in features] + batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt") + + # get the tokenized label sequences + label_features = [{"input_ids": feature["labels"]} for feature in features] + # pad the labels to max length + labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") + + # replace padding with -100 to ignore loss correctly + labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + + # if bos token is appended in previous tokenization step, + # cut bos token here as it's append later anyways + if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item(): + labels = labels[:, 1:] + + batch["labels"] = labels + + return batch +``` + +We can now initialise the data collator we've just defined: + +```python +data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor) +``` + +Onwards! + +### Evaluation Metrics + +Next, we define the evaluation metric we'll use on our evaluation set. We'll use the Word Error Rate (WER) metric introduced +in the section on [Evaluation](#evaluation), the 'de-facto' metric for assessing ASR systems. + +We'll load the WER metric from 🤗 Evaluate: + +```python +import evaluate + +metric = evaluate.load("wer") +``` + +We then simply have to define a function that takes our model predictions and returns the WER metric. This function, called +`compute_metrics`, first replaces `-100` with the `pad_token_id` in the `label_ids` (undoing the step we applied in the +data collator to ignore padded tokens correctly in the loss). It then decodes the predicted and label ids to strings. Finally, +it computes the WER between the predictions and reference labels. Here, we have the option of evaluating with the 'normalised' +transcriptions and predictions, which have punctuation and casing removed. We recommend you follow this to benefit +from the WER improvement obtained by normalising the transcriptions. 
+ +```python +from transformers.models.whisper.english_normalizer import BasicTextNormalizer + +normalizer = BasicTextNormalizer() + +def compute_metrics(pred): + pred_ids = pred.predictions + label_ids = pred.label_ids + + # replace -100 with the pad_token_id + label_ids[label_ids == -100] = processor.tokenizer.pad_token_id + + # we do not want to group tokens when computing the metrics + pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True) + label_str = processor.batch_decode(label_ids, skip_special_tokens=True) + + # compute orthographic wer + wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str) + + # compute normalised WER + pred_str_norm = [normalizer(pred) for pred in pred_str] + label_str_norm = [normalizer(label) for label in label_str] + # filtering step to only evaluate the samples that correspond to non-zero references: + pred_str_norm = [pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0] + label_str_norm = [label_str_norm[i] for i in range(len(label_str_norm)) if len(label_str_norm[i]) > 0] + + wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm) + + return {"wer_ortho": wer_ortho, "wer": wer} +``` + +### Load a Pre-Trained Checkpoint + +Now let's load the pre-trained Whisper small checkpoint. Again, this is trivial through use of 🤗 Transformers! + +```python +from transformers import WhisperForConditionalGeneration + +model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small") +``` + +We'll set `use_cache` to `False` for training since we're using [gradient checkpointing](https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing) +and the two are incompatible. We'll also override two generation arguments to control the behaviour of the model during inference: +we'll force the language and task tokens during generation by setting the `language` and `task` arguments in the `generation_config` +(see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)). +We also re-enable cache for generate to speed-up inference time: + +```python +from functools import partial + +# disable cache during training since it's incompatible with gradient checkpointing +model.config.use_cache = False + +# set language and task for generation +model.generation_config.language = "sinhalese" +model.generation_config.task = "transcribe" + +# re-enable cache for inference +model.generate = partial(model.generate, use_cache=True) +``` + +## Define the Training Configuration + +In the final step, we define all the parameters related to training. Here, you can set the `max_steps` to train for longer. +For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments). 
+ +```python +from transformers import Seq2SeqTrainingArguments + +training_args = Seq2SeqTrainingArguments( + output_dir="./whisper-small-dv", # name on the HF Hub + per_device_train_batch_size=16, + gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size + learning_rate=1e-5, + warmup_steps=500, + max_steps=4000, + gradient_checkpointing=True, + fp16=True, + evaluation_strategy="steps", + per_device_eval_batch_size=8, + predict_with_generate=True, + generation_max_length=225, + save_steps=1000, + eval_steps=1000, + logging_steps=25, + report_to=["tensorboard"], + load_best_model_at_end=True, + metric_for_best_model="wer", + greater_is_better=False, + push_to_hub=True, +) +``` + + + If you do not want to upload the model checkpoints to the Hub, set `push_to_hub=False`. + + +We can forward the training arguments to the 🤗 Trainer along with our model, dataset, data collator and `compute_metrics` function: + +```python +from transformers import Seq2SeqTrainer + +trainer = Seq2SeqTrainer( + args=training_args, + model=model, + train_dataset=common_voice["train"], + eval_dataset=common_voice["test"], + data_collator=data_collator, + compute_metrics=compute_metrics, + tokenizer=processor, +) +``` + +And with that, we're ready to start training! + +### Training + +To launch training, simply execute: + +```python +trainer.train() +``` + +Training will take approximately 5-10 hours depending on your GPU or the one allocated to the Google Colab. Depending on +your GPU, it is possible that you will encounter a CUDA `"out-of-memory"` error when you start training. In this case, +you can reduce the `per_device_train_batch_size` incrementally by factors of 2 and employ [`gradient_accumulation_steps`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments.gradient_accumulation_steps) +to compensate. + +**Output:** + +| Step | Training Loss | Epoch | Validation Loss | WER | +|:----:|:-------------:|:-----:|:---------------:|:-----:| + +Our best WER is X% - not bad for 7h of training data (I hope...)! The big question is how this +compares to other ASR systems. For that, we can view the autoevaluate [leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards?dataset=mozilla-foundation%2Fcommon_voice_13_0&only_verified=0&task=automatic-speech-recognition&config=hi&split=test&metric=wer), +a leaderboard that categorises models by language and dataset, and subsequently ranks them according to their WER. + +We see... + +We can automatically submit our checkpoint to the leaderboard when we push the training results to the Hub - we simply +have to set the appropriate key-word arguments (kwargs). You can change these values to match your dataset, language and +model name accordingly: + +```python +kwargs = { + "dataset_tags": "mozilla-foundation/common_voice_13_0", + "dataset": "Common Voice 13", # a 'pretty' name for the training dataset + "language": "dv", + "model_name": "Whisper Small Dv - Sanchit Gandhi", # a 'pretty' name for your model + "finetuned_from": "openai/whisper-small", + "tasks": "automatic-speech-recognition", +} +``` + +The training results can now be uploaded to the Hub. To do so, execute the `push_to_hub` command: + +```python +trainer.push_to_hub(**kwargs) +``` + +This will save the training logs and model weights under `"your-username/the-name-you-picked"`. For this example, check +out the upload at `sanchit-gandhi/whisper-small-dv`. 
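If you'd like to recompute the final metrics on the held-out test set once training has finished, rather than relying on the values logged during training, one option is to call the trainer's `evaluate` method directly. A minimal sketch, assuming the `trainer` defined above is still in memory:

```python
# re-run evaluation on the held-out test set; since we set `load_best_model_at_end=True`,
# the best checkpoint (by normalised WER) is already loaded at this point
eval_results = trainer.evaluate()

print(eval_results)  # includes eval_loss, eval_wer_ortho and eval_wer
```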
+ +While the fine-tuned model yields satisfactory results on the Common Voice 13 Dhivehi test data, it is by no means optimal. +The purpose of this guide is to demonstrate how to fine-tune an ASR model using the 🤗 Trainer for multilingual speech +recognition. The results could likely be improved by optimising the training hyperparameters, such as _learning rate_ and +_dropout_, and using a larger pre-trained checkpoint (`medium` or `large`). + +## Sharing Your Model + +You can now share this model with anyone using the link on the Hub. They can load it with the identifier `"your-username/the-name-you-picked"` +directly into the `pipeline()` object. For instance, to load the fine-tuned checkpoint ["sanchit-gandhi/whisper-small-dv"](https://huggingface.co/sanchit-gandhi/whisper-small-dv): + +```python +from transformers import pipeline + +pipe = pipeline("automatic-speech-recognition", model="sanchit-gandhi/whisper-small-dv") +``` + +## Conclusion + +In this section, we covered a step-by-step guide on fine-tuning the Whisper model for speech recognition 🤗 Datasets, +Transformers and the Hugging Face Hub. We first loaded the Dhivehi subset of the Common Voice 13 dataset and pre-processed +it by computing log-mel spectrograms and tokenising the text. We then defined a data collator, evaluation metric and +training arguments, before using the 🤗 Trainer to train and evaluate our model. We finished by uploading the fine-tuned +model to the Hugging Face Hub, and showcased how to share and use it with the `pipeline()` class. + +If you followed through to this point, you should now have a fine-tuned checkpoint for speech recognition, well done! 🥳 +Even more importantly, you're equipped with all the tools you need to fine-tune the Whisper model on any speech recognition +dataset or domain. So what are you waiting for! Pick one of the datasets covered in the section [Choosing a Dataset](./choosing_dataset) +or select a dataset of your own, and see whether you can get state-of-the-art performance! The leaderboard is waiting for you... diff --git a/chapters/unpublished/chapter5/hands_on.mdx b/chapters/en/chapter5/hands_on.mdx similarity index 100% rename from chapters/unpublished/chapter5/hands_on.mdx rename to chapters/en/chapter5/hands_on.mdx diff --git a/chapters/en/chapter5/introduction.mdx b/chapters/en/chapter5/introduction.mdx new file mode 100644 index 00000000..c27ca526 --- /dev/null +++ b/chapters/en/chapter5/introduction.mdx @@ -0,0 +1,40 @@ +# What you'll learn and what you'll build + +In this section, we’ll take a look at how Transformers can be used to convert spoken speech into text, a task known _speech recognition_. + +
+ Diagram of speech to text +
+ +Speech recognition, also known as automatic speech recognition (ASR) or speech-to-text (STT), is one of the most popular +and exciting spoken language processing tasks. It’s used in a wide range of applications, including dictation, voice assistants, +video captioning and meeting transcriptions. + +You’ve probably made use of a speech recognition system many times before without realising! Consider the digital +assistant in your smartphone device (Siri, Google Assistant, Alexa). When you use these assistants, the first thing that +they do is transcribe your spoken speech to written text, ready to be used for any downstream tasks (such as finding you +the weather 🌤️). + +Have a play with the speech recognition demo below. You can either record yourself using your microphone, or drag and +drop an audio sample for transcription: + + + +Speech recognition is a challenging task as it requires joint knowledge of audio and text. The input audio might have +lots of background noise and be spoken by speakers with different accents, making it difficult to pick out the spoken +speech. The written text might have characters which don’t have an acoustic sound, such as punctuation, which are difficult +to infer from audio alone. These are all hurdles we have to tackle when building effective speech recognition systems! + +Now that we’ve defined our task, we can begin looking into speech recognition in more detail. By the end of this Unit, +you'll have a good fundamental understanding of the different pre-trained speech recognition models available and how to +use them with the 🤗 Transformers library. You'll also know the procedure for fine-tuning an ASR model on a domain or +language of choice, enabling you to build a performant system for whatever task you encounter. You'll be able to showcase +your model to your friends and family by building a live demo, one that takes any spoken speech and converts it to text! 
+ +Specifically, we’ll cover: + +* [Pre-trained models for speech recognition](#asr_models) +* [Choosing a dataset](#choosing_dataset) +* [Evaluation and metrics for speech recognition](#evaluation) +* [How to fine-tune an ASR system with the Trainer API](#fine-tuning) +* [Building a demo](#demo) diff --git a/chapters/unpublished/chapter5/quiz.mdx b/chapters/en/chapter5/quiz.mdx similarity index 100% rename from chapters/unpublished/chapter5/quiz.mdx rename to chapters/en/chapter5/quiz.mdx diff --git a/chapters/unpublished/chapter5/supplemental_reading.mdx b/chapters/en/chapter5/supplemental_reading.mdx similarity index 100% rename from chapters/unpublished/chapter5/supplemental_reading.mdx rename to chapters/en/chapter5/supplemental_reading.mdx diff --git a/chapters/unpublished/chapter5/asr_models.mdx b/chapters/unpublished/chapter5/asr_models.mdx deleted file mode 100644 index fe3ef1a6..00000000 --- a/chapters/unpublished/chapter5/asr_models.mdx +++ /dev/null @@ -1 +0,0 @@ -# Pre-trained models for automatic speech recognition \ No newline at end of file diff --git a/chapters/unpublished/chapter5/choosing_dataset.mdx b/chapters/unpublished/chapter5/choosing_dataset.mdx deleted file mode 100644 index e2a0fdbb..00000000 --- a/chapters/unpublished/chapter5/choosing_dataset.mdx +++ /dev/null @@ -1 +0,0 @@ -# Choosing a dataset \ No newline at end of file diff --git a/chapters/unpublished/chapter5/evaluation.mdx b/chapters/unpublished/chapter5/evaluation.mdx deleted file mode 100644 index 79f5854b..00000000 --- a/chapters/unpublished/chapter5/evaluation.mdx +++ /dev/null @@ -1 +0,0 @@ -# Evaluation metrics for ASR \ No newline at end of file diff --git a/chapters/unpublished/chapter5/fine-tuning.mdx b/chapters/unpublished/chapter5/fine-tuning.mdx deleted file mode 100644 index 7b59286d..00000000 --- a/chapters/unpublished/chapter5/fine-tuning.mdx +++ /dev/null @@ -1 +0,0 @@ -# Fine-tuning the ASR model \ No newline at end of file diff --git a/chapters/unpublished/chapter5/introduction.mdx b/chapters/unpublished/chapter5/introduction.mdx deleted file mode 100644 index 538dcf0d..00000000 --- a/chapters/unpublished/chapter5/introduction.mdx +++ /dev/null @@ -1 +0,0 @@ -# What you'll learn and what you'll build \ No newline at end of file diff --git a/chapters/unpublished/chapter5/preprocessing_data.mdx b/chapters/unpublished/chapter5/preprocessing_data.mdx deleted file mode 100644 index 0e3982a9..00000000 --- a/chapters/unpublished/chapter5/preprocessing_data.mdx +++ /dev/null @@ -1 +0,0 @@ -# Preprocessing data \ No newline at end of file diff --git a/chapters/unpublished/chapter5/speaker_diarization.mdx b/chapters/unpublished/chapter5/speaker_diarization.mdx deleted file mode 100644 index e51a5ce4..00000000 --- a/chapters/unpublished/chapter5/speaker_diarization.mdx +++ /dev/null @@ -1 +0,0 @@ -# Automatic speech recognition with speaker diarization \ No newline at end of file From 9686d232968a0723b5268fbf79b22ae63e2749ef Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 26 Jun 2023 18:18:26 +0100 Subject: [PATCH 02/24] zero-shot eval --- chapters/en/chapter5/evaluation.mdx | 113 +++++++++++++++++++++++----- 1 file changed, 95 insertions(+), 18 deletions(-) diff --git a/chapters/en/chapter5/evaluation.mdx b/chapters/en/chapter5/evaluation.mdx index fb26f069..ea678fcb 100644 --- a/chapters/en/chapter5/evaluation.mdx +++ b/chapters/en/chapter5/evaluation.mdx @@ -213,7 +213,9 @@ transcription and then compute the normalised WER between the reference and targ reference = "HE TELLS US THAT 
AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND" normalized_referece = normalizer(reference) -wer = wer_metric.compute(references=[normalized_referece], predictions=[normalized_prediction]) +wer = wer_metric.compute( + references=[normalized_referece], predictions=[normalized_prediction] +) wer ``` **Output:** @@ -236,34 +238,92 @@ section on fine-tuning by evaluating the pre-trained Whisper model on the Common the WER number we get as a _baseline_ for our fine-tuning run, or a target number that we'll try and beat 🥊 First, we'll load the pre-trained Whisper model using the `pipeline()` class. This process will be extremely familiar by now! +The only thing new we'll do is load the model in half-precision (float16) if running on a GPU - this will speed up +inference at almost no cost to WER accuracy. ```python from transformers import pipeline import torch -device = "cuda:0" if torch.cuda.is_available() else "cpu" -pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device) +if torch.cuda.is_available(): + device = "cuda:0" + torch_dtype = torch.float16 +else: + device = "cpu" + torch_dtype = torch.float32 + +pipe = pipeline( + "automatic-speech-recognition", + model="openai/whisper-small", + torch_dtype=torch_dtype, + device=device, +) ``` -Next, we'll load the Dhivehi test split of Common Voice 13: +Next, we'll load the Dhivehi test split of Common Voice 13. You'll remember from the previous section that the Common +Voice 13 is *gated*, meaning we had to agree to the dataset terms of use before gaining access to the dataset. We can +now link our Hugging Face account to our notebook, so that we have access to the dataset from the machine we're currently +using. + +Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted. +Find your Hub authentication token [here](https://huggingface.co/settings/tokens) and enter it when prompted: + +```python +from huggingface_hub import notebook_login + +notebook_login() +``` + +Great! Once we've linked the notebook to our Hugging Face account, we can proceed with downloading the Common Voice +dataset. This will take a few minutes to download and pre-process, fetching the data from the Hugging Face Hub and +preparing it automatically on your notebook: ```python from datasets import load_dataset -common_voice_test = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="test") +common_voice_test = load_dataset( + "mozilla-foundation/common_voice_13_0", "dv", split="test" +) ``` Evaluating over an entire dataset can be done in much the same way as over a single example - all we have to do is **loop** -over the predictions, rather than inferring just the first one: +over the input audios, rather than inferring just the first one. We first define a helper function that yields examples +from our dataset using a generator object, and then pass the generator through the pipeline to get our predictions and +references. 
The following code cell will take approximately five minutes if running on a GPU with half-precision, peaking +at 12GB memory: ```python from tqdm import tqdm + +def data(dataset): + """Helper function that wraps a dataset as a generator object for compatibility with pipeline""" + for i, item in enumerate(dataset): + yield {**item["audio"], "reference": item["sentence"]} + + all_predictions = [] -for prediction in tqdm(pipe(common_voice_test, max_new_tokens=256, generate_kwargs={"task": "transcribe"})): - all_predictions.append(prediction["text"]) +all_references = [] + +# run streamed inference +for out in tqdm( + pipe( + data(common_voice_test), + max_new_tokens=128, + generate_kwargs={"task": "transcribe", "language": "sinhalese"}, + batch_size=32, + ), + total=len(common_voice_test), +): + all_predictions.append(out["text"]) + all_references.append(out["reference"][0]) ``` + + If you experience a CUDA out-of-memory (OOM) when running the above cell, incrementally reduce the `batch_size` by + factors of 2 until you find a batch size that fits in your device. + + And finally, we can compute the WER. Let's first compute the orthographic WER, i.e. the WER without any post-processing: ```python @@ -271,15 +331,19 @@ from evaluate import load wer_metric = load("wer") -all_references = common_voice_test["sentence"] -wer_ortho = 100 * wer_metric.compute(references=all_references, predictions=all_predictions) +wer_ortho = 100 * wer_metric.compute( + references=all_references, predictions=all_predictions +) wer_ortho ``` **Output:** ``` -TODO(SG) +167.83202172853262 ``` +Okay... 168% essentially means our model is outputting garbage 😜 Not to worry, it'll be our aim to improve this by +fine-tuning the model on the Dhivehi training set! + Next, we'll evaluate the normalised WER, i.e. the WER with normalisation post-processing. We have to filter our samples that would be empty after normalisation, as otherwise the total number of words in our reference (N) would be zero, which would give a divide by zero error in our calculation: @@ -294,17 +358,30 @@ all_predictions_norm = [normalizer(pred) for pred in all_predictions] all_references_norm = [normalizer(label) for label in all_references] # filtering step to only evaluate the samples that correspond to non-zero references -all_predictions_norm = [all_predictions_norm[i] for i in range(len(all_predictions_norm)) if len(all_references_norm[i]) > 0] -all_references_norm = [all_references_norm[i] for i in range(len(all_references_norm)) if len(all_references_norm[i]) > 0] - -wer = 100 * wer_metric.compute(references=all_references_norm, predictions=all_predictions_norm) +all_predictions_norm = [ + all_predictions_norm[i] + for i in range(len(all_predictions_norm)) + if len(all_references_norm[i]) > 0 +] +all_references_norm = [ + all_references_norm[i] + for i in range(len(all_references_norm)) + if len(all_references_norm[i]) > 0 +] + +wer = 100 * wer_metric.compute( + references=all_references_norm, predictions=all_predictions_norm +) wer ``` **Output:** ``` -TODO(SG) +125.72243279897069 ``` -Great! So the baseline model achieves an orthographic test WER of %, and a normalised WER of %. These are the numbers that -we want to try and beat when we fine-tune the model, in order to improve the Whisper model for Dhivehi. \ No newline at end of file +Again we see the drastic reduction in WER we achieve by normalising our references and predictions: the baseline model +achieves an orthographic test WER of 168%, while the normalised WER is 126%. 
+ +Right then! These are the numbers that we want to try and beat when we fine-tune the model, in order to improve the Whisper +model for Dhivehi. From d7b9a0f1f4464eb7080a31cfe7d3429f45e7622f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 26 Jun 2023 18:18:30 +0100 Subject: [PATCH 03/24] make style --- chapters/en/chapter5/asr_models.mdx | 33 ++++++++++++++++++---- chapters/en/chapter5/fine-tuning.mdx | 41 ++++++++++++++++++++++------ 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/chapters/en/chapter5/asr_models.mdx b/chapters/en/chapter5/asr_models.mdx index 28403a62..e883ace7 100644 --- a/chapters/en/chapter5/asr_models.mdx +++ b/chapters/en/chapter5/asr_models.mdx @@ -28,7 +28,9 @@ Wav2Vec2's speech transcription capabilities: ```python from datasets import load_dataset -dataset = load_dataset("hf-internal-testing/librispeech_pipe_demo", "clean", split="validation") +dataset = load_dataset( + "hf-internal-testing/librispeech_pipe_demo", "clean", split="validation" +) dataset ``` @@ -69,7 +71,9 @@ Next, we'll take an example from the dataset and pass its raw data to the pipeli ```python pipe(sample["audio"].copy()) -{'text': 'HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAUS AND ROSE BEEF LOOMING BEFORE US SIMALYIS DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND'} +{ + "text": "HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAUS AND ROSE BEEF LOOMING BEFORE US SIMALYIS DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND" +} ``` We can see that the Wav2Vec2 model does a pretty good job at transcribing this sample - at a first glance it looks generally correct. @@ -167,7 +171,9 @@ import torch from transformers import pipeline device = "cuda:0" if torch.cuda.is_available() else "cpu" -pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device) +pipe = pipeline( + "automatic-speech-recognition", model="openai/whisper-base", device=device +) ``` Great! Now let's transcribe the audio as before. The only change we make is passing an extra argument, `max_new_tokens`, @@ -201,7 +207,9 @@ the multilingual equivalent of the LibriSpeech dataset, with labelled audio data from the Spanish split of the MLS dataset, making use of _streaming_ mode so that we don't have to download the entire dataset: ```python -dataset = load_dataset("facebook/multilingual_librispeech", "spanish", split="validation", streaming=True) +dataset = load_dataset( + "facebook/multilingual_librispeech", "spanish", split="validation", streaming=True +) sample = next(iter(dataset)) ``` @@ -322,7 +330,13 @@ To activate batching, we need to pass the argument `batch_size` to the pipeline. 
long audio sample with chunking and batching as follows: ```python -pipe(long_audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"}, chunk_length_s=30, batch_size=8) +pipe( + long_audio, + max_new_tokens=256, + generate_kwargs={"task": "transcribe"}, + chunk_length_s=30, + batch_size=8, +) ``` **Output:** ``` @@ -345,7 +359,14 @@ are compatible with both the chunking and batching methods we used previously, s argument to our previous call: ```python -pipe(long_audio, max_new_tokens=256, generate_kwargs={"task": "transcribe"}, chunk_length_s=30, batch_size=8, return_timestamps=True)["chunks"] +pipe( + long_audio, + max_new_tokens=256, + generate_kwargs={"task": "transcribe"}, + chunk_length_s=30, + batch_size=8, + return_timestamps=True, +)["chunks"] ``` **Output:** ``` diff --git a/chapters/en/chapter5/fine-tuning.mdx b/chapters/en/chapter5/fine-tuning.mdx index 274af1d6..f4372acd 100644 --- a/chapters/en/chapter5/fine-tuning.mdx +++ b/chapters/en/chapter5/fine-tuning.mdx @@ -54,8 +54,12 @@ from datasets import load_dataset, DatasetDict common_voice = DatasetDict() -common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="train+validation") -common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="test") +common_voice["train"] = load_dataset( + "mozilla-foundation/common_voice_13_0", "dv", split="train+validation" +) +common_voice["test"] = load_dataset( + "mozilla-foundation/common_voice_13_0", "dv", split="test" +) print(common_voice) ``` @@ -134,7 +138,9 @@ as explained above: ```python from transformers import WhisperProcessor -processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="sinhalese", task="transcribe") +processor = WhisperProcessor.from_pretrained( + "openai/whisper-small", language="sinhalese", task="transcribe" +) ``` It's worth reiterating that in most circumstances, you'll find that the language you want to fine-tune on is in the set of @@ -196,7 +202,9 @@ remove the columns from the raw training data (the audio and text), leaving just `prepare_dataset` function: ```python -common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2) +common_voice = common_voice.map( + prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2 +) ``` Finally, we filter any training data with audio samples longer than 30s. 
These samples would otherwise be truncated by @@ -206,6 +214,7 @@ samples that are less than 30s, and `False` for those that are longer: ```python max_input_length = 30.0 + def is_audio_in_length_range(length): return length < max_input_length ``` @@ -277,14 +286,19 @@ import torch from dataclasses import dataclass from typing import Any, Dict, List, Union + @dataclass class DataCollatorSpeechSeq2SeqWithPadding: processor: Any - def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: + def __call__( + self, features: List[Dict[str, Union[List[int], torch.Tensor]]] + ) -> Dict[str, torch.Tensor]: # split inputs and labels since they have to be of different lengths and need different padding methods # first treat the audio inputs by simply returning torch tensors - input_features = [{"input_features": feature["input_features"][0]} for feature in features] + input_features = [ + {"input_features": feature["input_features"][0]} for feature in features + ] batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt") # get the tokenized label sequences @@ -293,7 +307,9 @@ class DataCollatorSpeechSeq2SeqWithPadding: labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") # replace padding with -100 to ignore loss correctly - labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + labels = labels_batch["input_ids"].masked_fill( + labels_batch.attention_mask.ne(1), -100 + ) # if bos token is appended in previous tokenization step, # cut bos token here as it's append later anyways @@ -338,6 +354,7 @@ from transformers.models.whisper.english_normalizer import BasicTextNormalizer normalizer = BasicTextNormalizer() + def compute_metrics(pred): pred_ids = pred.predictions label_ids = pred.label_ids @@ -356,8 +373,14 @@ def compute_metrics(pred): pred_str_norm = [normalizer(pred) for pred in pred_str] label_str_norm = [normalizer(label) for label in label_str] # filtering step to only evaluate the samples that correspond to non-zero references: - pred_str_norm = [pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0] - label_str_norm = [label_str_norm[i] for i in range(len(label_str_norm)) if len(label_str_norm[i]) > 0] + pred_str_norm = [ + pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0 + ] + label_str_norm = [ + label_str_norm[i] + for i in range(len(label_str_norm)) + if len(label_str_norm[i]) > 0 + ] wer = 100 * metric.compute(predictions=pred_str_norm, references=label_str_norm) From a460e4354020a80e9576582750a6854846a91a18 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 26 Jun 2023 18:27:50 +0100 Subject: [PATCH 04/24] add todo --- chapters/en/chapter5/fine-tuning.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chapters/en/chapter5/fine-tuning.mdx b/chapters/en/chapter5/fine-tuning.mdx index f4372acd..0e3f65d5 100644 --- a/chapters/en/chapter5/fine-tuning.mdx +++ b/chapters/en/chapter5/fine-tuning.mdx @@ -489,7 +489,7 @@ to compensate. | Step | Training Loss | Epoch | Validation Loss | WER | |:----:|:-------------:|:-----:|:---------------:|:-----:| -Our best WER is X% - not bad for 7h of training data (I hope...)! The big question is how this +Our best WER is X% TODO(SG) - not bad for 7h of training data (I hope...)! The big question is how this compares to other ASR systems. 
For that, we can view the autoevaluate [leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards?dataset=mozilla-foundation%2Fcommon_voice_13_0&only_verified=0&task=automatic-speech-recognition&config=hi&split=test&metric=wer), a leaderboard that categorises models by language and dataset, and subsequently ranks them according to their WER. From c708888dd833dc1690967ec4f25a1c04f09b531f Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Mon, 26 Jun 2023 18:29:59 +0100 Subject: [PATCH 05/24] add todo --- chapters/en/chapter5/choosing_dataset.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chapters/en/chapter5/choosing_dataset.mdx b/chapters/en/chapter5/choosing_dataset.mdx index b2230724..0d53bc09 100644 --- a/chapters/en/chapter5/choosing_dataset.mdx +++ b/chapters/en/chapter5/choosing_dataset.mdx @@ -105,7 +105,7 @@ more generally to any one of the 180+ audio datasets on the Hugging Face Hub, so We can select the Dhivehi subset of Common Voice 13 by setting the subset to `dv` using the dropdown menu (`dv` being the language identifier code for Dhivehi): - +
+ Selecting the Dhivehi split from the Dataset's Preview +
If we hit the play button on the first sample, we can listen to the audio and see the corresponding text. Have a scroll through the samples for the train and test sets to get a better feel for the audio and text data that we're dealing with. From 6c28858b22870739b9378b11a55e93f93899ed07 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:05:30 +0100 Subject: [PATCH 09/24] render figs --- chapters/en/chapter5/asr_models.mdx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/chapters/en/chapter5/asr_models.mdx b/chapters/en/chapter5/asr_models.mdx index e883ace7..b4dd17bc 100644 --- a/chapters/en/chapter5/asr_models.mdx +++ b/chapters/en/chapter5/asr_models.mdx @@ -307,14 +307,10 @@ The way long-form transcription works in 🤗 Transformers is by _chunking_ the Each segment has a small amount of overlap with the previous one. This allows us to accurately stitch the segments back together at the boundaries, since we can find the overlap between segments and merge the transcriptions accordingly: - - The advantage of chunking the samples is that we don't need the result of chunk \\( i \\) to transcribe the subsequent chunk \\( i + 1 \\). The stitching is done after we have transcribed all the chunks at the chunk boundaries, so it doesn't matter which order we transcribe chunks in. The algorithm is entirely **stateless**, so we can even do chunk \\( i + 1 \\) From 2d0ab9194c416f9ea4b0e78e68311e9b02332e65 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:11:46 +0100 Subject: [PATCH 10/24] render equations(?) --- chapters/en/chapter5/evaluation.mdx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/chapters/en/chapter5/evaluation.mdx b/chapters/en/chapter5/evaluation.mdx index ea678fcb..6a7877f0 100644 --- a/chapters/en/chapter5/evaluation.mdx +++ b/chapters/en/chapter5/evaluation.mdx @@ -46,11 +46,13 @@ Here, we have: This gives 2 errors in total. To get our error rate, we divide the number of errors by the total number of words in our reference (N), which for this example is 6: +$$ \begin{aligned} WER &= \frac{S + I + D}{N} \\ &= \frac{1 + 0 + 1}{6} \\ &= 0.333 \end{aligned} +$$ Alright! So we have a WER of 0.333, or 33.3%. Notice how the word "sit" only has one character that is wrong, but the entire word is marked incorrect! This is a defining feature of the WER: spelling errors are penalised heavily, no matter @@ -94,9 +96,11 @@ we'd have a WER of 10 / 2 = 5, or 500%! This is something to bear in mind if you We can flip the WER around to give us a metric where _higher is better_. Rather than measuring the word error rate, we can measure the _word accuracy (WAcc)_ of our system: +$$ \begin{equation} WAcc = 1 - WER \nonumber \end{equation} +$$ The WAcc is also measured on the word-level, it's just the WER reformulated as an accuracy metric rather than an error metric. The WAcc is very infrequently quoted in the speech literature - we think of our system predictions in terms of @@ -118,16 +122,16 @@ substitution error (S). Thus, we reward our system for the partially correct pre In our example, we have 1 character substitution, 0 insertions, and 3 deletions. In total, we have 14 characters. So, our CER is: +$$ \begin{aligned} CER &= \frac{S + I + D}{N} \\ &= \frac{1 + 0 + 3}{14} \\ &= 0.286 \end{aligned} +$$ Right! We have a CER of 0.286, or 28.6%! Notice how this is lower than our WER - we penalised the spelling error much less. 
- - ```python cer_metric = load("cer") From f4f294e153b85937c8f2a91df2631d9589bf76d1 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:22:29 +0100 Subject: [PATCH 11/24] style --- chapters/en/chapter5/fine-tuning.mdx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chapters/en/chapter5/fine-tuning.mdx b/chapters/en/chapter5/fine-tuning.mdx index 12ba82ee..a5c3a8f2 100644 --- a/chapters/en/chapter5/fine-tuning.mdx +++ b/chapters/en/chapter5/fine-tuning.mdx @@ -409,7 +409,9 @@ from functools import partial model.config.use_cache = False # set language and task for generation and re-enable cache -model.generate = partial(model.generate, language="sinhalese", task="transcribe", use_cache=True) +model.generate = partial( + model.generate, language="sinhalese", task="transcribe", use_cache=True +) ``` ## Define the Training Configuration From cec9a630636fe8f58c445d8acb1d2fcedf9ffe73 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:31:59 +0100 Subject: [PATCH 12/24] add demo --- chapters/en/_toctree.yml | 2 +- chapters/en/chapter5/demo.mdx | 77 +++++++++++++++++++++++++++ chapters/en/chapter5/introduction.mdx | 2 +- 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 chapters/en/chapter5/demo.mdx diff --git a/chapters/en/_toctree.yml b/chapters/en/_toctree.yml index f8288581..d6bcfa21 100644 --- a/chapters/en/_toctree.yml +++ b/chapters/en/_toctree.yml @@ -79,7 +79,7 @@ title: How to fine-tune an ASR system with the Trainer API # - local: chapter5/speaker_diarization # title: Automatic speech recognition with speaker diarization - - local: chapter4/demo + - local: chapter5/demo title: Building a demo - local: chapter5/quiz title: Quiz diff --git a/chapters/en/chapter5/demo.mdx b/chapters/en/chapter5/demo.mdx new file mode 100644 index 00000000..9600663a --- /dev/null +++ b/chapters/en/chapter5/demo.mdx @@ -0,0 +1,77 @@ +# Build a demo with Gradio + +Now that we've fine-tuned a really strong model for Dhivehi speech recognition, let's go ahead and build a +[Gradio]((https://gradio.app)) demo to showcase it to the community! + +The first thing to do is load up the fine-tuned checkpoint using the `pipeline()` class - this is very familiar now from +the section on [pre-trained models](asr_models.mdx). You can change the `model_id` to the namespace of your fine-tuned +model on the Hugging Face Hub, or one of the pre-trained [Whisper models](https://huggingface.co/models?sort=downloads&search=openai%2Fwhisper-) +to perform zero-shot speech recognition: + +```python +from transformers import pipeline + +model_id = "sanchit-gandhi/whisper-small-dv" +pipe = pipeline("automatic-speech-recognition", model=model_id) +``` + +Secondly, we'll define a function that takes the filepath for an audio input and passes it through the pipeline. Here, +the pipeline automatically takes care of loading the audio file, resampling it to the correct sampling rate, and running +inference with the model. We can then simply return the transcribed text as the output of the function. 
To ensure our +model can handle audio inputs of arbitrary length, we'll enable *chunking* as described in the section +on [pre-trained models](asr_models.mdx): + +```python +def transcribe_speech(filepath): + output = pipe(filepath, max_new_tokens=256, + generate_kwargs={"task": "transcribe", "language": "sinhalese"}, + chunk_length_s=30, + batch_size=8, +) + return output["text"] +``` + +We'll use the Gradio [blocks](https://gradio.app/docs/#blocks) feature to launch two tabs on our demo: one for microphone +transcription, and the other for file upload. + +```python +import gradio as gr + +demo = gr.Blocks() + +mic_transcribe = gr.Interface( + fn=transcribe_speech, inputs=gr.Audio(source="microphone", type="filepath"), outputs=gr.outputs.Textbox() +) + +file_transcribe = gr.Interface( + fn=transcribe_speech, inputs=gr.Audio(source="upload", type="filepath"), outputs=gr.outputs.Textbox() +) +``` + +Finally, we launch the Gradio demo using the two blocks that we've just defined: + +```python +with demo: + gr.TabbedInterface([mic_transcribe, file_transcribe], ["Transcribe Microphone", "Transcribe Audio File"]) + +demo.launch(debug=True) +``` + +This will launch a Gradio demo similar to the one running on the Hugging Face Space: + + + +Should you wish to host your demo on the Hugging Face Hub, you can use this Space as a template for your fine-tuned model. + +Click the link to duplicate the template demo to your account: https://huggingface.co/spaces/course-demos/whisper-small?duplicate=true + +We recommend giving your space a similar name to your fine-tuned model (e.g. whisper-small-dv-demo) and setting the visibility to "Public". + +Once you've duplicated the Space to your account, click `"Files and versions"` -> `"app.py"` -> `"edit"`. Then change the +model identifier to your fine-tuned model (line 6). Scroll to the bottom of the page and click `"Commit changes to main"`. +he demo will reboot, this time using your fine-tuned model. You can share this demo with your friends and family so that +they can use the model that you've trained! + +Checkout our video tutorial to get a better understanding of how to duplicate the Space 👉️ [YouTube Video](https://www.youtube.com/watch?v=VQYuvl6-9VE) + +We look forward to seeing your demos on the Hub! \ No newline at end of file diff --git a/chapters/en/chapter5/introduction.mdx b/chapters/en/chapter5/introduction.mdx index 463652f8..b6fbbbb6 100644 --- a/chapters/en/chapter5/introduction.mdx +++ b/chapters/en/chapter5/introduction.mdx @@ -18,7 +18,7 @@ the weather 🌤️). Have a play with the speech recognition demo below. You can either record yourself using your microphone, or drag and drop an audio sample for transcription: - + Speech recognition is a challenging task as it requires joint knowledge of audio and text. 
The input audio might have lots of background noise and be spoken by speakers with different accents, making it difficult to pick out the spoken From 369f1fbf4a6a3043e92a5e21c3fa16b08a093478 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:32:08 +0100 Subject: [PATCH 13/24] make style --- chapters/en/chapter5/demo.mdx | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/chapters/en/chapter5/demo.mdx b/chapters/en/chapter5/demo.mdx index 9600663a..f1effd17 100644 --- a/chapters/en/chapter5/demo.mdx +++ b/chapters/en/chapter5/demo.mdx @@ -23,11 +23,13 @@ on [pre-trained models](asr_models.mdx): ```python def transcribe_speech(filepath): - output = pipe(filepath, max_new_tokens=256, - generate_kwargs={"task": "transcribe", "language": "sinhalese"}, - chunk_length_s=30, - batch_size=8, -) + output = pipe( + filepath, + max_new_tokens=256, + generate_kwargs={"task": "transcribe", "language": "sinhalese"}, + chunk_length_s=30, + batch_size=8, + ) return output["text"] ``` @@ -40,11 +42,15 @@ import gradio as gr demo = gr.Blocks() mic_transcribe = gr.Interface( - fn=transcribe_speech, inputs=gr.Audio(source="microphone", type="filepath"), outputs=gr.outputs.Textbox() + fn=transcribe_speech, + inputs=gr.Audio(source="microphone", type="filepath"), + outputs=gr.outputs.Textbox(), ) file_transcribe = gr.Interface( - fn=transcribe_speech, inputs=gr.Audio(source="upload", type="filepath"), outputs=gr.outputs.Textbox() + fn=transcribe_speech, + inputs=gr.Audio(source="upload", type="filepath"), + outputs=gr.outputs.Textbox(), ) ``` @@ -52,7 +58,10 @@ Finally, we launch the Gradio demo using the two blocks that we've just defined: ```python with demo: - gr.TabbedInterface([mic_transcribe, file_transcribe], ["Transcribe Microphone", "Transcribe Audio File"]) + gr.TabbedInterface( + [mic_transcribe, file_transcribe], + ["Transcribe Microphone", "Transcribe Audio File"], + ) demo.launch(debug=True) ``` From 988e6fdaa3b384d2414485ea260576bf6a97f4a0 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:33:11 +0100 Subject: [PATCH 14/24] small tips --- chapters/en/chapter5/demo.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chapters/en/chapter5/demo.mdx b/chapters/en/chapter5/demo.mdx index f1effd17..85ea87a3 100644 --- a/chapters/en/chapter5/demo.mdx +++ b/chapters/en/chapter5/demo.mdx @@ -11,7 +11,7 @@ to perform zero-shot speech recognition: ```python from transformers import pipeline -model_id = "sanchit-gandhi/whisper-small-dv" +model_id = "sanchit-gandhi/whisper-small-dv" # update with your model id pipe = pipeline("automatic-speech-recognition", model=model_id) ``` @@ -26,7 +26,7 @@ def transcribe_speech(filepath): output = pipe( filepath, max_new_tokens=256, - generate_kwargs={"task": "transcribe", "language": "sinhalese"}, + generate_kwargs={"task": "transcribe", "language": "sinhalese"}, # update with the language you've fine-tuned on chunk_length_s=30, batch_size=8, ) From 6ec341d163033e0f2dcf69f750d73cb1bc375810 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:54:52 +0100 Subject: [PATCH 15/24] hands-on --- chapters/en/_toctree.yml | 3 --- chapters/en/chapter5/hands_on.mdx | 24 +++++++++++++++++++++++- chapters/en/chapter5/quiz.mdx | 3 --- 3 files changed, 23 insertions(+), 7 deletions(-) delete mode 100644 chapters/en/chapter5/quiz.mdx diff --git a/chapters/en/_toctree.yml b/chapters/en/_toctree.yml index d6bcfa21..e6b6ee8b 100644 --- a/chapters/en/_toctree.yml +++ 
b/chapters/en/_toctree.yml @@ -81,9 +81,6 @@ # title: Automatic speech recognition with speaker diarization - local: chapter5/demo title: Building a demo - - local: chapter5/quiz - title: Quiz - quiz: 5 - local: chapter5/hands_on title: Hands-on exercise - local: chapter5/supplemental_reading diff --git a/chapters/en/chapter5/hands_on.mdx b/chapters/en/chapter5/hands_on.mdx index 75cb5a68..e1eda3f8 100644 --- a/chapters/en/chapter5/hands_on.mdx +++ b/chapters/en/chapter5/hands_on.mdx @@ -1 +1,23 @@ -# Hands-on exercise \ No newline at end of file +# Hands-on exercise + +In this unit, we explored the challenges of fine-tuning ASR models, acknowledging the time and resources required to +fine-tune a model like Whisper (even a small checkpoint) on a new language. To provide a hands-on experience, we have +designed an exercise that allows you to navigate the process of fine-tuning an ASR model while using a smaller dataset. +The main goal of this exercise is to familiarize you with the process rather than expecting production-level results. +We have intentionally set a low metric to ensure that even with limited resources, you should be able to achieve it. + +Here are the instructions: +* Fine-tune the `”openai/whisper-tiny”` model using the American English ("en-US") subset of the `”PolyAI/minds14”` dataset. Use the first **450 examples for training**, and the rest for evaluation. +* To evaluate the model, use the `wer` and `wer_ortho` metrics as described in this Unit. However, *do not* convert the metric into percentages by multiplying by 100 (E.g. if WER is 42%, we’ll expect to see the value of 0.42 in this exercise). + +Once you have fine-tuned a model, make sure to upload it to the :hugging_face: Hub with the following `kwargs`: +``` +kwargs = { + "dataset_tags": "PolyAI/minds14", + "finetuned_from": "openai/whisper-tiny", + "tasks": "automatic-speech-recognition", +} +``` +You will pass this assignment if your model’s WER is lower than **0.37**. + +Feel free to build a demo of your model, and share it on Discord! If you have questions, post them in the #audio-study-group channel. 
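To get started on the data side, here is a minimal sketch of how the train/evaluation split described above could be set up. The variable names are purely illustrative, and the remaining steps (processor, data collator, metrics, trainer) follow the fine-tuning section of this Unit:

```python
from datasets import load_dataset, DatasetDict

# the "en-US" subset of PolyAI/minds14 provides a single "train" split,
# so we carve out our own held-out set from it
dataset = load_dataset("PolyAI/minds14", "en-US", split="train")

minds = DatasetDict()
minds["train"] = dataset.select(range(450))  # first 450 examples for training
minds["test"] = dataset.select(range(450, len(dataset)))  # the rest for evaluation
```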
diff --git a/chapters/en/chapter5/quiz.mdx b/chapters/en/chapter5/quiz.mdx deleted file mode 100644 index 62962841..00000000 --- a/chapters/en/chapter5/quiz.mdx +++ /dev/null @@ -1,3 +0,0 @@ - - -# Check your understanding of the course material \ No newline at end of file From c002152e490c8da4d77d2fcf5c35b6ff3dff63f0 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 15:54:57 +0100 Subject: [PATCH 16/24] make style --- chapters/en/chapter5/demo.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/chapters/en/chapter5/demo.mdx b/chapters/en/chapter5/demo.mdx index 85ea87a3..5d58cd52 100644 --- a/chapters/en/chapter5/demo.mdx +++ b/chapters/en/chapter5/demo.mdx @@ -26,7 +26,10 @@ def transcribe_speech(filepath): output = pipe( filepath, max_new_tokens=256, - generate_kwargs={"task": "transcribe", "language": "sinhalese"}, # update with the language you've fine-tuned on + generate_kwargs={ + "task": "transcribe", + "language": "sinhalese", + }, # update with the language you've fine-tuned on chunk_length_s=30, batch_size=8, ) From e63c13f87fb12dc1493feb4dce56b2b26e2ee9dd Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 16:00:34 +0100 Subject: [PATCH 17/24] clarify instructions --- chapters/en/chapter5/hands_on.mdx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/chapters/en/chapter5/hands_on.mdx b/chapters/en/chapter5/hands_on.mdx index e1eda3f8..1fde7049 100644 --- a/chapters/en/chapter5/hands_on.mdx +++ b/chapters/en/chapter5/hands_on.mdx @@ -7,7 +7,8 @@ The main goal of this exercise is to familiarize you with the process rather tha We have intentionally set a low metric to ensure that even with limited resources, you should be able to achieve it. Here are the instructions: -* Fine-tune the `”openai/whisper-tiny”` model using the American English ("en-US") subset of the `”PolyAI/minds14”` dataset. Use the first **450 examples for training**, and the rest for evaluation. +* Fine-tune the `”openai/whisper-tiny”` model using the American English ("en-US") subset of the `”PolyAI/minds14”` dataset. +* Use the first **450 examples for training**, and the rest for evaluation. Ensure you set `num_proc=1` when pre-processing the dataset using the `.map` method (this will ensure your model is submitted correctly for assessment). * To evaluate the model, use the `wer` and `wer_ortho` metrics as described in this Unit. However, *do not* convert the metric into percentages by multiplying by 100 (E.g. if WER is 42%, we’ll expect to see the value of 0.42 in this exercise). Once you have fine-tuned a model, make sure to upload it to the :hugging_face: Hub with the following `kwargs`: @@ -18,6 +19,6 @@ kwargs = { "tasks": "automatic-speech-recognition", } ``` -You will pass this assignment if your model’s WER is lower than **0.37**. +You will pass this assignment if your model’s normalised WER (`wer`) is lower than **0.37**. Feel free to build a demo of your model, and share it on Discord! If you have questions, post them in the #audio-study-group channel. 
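One point worth illustrating with a toy example: since the metrics for this exercise are reported as ratios rather than percentages, simply drop the `100 *` scaling used in the `compute_metrics` function earlier in the Unit. The strings below are made up for demonstration:

```python
import evaluate

metric = evaluate.load("wer")

# one deleted word ("the") out of six reference words gives a WER of 1/6
wer = metric.compute(
    predictions=["the cat sat on mat"], references=["the cat sat on the mat"]
)

print(wer)  # ~0.167 - a ratio, not a percentage
```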
From 302c3e7c6d18fe555923e9440fb4a6c41e901d57 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 16:15:52 +0100 Subject: [PATCH 18/24] fix intro --- chapters/en/chapter5/introduction.mdx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/chapters/en/chapter5/introduction.mdx b/chapters/en/chapter5/introduction.mdx index b6fbbbb6..f914b484 100644 --- a/chapters/en/chapter5/introduction.mdx +++ b/chapters/en/chapter5/introduction.mdx @@ -33,8 +33,9 @@ your model to your friends and family by building a live demo, one that takes an Specifically, we’ll cover: -* [Pre-trained models for speech recognition](#asr_models) -* [Choosing a dataset](#choosing_dataset) -* [Evaluation and metrics for speech recognition](#evaluation) -* [How to fine-tune an ASR system with the Trainer API](#fine-tuning) -* [Building a demo](#demo) +* [Pre-trained models for speech recognition](asr_models.mdx) +* [Choosing a dataset](choosing_dataset.mdx) +* [Evaluation and metrics for speech recognition](evaluation.mdx) +* [How to fine-tune an ASR system with the Trainer API](fine-tuning.mdx) +* [Building a demo](demo.mdx) +* [Hands-on exercise](hands_on.mdx) From f0e626f03e6e9c3625f5335d67040bc6e1a0c858 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 16:55:01 +0100 Subject: [PATCH 19/24] from readthrough --- chapters/en/chapter5/choosing_dataset.mdx | 2 + chapters/en/chapter5/evaluation.mdx | 49 +++++++++-------------- chapters/en/chapter5/fine-tuning.mdx | 20 ++++----- 3 files changed, 32 insertions(+), 39 deletions(-) diff --git a/chapters/en/chapter5/choosing_dataset.mdx b/chapters/en/chapter5/choosing_dataset.mdx index 3a2cd197..36e4927c 100644 --- a/chapters/en/chapter5/choosing_dataset.mdx +++ b/chapters/en/chapter5/choosing_dataset.mdx @@ -6,6 +6,8 @@ to match our criteria with the features that a dataset offers. Before we pick a dataset, we first need to understand the key defining features. +## Features of speech datasets + ### 1. Number of hours Simply put, the number of training hours indicates how large the dataset is. It’s analogous to the number of training examples in an NLP dataset. However, bigger datasets aren’t necessarily better. If we want a model that generalises well, diff --git a/chapters/en/chapter5/evaluation.mdx b/chapters/en/chapter5/evaluation.mdx index 6a7877f0..a0101231 100644 --- a/chapters/en/chapter5/evaluation.mdx +++ b/chapters/en/chapter5/evaluation.mdx @@ -25,12 +25,12 @@ And a predicted sequence from the speech recognition system that we're trying to prediction = "the cat sit on the" ``` -We can see that the prediction is pretty close! But some words are not quite right. We'll evaluate this prediction +We can see that the prediction is pretty close, but some words are not quite right. We'll evaluate this prediction against the reference for the three most popular speech recognition metrics and see what sort of numbers we get for each. ## Word Error Rate -The _word error rate (WER)_ metric is the 'de facto' metric for speech recognition. It calculates substitutions, -insertions and deletions on the _word level_. This means errors are annotated on a word-by-word basis. Take our example: +The *word error rate (WER)* metric is the 'de facto' metric for speech recognition. It calculates substitutions, +insertions and deletions on the *word level*. This means errors are annotated on a word-by-word basis. 
Take our example: | Reference: | the | cat | sat | on | the | mat | @@ -55,10 +55,10 @@ WER &= \frac{S + I + D}{N} \\ $$ Alright! So we have a WER of 0.333, or 33.3%. Notice how the word "sit" only has one character that is wrong, but the -entire word is marked incorrect! This is a defining feature of the WER: spelling errors are penalised heavily, no matter +entire word is marked incorrect. This is a defining feature of the WER: spelling errors are penalised heavily, no matter how minor they are. -The WER is defined such that _lower is better_: a lower WER means there are fewer errors in our prediction, so a perfect +The WER is defined such that *lower is better*: a lower WER means there are fewer errors in our prediction, so a perfect speech recognition system would have a WER of zero (no errors). Let's see how we can compute the WER using 🤗 Evaluate. We'll need two packages to compute our WER metric: 🤗 Evaluate @@ -85,7 +85,7 @@ print(wer) 0.33, or 33.3%, as expected! We now know what's going on under-the-hood with this WER calculation. -Now, here's something that's quite confusing! What do you think the upper limit of the WER is? You would expect it to be +Now, here's something that's quite confusing... What do you think the upper limit of the WER is? You would expect it to be 1 or 100% right? Nuh uh! Since the WER is the ratio of errors to number of words (N), there is no upper limit on the WER! Let's take an example were we predict 10 words and the target only has 2 words. If all of our predictions were wrong (10 errors), we'd have a WER of 10 / 2 = 5, or 500%! This is something to bear in mind if you train an ASR system and see a WER of over @@ -93,8 +93,8 @@ we'd have a WER of 10 / 2 = 5, or 500%! This is something to bear in mind if you ## Word Accuracy -We can flip the WER around to give us a metric where _higher is better_. Rather than measuring the word error rate, -we can measure the _word accuracy (WAcc)_ of our system: +We can flip the WER around to give us a metric where *higher is better*. Rather than measuring the word error rate, +we can measure the *word accuracy (WAcc)* of our system: $$ \begin{equation} @@ -107,9 +107,10 @@ metric. The WAcc is very infrequently quoted in the speech literature - we think word errors, and so prefer error rate metrics that are more associated with these error type annotations. ## Character Error Rate -It seems a bit unfair that we marked the entire word for "sit" wrong when in fact only one letter was incorrect! + +It seems a bit unfair that we marked the entire word for "sit" wrong when in fact only one letter was incorrect. That's because we were evaluating our system on the word level, thereby annotating errors on a word-by-word basis. -The _character error rate (CER)_ assesses systems on the _character level_. This means we divide up our words into their +The *character error rate (CER)* assesses systems on the *character level*. This means we divide up our words into their individual characters, and annotate errors on a character-by-character basis: | Reference: | t | h | e | | c | a | t | | s | a | t | | o | n | | t | h | e | | m | a | t | @@ -117,7 +118,7 @@ individual characters, and annotate errors on a character-by-character basis: | Prediction: | t | h | e | | c | a | t | | s | **i** | t | | o | n | | t | h | e | | | | | | Label: | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | S | ✅ | | ✅ | ✅ | | ✅ | ✅ | ✅ | | D | D | D | -We can see now that for the word "sit", the "s" and "t" are marked as correct! 
It's only the "i" which is labelled as a +We can see now that for the word "sit", the "s" and "t" are marked as correct. It's only the "i" which is labelled as a substitution error (S). Thus, we reward our system for the partially correct prediction 🤝 In our example, we have 1 character substitution, 0 insertions, and 3 deletions. In total, we have 14 characters. So, our CER is: @@ -130,19 +131,7 @@ CER &= \frac{S + I + D}{N} \\ \end{aligned} $$ -Right! We have a CER of 0.286, or 28.6%! Notice how this is lower than our WER - we penalised the spelling error much less. - -```python -cer_metric = load("cer") - -cer = cer_metric.compute(references=[reference], predictions=[prediction]) - -print(cer) -``` -**Print Output:** -``` -TODO(SG) -``` +Right! We have a CER of 0.286, or 28.6%. Notice how this is lower than our WER - we penalised the spelling error much less. ## Which metric should I use? @@ -165,15 +154,15 @@ of the WER for unseen data. If we train an ASR model on data with punctuation and casing, it will learn to predict casing and punctuation in its transcriptions. This is great when we want to use out model for actual speech recognition applications, such as transcribing meetings or dictation, since the predicted transcriptions will be fully formatted with casing and punctuation, -a style referred to as _orthographic_. +a style referred to as *orthographic*. -However, we also have the option of _normalising_ the dataset to remove any casing and punctuation. Normalising the +However, we also have the option of *normalising* the dataset to remove any casing and punctuation. Normalising the dataset makes the speech recognition task easier: the model no longer needs to distinguish between upper and lower case characters, or have to predict punctuation from the audio data alone (e.g. what sound does a semi-colon make?). Because of this, the word error rates are naturally lower (meaning the results are better). The Whisper paper demonstrates -the drastic effect that normalising transcriptions can have on WER results (_c.f._ Section 4.4 of the [Whisper paper](https://cdn.openai.com/papers/whisper.pdf)). +the drastic effect that normalising transcriptions can have on WER results (*c.f.* Section 4.4 of the [Whisper paper](https://cdn.openai.com/papers/whisper.pdf)). While we get lower WERs, the model isn't necessarily better for production. The lack of casing and punctuation makes the predicted -text from the model significantly harder to read. Take the example from the [previous section](#asr_models), where we ran +text from the model significantly harder to read. Take the example from the [previous section](asr_models.mdx), where we ran Wav2Vec2 and Whisper on the same audio sample from the LibriSpeech dataset. The Wav2Vec2 model predicts neither punctuation nor casing, whereas Whisper predicts both. Comparing the transcriptions side-by-side, we see that the Whisper transcription is far easier to read: @@ -325,7 +314,7 @@ for out in tqdm( If you experience a CUDA out-of-memory (OOM) when running the above cell, incrementally reduce the `batch_size` by - factors of 2 until you find a batch size that fits in your device. + factors of 2 until you find a batch size that fits your device. And finally, we can compute the WER. Let's first compute the orthographic WER, i.e. the WER without any post-processing: @@ -388,4 +377,4 @@ Again we see the drastic reduction in WER we achieve by normalising our referenc achieves an orthographic test WER of 168%, while the normalised WER is 126%. Right then! 
These are the numbers that we want to try and beat when we fine-tune the model, in order to improve the Whisper
-model for Dhivehi.
+model for Dhivehi speech recognition. Continue reading to get hands-on with a fine-tuning example 🚀
diff --git a/chapters/en/chapter5/fine-tuning.mdx b/chapters/en/chapter5/fine-tuning.mdx
index a5c3a8f2..959244bf 100644
--- a/chapters/en/chapter5/fine-tuning.mdx
+++ b/chapters/en/chapter5/fine-tuning.mdx
@@ -25,7 +25,7 @@ The Hub provides:
- Community: an easy way to share and collaborate with the community! 🤗

Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted.
-Find your Hub authentication token [here](https://huggingface.co/settings/tokens):
+Find your Hub authentication token [here](https://huggingface.co/settings/tokens) and enter it when prompted:

```python
from huggingface_hub import notebook_login
@@ -41,13 +41,14 @@ Your token has been saved to /root/.huggingface/token

## Load Dataset

-Common Voice 13 contains approximately ten hours of labelled Dhivehi data, three of which is held-out test data. This is
-extremely little data for fine-tuning, so we'll be relying on leveraging the extensive multilingual ASR knowledge acquired
-by Whisper during pre-training for the low-resource Dhivehi language.
+[Common Voice 13](https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0) contains approximately ten
+hours of labelled Dhivehi data, three of which are held-out test data. This is extremely little data for fine-tuning, so
+we'll be relying on leveraging the extensive multilingual ASR knowledge acquired by Whisper during pre-training for the
+low-resource Dhivehi language.

Using 🤗 Datasets, downloading and preparing data is extremely simple. We can download and prepare the Common Voice 13
splits in just one line of code. Since Dhivehi is very low-resource, we'll combine the `train` and `validation` splits
-to give approximately 7 hours of training data. We'll use the 3 hours of `test` data as our held-out test set:
+to give approximately seven hours of training data. We'll use the three hours of `test` data as our held-out test set:

```python
from datasets import load_dataset, DatasetDict
@@ -80,7 +81,7 @@ DatasetDict({


You can change the language identifier from `"dv"` to a language identifier of your choice. To see all possible languages
- in Common Voice 13, check out the dataset card on the Hugging Face Hub: [mozilla-foundation/common_voice_13_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0)
+ in Common Voice 13, check out the dataset card on the Hugging Face Hub: https://huggingface.co/datasets/mozilla-foundation/common_voice_13_0


Most ASR datasets only provide input audio samples (`audio`) and the corresponding transcribed text (`sentence`).
@@ -112,7 +113,7 @@ to ensure the target labels are encoded properly.

We can see all possible languages supported by Whisper by importing the list of languages:

-```
+```python
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE

TO_LANGUAGE_CODE
@@ -332,7 +333,7 @@ Onwards!

### Evaluation Metrics

Next, we define the evaluation metric we'll use on our evaluation set. We'll use the Word Error Rate (WER) metric introduced
-in the section on [Evaluation](#evaluation), the 'de-facto' metric for assessing ASR systems.
We'll load the WER metric from 🤗 Evaluate:
@@ -427,6 +428,7 @@ training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
+    lr_scheduler_type="constant_with_warmup",
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
@@ -543,5 +545,5 @@ model to the Hugging Face Hub, and showcased how to share and use it with the `p
If you followed through to this point, you should now have a fine-tuned checkpoint for speech recognition, well done! 🥳
Even more importantly, you're equipped with all the tools you need to fine-tune the Whisper model on any speech recognition
-dataset or domain. So what are you waiting for! Pick one of the datasets covered in the section [Choosing a Dataset](./choosing_dataset)
+dataset or domain. So what are you waiting for! Pick one of the datasets covered in the section [Choosing a Dataset](choosing_dataset.mdx)
or select a dataset of your own, and see whether you can get state-of-the-art performance! The leaderboard is waiting for you...
From 56b41729a968bc41f854d459313f7c0a6d964c96 Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Tue, 27 Jun 2023 16:59:23 +0100
Subject: [PATCH 20/24] rephrase
---
 chapters/en/chapter5/demo.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapters/en/chapter5/demo.mdx b/chapters/en/chapter5/demo.mdx
index 5d58cd52..d1c10eda 100644
--- a/chapters/en/chapter5/demo.mdx
+++ b/chapters/en/chapter5/demo.mdx
@@ -1,7 +1,7 @@
# Build a demo with Gradio
-Now that we've fine-tuned a really strong model for Dhivehi speech recognition, let's go ahead and build a
-[Gradio]((https://gradio.app)) demo to showcase it to the community!
+Now that we've fine-tuned a Whisper model for Dhivehi speech recognition, let's go ahead and build a [Gradio](https://gradio.app)
+demo to showcase it to the community!
The first thing to do is load up the fine-tuned checkpoint using the `pipeline()` class - this is very familiar now from
the section on [pre-trained models](asr_models.mdx). You can change the `model_id` to the namespace of your fine-tuned
From c7f5d18974f872deda4c1bab213c38bc86f5ffec Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Tue, 27 Jun 2023 17:00:05 +0100
Subject: [PATCH 21/24] don't codify in instructions
---
 chapters/en/chapter5/demo.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapters/en/chapter5/demo.mdx b/chapters/en/chapter5/demo.mdx
index d1c10eda..28150a90 100644
--- a/chapters/en/chapter5/demo.mdx
+++ b/chapters/en/chapter5/demo.mdx
@@ -79,8 +79,8 @@ Click the link to duplicate the template demo to your account: https://huggingfa
We recommend giving your space a similar name to your fine-tuned model (e.g. whisper-small-dv-demo) and setting the visibility
to "Public".
-Once you've duplicated the Space to your account, click `"Files and versions"` -> `"app.py"` -> `"edit"`. Then change the
-model identifier to your fine-tuned model (line 6). Scroll to the bottom of the page and click `"Commit changes to main"`.
+Once you've duplicated the Space to your account, click "Files and versions" -> "app.py" -> "edit". Then change the
+model identifier to your fine-tuned model (line 6). Scroll to the bottom of the page and click "Commit changes to main".
The demo will reboot, this time using your fine-tuned model. You can share this demo with your friends and family so
that they can use the model that you've trained!
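For readers who would like to see the whole demo in a single file, below is a minimal sketch that wires the `transcribe_speech` function shown earlier in this chapter to a bare `gr.Interface`. The template Space linked above is more elaborate (separate tabs for microphone and file upload), the checkpoint name is only an example to swap for your own fine-tuned model, and component arguments can differ slightly between Gradio versions:

```python
import gradio as gr
from transformers import pipeline

# Example checkpoint - replace with the namespace/name of your own fine-tuned model
model_id = "sanchit-gandhi/whisper-small-dv"
pipe = pipeline("automatic-speech-recognition", model=model_id)


def transcribe_speech(filepath):
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "sinhalese",
        },  # update with the language you've fine-tuned on
        chunk_length_s=30,
        batch_size=8,
    )
    return output["text"]


# A single audio-in, text-out interface
demo = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(label="Transcription"),
)
demo.launch()
```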
From 9171b6c80f0b4f0fbd382eb75cf2336a919683b6 Mon Sep 17 00:00:00 2001 From: sanchit-gandhi Date: Tue, 27 Jun 2023 17:48:48 +0100 Subject: [PATCH 22/24] training tips --- chapters/en/chapter5/fine-tuning.mdx | 49 ++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/chapters/en/chapter5/fine-tuning.mdx b/chapters/en/chapter5/fine-tuning.mdx index 959244bf..20129a30 100644 --- a/chapters/en/chapter5/fine-tuning.mdx +++ b/chapters/en/chapter5/fine-tuning.mdx @@ -417,8 +417,10 @@ model.generate = partial( ## Define the Training Configuration -In the final step, we define all the parameters related to training. Here, you can set the `max_steps` to train for longer. -For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments). +In the final step, we define all the parameters related to training. Here, we set the number of training steps to 500. +This is enough steps to see a big WER improvement compared to the pre-trained Whisper model, while ensuring that fine-tuning can +be run in approximately 45 minutes on a Google Colab free tier. For more detail on the training arguments, refer to the +Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments). ```python from transformers import Seq2SeqTrainingArguments @@ -429,8 +431,8 @@ training_args = Seq2SeqTrainingArguments( gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size learning_rate=1e-5, lr_scheduler_type="constant_with_warmup", - warmup_steps=500, - max_steps=4000, + warmup_steps=50, + max_steps=500, # increase to 4000 if you have your own GPU or a Colab paid plan gradient_checkpointing=True, fp16=True, fp16_full_eval=True, @@ -438,8 +440,8 @@ training_args = Seq2SeqTrainingArguments( per_device_eval_batch_size=16, predict_with_generate=True, generation_max_length=225, - save_steps=1000, - eval_steps=1000, + save_steps=500, + eval_steps=500, logging_steps=25, report_to=["tensorboard"], load_best_model_at_end=True, @@ -479,21 +481,32 @@ To launch training, simply execute: trainer.train() ``` -Training will take approximately 5-10 hours depending on your GPU or the one allocated to the Google Colab. Depending on +Training will take approximately 45 minutes depending on your GPU or the one allocated to the Google Colab. Depending on your GPU, it is possible that you will encounter a CUDA `"out-of-memory"` error when you start training. In this case, you can reduce the `per_device_train_batch_size` incrementally by factors of 2 and employ [`gradient_accumulation_steps`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments.gradient_accumulation_steps) to compensate. **Output:** -| Step | Training Loss | Epoch | Validation Loss | WER | -|:----:|:-------------:|:-----:|:---------------:|:-----:| +| Training Loss | Epoch | Step | Validation Loss | Wer Ortho | Wer | +|:-------------:|:-----:|:----:|:---------------:|:---------:|:-------:| +| 0.136 | 1.63 | 500 | 0.1727 | 63.8972 | 14.0661 | -Our best WER is X% TODO(SG) - not bad for 7h of training data (I hope...)! The big question is how this -compares to other ASR systems. 
For that, we can view the autoevaluate [leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards?dataset=mozilla-foundation%2Fcommon_voice_13_0&only_verified=0&task=automatic-speech-recognition&config=hi&split=test&metric=wer), +Our final WER is 14.1% - not bad for seven hours of training data and just 500 training steps! That amounts to a 112% +improvement versus the pre-trained model! That means we've taken a model that previously had no knowledge about Dhivehi, +and fine-tuned it to recognise Dhivehi speech with adequate accuracy in under one hour 🤯 + +The big question is how this compares to other ASR systems. For that, we can view the autoevaluate [leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards?dataset=mozilla-foundation%2Fcommon_voice_13_0&only_verified=0&task=automatic-speech-recognition&config=dv&split=test&metric=wer), a leaderboard that categorises models by language and dataset, and subsequently ranks them according to their WER. -We see... +Looking at the leaderboard, we see that our model trained for 500 steps convincingly beats the pre-trained [Whisper Small](hf.co/openai/whisper-small) +checkpoint that we evaluated in the previous section. Nice job 👏 + +We see that there are a few checkpoints that do better than the one we trained. The beauty of the Hugging Face Hub is that +it's a *collaborative* platform - if we don't have the time or resources to perform a longer training run ourselves, we +can load a checkpoint that someone else in the community has trained and been kind enough to share (making sure to thank them for it!). +You'll be able to load these checkpoints in exactly the same way as the pre-trained ones using the `pipeline` class as we +did previously! So there's nothing stopping you cherry-picking the best model on the leaderboard to use for your task! We can automatically submit our checkpoint to the leaderboard when we push the training results to the Hub - we simply have to set the appropriate key-word arguments (kwargs). You can change these values to match your dataset, language and @@ -521,8 +534,16 @@ out the upload at `sanchit-gandhi/whisper-small-dv`. While the fine-tuned model yields satisfactory results on the Common Voice 13 Dhivehi test data, it is by no means optimal. The purpose of this guide is to demonstrate how to fine-tune an ASR model using the 🤗 Trainer for multilingual speech -recognition. The results could likely be improved by optimising the training hyperparameters, such as _learning rate_ and -_dropout_, and using a larger pre-trained checkpoint (`medium` or `large`). +recognition. + +If you have access to your own GPU or are subscribed to a Google Colab paid plan, you can increase `max_steps` to 4000 steps +to improve the WER further by training for more steps. Training for 4000 steps will take approximately 3-5 hours depending +on your GPU and yield WER results approximately 3% lower than training for 500 steps. If you decide to train for 4000 steps, +we also recommend changing the learning rate scheduler to a *linear* schedule (set `lr_scheduler_type="linear"`), as this will +yield an additional performance boost over long training runs. + +The results could likely be improved further by optimising the training hyperparameters, such as _learning rate_ and +_dropout_, and using a larger pre-trained checkpoint (`medium` or `large`). We leave this as an exercise to the reader. 
## Sharing Your Model
From da913343d32e52410d6b471eb7a171ba5f1121ee Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Tue, 27 Jun 2023 17:49:42 +0100
Subject: [PATCH 23/24] last todo
---
 chapters/en/chapter5/asr_models.mdx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/chapters/en/chapter5/asr_models.mdx b/chapters/en/chapter5/asr_models.mdx
index b4dd17bc..2de161e8 100644
--- a/chapters/en/chapter5/asr_models.mdx
+++ b/chapters/en/chapter5/asr_models.mdx
@@ -392,6 +392,5 @@ races, ages or other demographic criteria (_c.f._ [Whisper paper](https://arxiv.
To boost the performance on low-resource languages, accents or dialects, we can take the pre-trained Whisper model and
train it on a small corpus of appropriately selected data, in a process called _fine-tuning_. We'll show that with
-as little of 10 hours of additional data, we can improve the performance of the Whisper model by X% on a low-resource
-language. TODO(SG): update with results when available. In the next section, we'll cover the process behind selecting a
-dataset for fine-tuning.
+as little as ten hours of additional data, we can improve the performance of the Whisper model by over 100% on a low-resource
+language. In the next section, we'll cover the process behind selecting a dataset for fine-tuning.
From 7b556e3ab4e30dd944fdf368cb619921ce04790e Mon Sep 17 00:00:00 2001
From: sanchit-gandhi
Date: Tue, 27 Jun 2023 18:00:00 +0100
Subject: [PATCH 24/24] add supplemental reading resources
---
 chapters/en/chapter5/supplemental_reading.mdx | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/chapters/en/chapter5/supplemental_reading.mdx b/chapters/en/chapter5/supplemental_reading.mdx
index 88ff75ab..e350c07b 100644
--- a/chapters/en/chapter5/supplemental_reading.mdx
+++ b/chapters/en/chapter5/supplemental_reading.mdx
@@ -1 +1,10 @@
-# Supplemental reading and resources
\ No newline at end of file
+# Supplemental reading and resources
+
+This unit provided a hands-on introduction to speech recognition, one of the most popular tasks in the audio domain.
+Want to learn more? Here you will find additional resources that will help you deepen your understanding of the topics and
+enhance your learning experience.
+
+* [Whisper Talk](https://www.youtube.com/live/fZMiD8sDzzg?feature=share) by Jong Wook Kim: a presentation on the Whisper model, explaining the motivation, architecture, training and results, delivered by Whisper author Jong Wook Kim
+* [Fine-Tuning Whisper for Multilingual ASR](https://huggingface.co/blog/fine-tune-whisper): an in-depth blog post that explains how the Whisper model works in more detail, and the pre- and post-processing steps involved with the feature extractor and tokenizer
+* [Fine-tuning MMS Adapter Models for Multi-Lingual ASR](https://huggingface.co/blog/mms_adapters): an end-to-end guide for fine-tuning Meta AI's new [MMS](https://ai.facebook.com/blog/multilingual-model-speech-recognition/) speech recognition models, freezing the base model weights and only fine-tuning a small number of *adapter* layers
+* [Boosting Wav2Vec2 with n-grams in 🤗 Transformers](https://huggingface.co/blog/wav2vec2-with-ngram): a blog post on combining CTC models with external language models (LMs) to combat spelling and punctuation errors