diff --git a/chapters/en/_toctree.yml b/chapters/en/_toctree.yml index af5b5c35..2698b1fc 100644 --- a/chapters/en/_toctree.yml +++ b/chapters/en/_toctree.yml @@ -103,27 +103,23 @@ - local: chapter6/supplemental_reading title: Supplemental reading and resources -#- title: Unit 7. Audio to audio -# sections: -# - local: chapter7/introduction -# title: What you'll learn and what you'll build -# - local: chapter7/tasks -# title: Examples of audio-to-audio tasks -# - local: chapter7/choosing_dataset -# title: Choosing a dataset -# - local: chapter7/preprocessing -# title: Loading and preprocessing data -# - local: chapter7/evaluation -# title: Evaluation metrics for audio-to-audio -# - local: chapter7/fine-tuning -# title: Fine-tuning the model +- title: Unit 7. Putting it all together + sections: + - local: chapter7/introduction + title: What you'll learn and what you'll build + - local: chapter7/speech-to-speech + title: Speech-to-speech translation + - local: chapter7/voice-assistant + title: Creating a voice assistant + - local: chapter7/transcribe-meeting + title: Transcribe a meeting # - local: chapter7/quiz # title: Quiz # quiz: 7 -# - local: chapter7/hands_on -# title: Hands-on exercise -# - local: chapter7/supplemental_reading -# title: Supplemental reading and resources + - local: chapter7/hands-on + title: Hands-on exercise + - local: chapter7/supplemental_reading + title: Supplemental reading and resources # #- title: Unit 8. Finish line # sections: diff --git a/chapters/en/chapter7/hands-on.mdx b/chapters/en/chapter7/hands-on.mdx new file mode 100644 index 00000000..691c0c33 --- /dev/null +++ b/chapters/en/chapter7/hands-on.mdx @@ -0,0 +1,47 @@ +# Hands-on exercise + +In this Unit, we consolidated the material covered in the previous six units of the course to build three integrated +audio applications. As you've experienced, building more involved audio tools is fully within reach by using the +foundational skills you've acquired in this course. + +The hands-on exercise takes one of the applications covered in this Unit, and extends it with a few multilingual +tweaks 🌍 Your objective is to take the [cascaded speech-to-speech translation Gradio demo](https://huggingface.co/spaces/course-demos/speech-to-speech-translation) +from the first section in this Unit, and update it to translate to any **non-English** language. That is to say, the +demo should take speech in language X, and translate it to speech in language Y, where the target language Y is not +English. You should start by [duplicating](https://huggingface.co/spaces/course-demos/speech-to-speech-translation?duplicate=true) +the template under your Hugging Face namespace. There's no requirement to use a GPU accelerator device - the free CPU +tier works just fine πŸ€— However, you should ensure that the visibility of your demo is set to **public**. This is required +such that your demo is accessible to us and can thus be checked for correctness. + +Tips for updating the speech translation function to perform multilingual speech translation are provided in the +section on [speech-to-speech translation](speech-to-speech.mdx). By following these instructions, you should be able +to update the demo to translate from speech in language X to text in language Y, which is half of the task! + +To synthesise from text in language Y to speech in language Y, where Y is a multilingual language, you will need +to use a multilingual TTS checkpoint. 
For this, you can either use the SpeechT5 TTS checkpoint that you fine-tuned +in the previous hands-on exercise, or a pre-trained multilingual TTS checkpoint. There are two options for pre-trained +checkpoints, either the checkpoint [sanchit-gandhi/speecht5_tts_vox_nl](https://huggingface.co/sanchit-gandhi/speecht5_tts_vox_nl), +which is a SpeechT5 checkpoint fine-tuned on the Dutch split of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) +dataset, or an MMS TTS checkpoint (see section on [pretrained models for TTS](../chapter6/pre-trained_models.mdx)). + + + In our experience experimenting with the Dutch language, using an MMS TTS checkpoint results in better performance than a + fine-tuned SpeechT5 one, but you might find that your fine-tuned TTS checkpoint is preferable in your language. + If you decide to use an MMS TTS checkpoint, you will need to update the requirements.txt + file of your demo to install transformers from the PR branch: +
`git+https://github.com/hollance/transformers.git@6900e8ba6532162a8613d2270ec2286c3f58f57b`
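Putting these tips together, here is a minimal sketch of what the updated `translate` and `synthesise` functions could look like. It assumes Dutch as the target language Y, the MMS checkpoint `facebook/mms-tts-nld`, and the `VitsModel` / `VitsTokenizer` classes for MMS TTS (the exact class names may differ if you're on an older branch). Swap in the checkpoint and Whisper language code for the language you choose, and treat this as a starting point rather than a reference solution:

```python
import torch
from transformers import VitsModel, VitsTokenizer, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# speech translation: ask Whisper to 'transcribe' directly into the target language (here Dutch)
pipe = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

# text-to-speech: a multilingual MMS TTS checkpoint for the target language
tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-nld")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-nld")


def translate(audio):
    outputs = pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "transcribe", "language": "nl"},
    )
    return outputs["text"]


def synthesise(text):
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = tts_model(**inputs)
    return outputs.waveform[0]
```

Since MMS TTS also generates audio at a sampling rate of 16 kHz, the main `speech_to_speech_translation` function from the template can be left as is.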
+ + +Your demo should take as input an audio file, and return as output another audio file, matching the signature of the +[`speech_to_speech_translation`](https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/3946ba6705a6632a63de8672ac52a482ab74b3fc/app.py#L35) +function in the template demo. Therefore, we recommend that you leave the main function `speech_to_speech_translation` +as is, and only update the [`translate`](https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/a03175878f522df7445290d5508bfb5c5178f787/app.py#L24) +and [`synthesise`](https://huggingface.co/spaces/course-demos/speech-to-speech-translation/blob/a03175878f522df7445290d5508bfb5c5178f787/app.py#L29) +functions as required. + +Once you have built your demo as a Gradio demo on the Hugging Face Hub, you can submit it for assessment. Head to the +Space [audio-course-u7-assessment](https://huggingface.co/spaces/huggingface-course/audio-course-u7-assessment) and +provide the repository id of your demo when prompted. This Space will check that your demo has been built correctly by +sending a sample audio file to your demo and checking that the returned audio file is indeed non-English. If your demo +works correctly, you'll get a green tick next to your name on the overall [progress space](https://huggingface.co/spaces/MariaK/Check-my-progress-Audio-Course) βœ… diff --git a/chapters/en/chapter7/introduction.mdx b/chapters/en/chapter7/introduction.mdx new file mode 100644 index 00000000..3e0de22d --- /dev/null +++ b/chapters/en/chapter7/introduction.mdx @@ -0,0 +1,16 @@ +# Unit 7. Putting it all together πŸͺ’ + +Well done on making it to Unit 7 πŸ₯³ You're just a few steps away from completing the course and acquiring the final few +skills you need to navigate the field of Audio ML. In terms of understanding, you already know everything there is to know! +Together, we've comprehensively covered the main topics that constitute the audio domain and their accompanying theory +(audio data, audio classification, speech recognition and text-to-speech). What this Unit aims to deliver is a framework +for **putting it all together**: now that you know how each of these tasks work in isolation, we're going to explore how +you can combine them together to build some real-world applications. + +## What you'll learn and what you'll build + +In this Unit, we'll cover the following three topics: + +* [Speech-to-speech translation](speech-to-speech): translate speech from one language into speech in a different language +* [Creating a voice assistant](voice-assistant): build your own voice assistant that works in a similar way to Alexa or Siri +* [Transcribing meetings](transcribe-meeting): transcribe a meeting and label the transcript with who spoke when diff --git a/chapters/en/chapter7/speech-to-speech.mdx b/chapters/en/chapter7/speech-to-speech.mdx new file mode 100644 index 00000000..05049fde --- /dev/null +++ b/chapters/en/chapter7/speech-to-speech.mdx @@ -0,0 +1,262 @@ +# Speech-to-speech translation + +Speech-to-speech translation (STST or S2ST) is a relatively new spoken language processing task. It involves translating +speech from one langauge into speech in a **different** language: + +
*(Figure: diagram of speech-to-speech translation)*
+ +STST can be viewed as an extension of the traditional machine translation (MT) task: instead of translating **text** from one +language into another, we translate **speech** from one language into another. STST holds applications in the field of +multilingual communication, enabling speakers in different languages to communicate with one another through the medium +of speech. + +Suppose you want to communicate with another individual across a langauge barrier. Rather +than writing the information that you want to convey and then translating it to text in the target language, you +can speak it directly and have a STST system convert your spoken speech into the target langauge. The recipient can then +respond by speaking back at the STST system, and you can listen to their response. This is a more natural way of communicating +compared to text-based machine translation. + +In this chapter, we'll explore a *cascaded* approach to STST, piecing together the knowledge you've acquired in Units +5 and 6 of the course. We'll use a *speech translation (ST)* system to transcribe the source speech into text in the target +language, then *text-to-speech (TTS)* to generate speech in the target language from the translated text: + +
*(Figure: diagram of cascaded speech-to-speech translation)*
+ +We could also have used a three stage approach, where first we use an automatic speech recognition (ASR) system to +transcribe the source speech into text in the same language, then machine translation to translate the transcribed text +into the target language, and finally text-to-speech to generate speech in the target language. However, adding more +components to the pipeline lends itself to *error propagation*, where the errors introduced in one system are compounded +as they flow through the remaining systems, and also increases latency, since inference has to be conducted for more models. + +While this cascaded approach to STST is pretty straightforward, it results in very effective STST systems. The three-stage +cascaded system of ASR + MT + TTS was previously used to power many commercial STST products, including [Google Translate](https://ai.googleblog.com/2019/05/introducing-translatotron-end-to-end.html). +It's also a very data and compute efficient way of developing a STST system, since existing speech recognition and +text-to-speech systems can be coupled together to yield a new STST model without any additional training. + +In the remainder of this Unit, we'll focus on creating a STST system that translates speech from any language X to speech +in English. The methods covered can be extended to STST systems that translate from any language X to any +langauge Y, but we leave this as an extension to the reader and provide pointers where applicable. We further divide up the +task of STST into its two constituent components: ST and TTS. We'll finish by piecing them together to build a Gradio +demo to showcase our system. + +## Speech translation + +We'll use the Whisper model for our speech translation system, since it's capable of translating from over 96 languages +to English. Specifically, we'll load the [Whisper Base](https://huggingface.co/openai/whisper-base) checkpoint, which +clocks in at 74M parameters. It's by no means the most performant Whisper model, with the [largest Whisper checkpoint](https://huggingface.co/openai/whisper-large-v2) +being over 20x larger, but since we're concatenating two auto-regressive systems together (ST + TTS), we want to ensure +each model can generate relatively quickly so that we get reasonable inference speed: + +```python +import torch +from transformers import pipeline + +device = "cuda:0" if torch.cuda.is_available() else "cpu" +pipe = pipeline( + "automatic-speech-recognition", model="openai/whisper-base", device=device +) +``` + +Great! To test our STST system, we'll load an audio sample in a non-English language. Let's load the first example of the +Italian (`it`) split of the [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) dataset: + +```python +from datasets import load_dataset + +dataset = load_dataset("facebook/voxpopuli", "it", split="validation", streaming=True) +sample = next(iter(dataset)) +``` + +To listen to this sample, we can either play it using the dataset viewer on the Hub: [facebook/voxpopuli/viewer](https://huggingface.co/datasets/facebook/voxpopuli/viewer/it/validation?row=0) + +Or playback using the ipynb audio feature: + +```python +from IPython.display import Audio + +Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"]) +``` + +Now let's define a function that takes this audio input and returns the translated text. 
You'll remember that we have to +pass the generation key-word argument for the `"task"`, setting it to `"translate"` to ensure that Whisper performs +speech translation and not speech recognition: + +```python +def translate(audio): + outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"}) + return outputs["text"] +``` + + + + Whisper can also be 'tricked' into translating from speech in any language X to any language Y. Simply set the task to + `"transcribe"` and the `"language"` to your target language in the generation key-word arguments, + e.g. for Spanish, one would set: + + `generate_kwargs={"task": "transcribe", "language": "es"}` + + + +Great! Let's quickly check that we get a sensible result from the model: + +```python +translate(sample["audio"].copy()) +``` +``` +' psychological and social. I think that it is a very important step in the construction of a juridical space of freedom, circulation and protection of rights.' +``` + +Alright! If we compare this to the source text: + +```python +sample["raw_text"] +``` +``` +'Penso che questo sia un passo in avanti importante nella costruzione di uno spazio giuridico di libertΓ  di circolazione e di protezione dei diritti per le persone in Europa.' +``` + +We see that the translation more or less lines up (you can double check this using Google Translate), barring a small +extra few words at the start of the transcription where the speaker was finishing off their previous sentence. + +With that, we've completed the first half of our cascaded STST pipeline, putting into practice the skills we gained in Unit 5 +when we learnt how to use the Whisper model for speech recognition and translation. If you want a refresher on any of the +steps we covered, have a read through the section on [Pre-trained models for ASR](../chapter5/asr_models.mdx) from Unit 5. + +## Text-to-speech + +The second half of our cascaded STST system involves mapping from English text to English speech. For this, we'll use +the pre-trained [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for English TTS. πŸ€— Transformers currently doesn't +have a TTS `pipeline`, so we'll have to use the model directly ourselves. This is no biggie, you're all experts on using +the model for inference following Unit 6! + +First, let's load the SpeechT5 processor, model and vocoder from the pre-trained checkpoint: + +```python +from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan + +processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") + +model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") +vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") +``` + + + Here we're using SpeechT5 checkpoint trained specifically for English TTS. Should you wish to translate into a language + other than English, either swap the checkpoint for a SpeechT5 TTS model fine-tuned on your language of choice, or + use an MMS TTS checkpoint pre-trained in your target langauge. + + +As with the Whisper model, we'll place the SpeechT5 model and vocoder on our GPU accelerator device if we have one: +```python +model.to(device) +vocoder.to(device) +``` + +Great! 
Let's load up the speaker embeddings: + +```python +embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") +speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) +``` + +We can now write a function that takes a text prompt as input, and generates the corresponding speech. We'll first pre-process +the text input using the SpeechT5 processor, tokenizing the text to get our input ids. We'll then pass the input ids and +speaker embeddings to the SpeechT5 model, placing each on the accelerator device if available. Finally, we'll return the +generated speech, bringing it back to the CPU so that we can play it back in our ipynb notebook: + +```python +def synthesise(text): + inputs = processor(text=text, return_tensors="pt") + speech = model.generate_speech( + inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder + ) + return speech.cpu() +``` + +Let's check it works with a dummy text input: +```python +speech = synthesise("Hey there! This is a test!") + +Audio(speech, rate=16000) +``` + +Sounds good! Now for the exciting part - piecing it all together. + +## Creating a STST demo + +Before we create a [Gradio](https://gradio.app) demo to showcase our STST system, let's first do a quick sanity check +to make sure we can concatenate the two models, putting an audio sample in and getting an audio sample out. We'll do +this by concatenating the two functions we defined in the previous two sub-sections, such that we input the source audio +and retrieve the translated text, then synthesise the translated text to get the translated speech. Finally, we'll convert +the synthesised speech to an `int16` array, which is the output audio file format expected by Gradio. To do this, we +first have to normalise the audio array by the dynamic range of the target dtype (`int16`), and then convert from the +default NumPy dtype (`float64`) to the target dtype (`int16`): + +```python +import numpy as np + +target_dtype = np.int16 +max_range = np.iinfo(target_dtype).max + + +def speech_to_speech_translation(audio): + translated_text = translate(audio) + synthesised_speech = synthesise(translated_text) + synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16) + return 16000, synthesised_speech +``` + +Let's check this concatenated function gives the expected result: + +```python +sampling_rate, synthesised_speech = speech_to_speech_translation(sample["audio"]) + +Audio(synthesised_speech, rate=sampling_rate) +``` + +Perfect! 
Now we'll wrap this up into a nice Gradio demo so that we can record our source speech using a microphone input +or file input and playback the system's prediction: + +```python +import gradio as gr + +demo = gr.Blocks() + +mic_translate = gr.Interface( + fn=speech_to_speech_translation, + inputs=gr.Audio(source="microphone", type="filepath"), + outputs=gr.Audio(label="Generated Speech", type="numpy"), +) + +file_translate = gr.Interface( + fn=speech_to_speech_translation, + inputs=gr.Audio(source="upload", type="filepath"), + outputs=gr.Audio(label="Generated Speech", type="numpy"), +) + +with demo: + gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"]) + +demo.launch(debug=True) +``` + +This will launch a Gradio demo similar to the one running on the Hugging Face Space: + + + +You can [duplicate](https://huggingface.co/spaces/course-demos/speech-to-speech-translation?duplicate=true) this demo and adapt +it to use a different Whisper checkpoint, a different TTS checkpoint, or relax the constraint of outputting English +speech and follow the tips provide for translating into a langauge of your choice! + +## Going forwards + +While the cascaded system is a compute and data efficient way of building a STST system, it suffers from the issues of +error propagation and additive latency described above. Recent works have explored a *direct* approach to STST, one that +does not predict an intermediate text output and instead maps directly from source speech to target speech. These systems +are also capable of retaining the speaking characteristics of the source speaker in the target speech (such a prosody, +pitch and intonation). If you're interested in finding out more about these systems, check-out the resources listed in +the section on [supplemental reading](supplemental_reading). diff --git a/chapters/en/chapter7/supplemental_reading.mdx b/chapters/en/chapter7/supplemental_reading.mdx new file mode 100644 index 00000000..97aed728 --- /dev/null +++ b/chapters/en/chapter7/supplemental_reading.mdx @@ -0,0 +1,19 @@ +# Supplemental reading and resources + +This Unit pieced together many components from previous units, introducing the tasks of speech-to-speech translation, +voice assistants and speaker diarization. 
The supplemental reading material is thus split into these three new tasks +for your convenience: + +Speech-to-speech translation: +* [STST with discrete units](https://ai.facebook.com/blog/advancing-direct-speech-to-speech-modeling-with-discrete-units/) by Meta AI: a direct approach to STST through encoder-decoder models +* [Hokkien direct speech-to-speech translation](https://ai.facebook.com/blog/ai-translation-hokkien/) by Meta AI: a direct approach to STST using encoder-decoder models with a two-stage decoder +* [Leveraging unsupervised and weakly-supervised data to improve direct STST](https://arxiv.org/abs/2203.13339) by Google: proposes new approaches for leveraging unsupervised and weakly supervised data for training direct STST models and a small change to the Transformer architecture +* [Translatotron-2](https://google-research.github.io/lingvo-lab/translatotron2/) by Google: a system that is able to retain speaker characteristics in translated speech + +Voice Assistant: +* [Accurate wakeword detection](https://www.amazon.science/publications/accurate-detection-of-wake-word-start-and-end-using-a-cnn) by Amazon: a low latency approach for wakeword detection for on-device applications +* [RNN-Transducer Architecture](https://arxiv.org/pdf/1811.06621.pdf) by Google: a modification to the CTC architecture for streaming on-device ASR + +Meeting Transcriptions: +* [pyannote.audio Technical Report](https://huggingface.co/pyannote/speaker-diarization/blob/main/technical_report_2.1.pdf) by HervΓ© Bredin: this report describes the main principles behind the `pyannote.audio` speaker diarization pipeline +* [Whisper X](https://arxiv.org/pdf/2303.00747.pdf) by Max Bain et al.: a superior approach to computing word-level timestamps using the Whisper model \ No newline at end of file diff --git a/chapters/en/chapter7/transcribe-meeting.mdx b/chapters/en/chapter7/transcribe-meeting.mdx new file mode 100644 index 00000000..0f8a0118 --- /dev/null +++ b/chapters/en/chapter7/transcribe-meeting.mdx @@ -0,0 +1,231 @@ +# Transcribe a meeting + +In this final section, we'll use the Whisper model to generate a transcription for a conversation or meeting between +two or more speakers. We'll then pair it with a *speaker diarization* model to predict "who spoke when". By matching +the timestamps from the Whisper transcriptions with the timestamps from the speaker diarization model, we can predict an +end-to-end meeting transcription with fully formatted start / end times for each speaker. This is a basic version of +the meeting transcription services you might have seen online from the likes of [Otter.ai](https://otter.ai) and co: + +
*(Image: an example speaker-segmented meeting transcription)*
+ +## Speaker Diarization + +Speaker diarization (or diarisation) is the task of taking an unlabelled audio input and predicting "who spoke when". +In doing so, we can predict start / end timestamps for each speaker turn, corresponding to when each speaker starts +speaking and when they finish. + +πŸ€— Transformers currently does not have a model for speaker diarization included in the library, but there are checkpoints +on the Hub that can be used with relative ease. In this example, we'll use the pre-trained speaker diarization model from +[pyannote.audio](https://github.com/pyannote/pyannote-audio). Let's get started and pip install the package: + +```bash +pip install --upgrade pyannote.audio +``` + +Great! The weights for this model are hosted on the Hugging Face Hub. To access them, we first have to agree to the speaker diarization model's +terms of use: [pyannote/speaker-diarization](https://huggingface.co/pyannote/speaker-diarization). And subsequently the +segmentation model's terms of use: [pyannote/segmentation](https://huggingface.co/pyannote/segmentation). + +Once complete, we can load the pre-trained speaker diarization pipeline locally on our device: + +```python +from pyannote.audio import Pipeline + +diarization_pipeline = Pipeline.from_pretrained( + "pyannote/speaker-diarization@2.1", use_auth_token=True +) +``` + +Let's try it out on a sample audio file! For this, we'll load a sample of the [LibriSpeech ASR](https://huggingface.co/datasets/librispeech_asr) +dataset that consists of two different speakers that have been concatenated together to give a single audio file: + +```python +from datasets import load_dataset + +concatenated_librispeech = load_dataset( + "sanchit-gandhi/concatenated_librispeech", split="train", streaming=True +) +sample = next(iter(concatenated_librispeech)) +``` + +We can listen to the audio to see what it sounds like: + +```python +from IPython.display import Audio + +Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"]) +``` + +Cool! We can clearly hear two different speakers, with a transition roughly 15s of the way through. Let's pass this audio +file to the diarization model to get the speaker start / end times. Note that pyannote.audio expects the audio input to be a +PyTorch tensor of shape `(channels, seq_len)`, so we need to perform this conversion prior to running the model: + +```python +import torch + +input_tensor = torch.from_numpy(sample["audio"]["array"][None, :]).float() +outputs = diarization_pipeline( + {"waveform": input_tensor, "sample_rate": sample["audio"]["sampling_rate"]} +) + +outputs.for_json()["content"] +``` + +```text +[{'segment': {'start': 0.4978125, 'end': 14.520937500000002}, + 'track': 'B', + 'label': 'SPEAKER_01'}, + {'segment': {'start': 15.364687500000002, 'end': 21.3721875}, + 'track': 'A', + 'label': 'SPEAKER_00'}] +``` + +This looks pretty good! We can see that the first speaker is predicted as speaking up until the 14.5 second mark, and the +second speaker from 15.4s onwards. Now we need to get our transcription! + +## Speech transcription + +For the third time in this Unit, we'll use the Whisper model for our speech transcription system. Specifically, we'll load the +[Whisper Base](https://huggingface.co/openai/whisper-base) checkpoint, since it's small enough to give good +inference speed with reasonable transcription accuracy. 
As before, feel free to use any speech recognition checkpoint +on [the Hub](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=transformers&sort=trending), +including Wav2Vec2, MMS ASR or other Whisper checkpoints: + +```python +from transformers import pipeline + +asr_pipeline = pipeline( + "automatic-speech-recognition", + model="openai/whisper-base", +) +``` + +Let's get the transcription for our sample audio, returning the segment level timestamps as well so that we know the +start / end times for each segment. You'll remember from Unit 5 that we need to pass the argument +`return_timestamps=True` to activate the timestamp prediction task for Whisper: + +```python +asr_pipeline( + sample["audio"].copy(), + generate_kwargs={"max_new_tokens": 256}, + return_timestamps=True, +) +``` + +```text +{ + "text": " The second and importance is as follows. Sovereignty may be defined to be the right of making laws. In France, the king really exercises a portion of the sovereign power, since the laws have no weight. He was in a favored state of mind, owing to the blight his wife's action threatened to cast upon his entire future.", + "chunks": [ + {"timestamp": (0.0, 3.56), "text": " The second and importance is as follows."}, + { + "timestamp": (3.56, 7.84), + "text": " Sovereignty may be defined to be the right of making laws.", + }, + { + "timestamp": (7.84, 13.88), + "text": " In France, the king really exercises a portion of the sovereign power, since the laws have", + }, + {"timestamp": (13.88, 15.48), "text": " no weight."}, + { + "timestamp": (15.48, 19.44), + "text": " He was in a favored state of mind, owing to the blight his wife's action threatened to", + }, + {"timestamp": (19.44, 21.28), "text": " cast upon his entire future."}, + ], +} +``` + +Alright! We see that each segment of the transcript has a start and end time, with the speakers changing at the 15.48 second +mark. We can now pair this transcription with the speaker timestamps that we got from our diarization model to get our +final transcription. + +## Speechbox + +To get the final transcription, we'll align the timestamps from the diarization model with those from the Whisper model. +The diarization model predicted the first speaker to end at 14.5 seconds, and the second speaker to start at 15.4s, whereas Whisper predicted segment boundaries at +13.88, 15.48 and 19.44 seconds respectively. Since the timestamps from Whisper don't match perfectly with those from the +diarization model, we need to find which of these boundaries are closest to 14.5 and 15.4 seconds, and segment the transcription by +speakers accordingly. Specifically, we'll find the closest alignment between diarization and transcription timestamps by +minimising the absolute distance between both. + +Luckily for us, we can use the πŸ€— Speechbox package to perform this alignment. 
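To make the alignment criterion concrete, here is a minimal sketch of the idea as a standalone helper (a simplified, hypothetical implementation for illustration, not the actual Speechbox code): for each speaker turn, we take the boundary of that turn from the diarization model, find the transcribed chunk whose end timestamp is closest to that boundary, and assign every chunk up to that point to the speaker:

```python
def align_speakers_to_chunks(diarization_segments, transcript_chunks):
    # diarization_segments: [{"segment": {"start": ..., "end": ...}, "label": "SPEAKER_01"}, ...]
    # transcript_chunks: [{"text": ..., "timestamp": (start, end)}, ...]
    chunk_ends = [chunk["timestamp"][1] for chunk in transcript_chunks]
    aligned, pointer = [], 0
    for turn_idx, turn in enumerate(diarization_segments):
        if pointer >= len(transcript_chunks):
            break
        # the boundary of this speaker's turn: the start of the next speaker's turn,
        # or the end of this turn if it's the final one
        if turn_idx + 1 < len(diarization_segments):
            boundary = diarization_segments[turn_idx + 1]["segment"]["start"]
        else:
            boundary = turn["segment"]["end"]
        # index of the transcribed chunk whose end timestamp is closest to that boundary
        closest = min(
            range(pointer, len(chunk_ends)),
            key=lambda idx: abs(chunk_ends[idx] - boundary),
        )
        aligned.append(
            {
                "speaker": turn["label"],
                "text": "".join(
                    chunk["text"] for chunk in transcript_chunks[pointer : closest + 1]
                ),
                "timestamp": (transcript_chunks[pointer]["timestamp"][0], chunk_ends[closest]),
            }
        )
        pointer = closest + 1
    return aligned
```

You could feed this the `content` list from the diarization output and the `chunks` list from the transcription output above. Speechbox implements a more careful version of the same idea (merging consecutive segments from the same speaker, for example), so let's use it rather than rolling our own.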
First, let's pip install `speechbox` from +main: + +```bash +pip install git+https://github.com/huggingface/speechbox +``` + +We can now instantiate our combined diarization plus transcription pipeline, by passing the diarization model and +ASR model to the [`ASRDiarizationPipeline`](https://github.com/huggingface/speechbox/tree/main#asr-with-speaker-diarization) class: + +```python +from speechbox import ASRDiarizationPipeline + +pipeline = ASRDiarizationPipeline( + asr_pipeline=asr_pipeline, diarization_pipeline=diarization_pipeline +) +``` + + + You can also instantiate the ASRDiarizationPipeline directly from pre-trained by specifying the model id + of an ASR model on the Hub: +
`pipeline = ASRDiarizationPipeline.from_pretrained("openai/whisper-base")`
+ +Let's pass the audio file to the composite pipeline and see what we get out: + +```python +pipeline(sample["audio"].copy()) +``` + +```text +[{'speaker': 'SPEAKER_01', + 'text': ' The second and importance is as follows. Sovereignty may be defined to be the right of making laws. In France, the king really exercises a portion of the sovereign power, since the laws have no weight.', + 'timestamp': (0.0, 15.48)}, + {'speaker': 'SPEAKER_00', + 'text': " He was in a favored state of mind, owing to the blight his wife's action threatened to cast upon his entire future.", + 'timestamp': (15.48, 21.28)}] +``` + +Excellent! The first speaker is segmented as speaking from 0 to 15.48 seconds, and the second speaker from 15.48 to 21.28 seconds, +with the corresponding transcriptions for each. + +We can format the timestamps a little more nicely by defining two helper functions. The first converts a tuple of +timestamps to a string, rounded to a set number of decimal places. The second combines the speaker id, timestamp and text +information onto one line, and splits each speaker onto their own line for ease of reading: + +```python +def tuple_to_string(start_end_tuple, ndigits=1): + return str((round(start_end_tuple[0], ndigits), round(start_end_tuple[1], ndigits))) + + +def format_as_transcription(raw_segments): + return "\n\n".join( + [ + chunk["speaker"] + " " + tuple_to_string(chunk["timestamp"]) + chunk["text"] + for chunk in raw_segments + ] + ) +``` + +Let's re-run the pipeline, this time formatting the transcription according to the function we've just defined: +```python +outputs = pipeline(sample["audio"].copy()) + +format_as_transcription(outputs) +``` + +```text +SPEAKER_01 (0.0, 15.5) The second and importance is as follows. Sovereignty may be defined to be the right of making laws. +In France, the king really exercises a portion of the sovereign power, since the laws have no weight. + +SPEAKER_00 (15.5, 21.3) He was in a favored state of mind, owing to the blight his wife's action threatened to cast upon +his entire future. +``` + +There we go! With that, we've both diarized and transcribe our input audio and returned speaker-segmented transcriptions. +While the minimum distance algoirthm to align the diarized timestamps and transcribed timestamps is simple, it +works well in practice. If you want to explore more advanced methods for combining the timestamps, the +source code for the `ASRDiarizationPipeline` is a good place to start: [speechbox/diarize.py](https://github.com/huggingface/speechbox/blob/96d2d1a180252d92263f862a1cd25a48860f1aed/src/speechbox/diarize.py#L12) diff --git a/chapters/en/chapter7/voice-assistant.mdx b/chapters/en/chapter7/voice-assistant.mdx new file mode 100644 index 00000000..287743b2 --- /dev/null +++ b/chapters/en/chapter7/voice-assistant.mdx @@ -0,0 +1,466 @@ +# Creating a voice assistant + +In this section, we'll piece together three models that we've already had hands-on experience with to build an end-to-end +voice assistant called **Marvin** πŸ€–. Like Amazon's Alexa or Apple's Siri, Marvin is a virtual voice assistant who +responds to a particular 'wake word', then listens out for a spoken query, and finally responds with a spoken answer. + +We can break down the voice assistant pipeline into four stages, each of which requires a standalone model: + +
*(Diagram: the four stages of the voice assistant pipeline)*
+ +### 1. Wake word detection + +Voice assistants are constantly listening to the audio inputs coming through your device's microphone, however they only +boot into action when a particular 'wake word' or 'trigger word' is spoken. + +The wake word detection task is handled by a small on-device audio classification model, which is much smaller and lighter +than the speech recognition model, often only several millions of parameters compared to several hundred millions for +speech recognition. Thus, it can be run continuously on your device without draining your battery. Only when the wake +word is detected is the larger speech recognition model launched, and afterwards it is shut down again. + +### 2. Speech transcription + +The next stage in the pipeline is transcribing the spoken query to text. In practice, transferring audio files from your +local device to the Cloud is slow due to the large nature of audio files, so it's more efficient to transcribe them +directly using an automatic speech recognition (ASR) model on-device rather than using a model in the Cloud. The on-device +model might be smaller and thus less accurate than one hosted in the Cloud, but the faster inference speed makes it +worthwhile since we can run speech recognition in near real-time, our spoken audio utterance being transcribed as we say it. + +We're very familiar with the speech recognition process now, so this should be a piece of cake! + +### 3. Language model query + +Now that we know what the user asked, we need to generate a response! The best candidate models for this task are +*large language models (LLMs)*, since they are effectively able to understand the semantics of the text query and +generate a suitable response. + +Since our text query is small (just a few text tokens), and language models large (many billions of parameters), the most +efficient way of running LLM inference is to send our text query from our device to an LLM running in the Cloud, +generate a text response, and return the response back to the device. + +### 4. Synthesise speech + +Finally, we'll use a text-to-speech (TTS) model to synthesise the text response as spoken speech. This is done +on-device, but you could feasibly run a TTS model in the Cloud, generating the audio output and transferring it back to +the device. + +Again, we've done this several times now, so the process will be very familiar! + +## Wake word detection + +The first stage in the voice assistant pipeline is detecting whether the wake word was spoken, and we need to find ourselves +an appropriate pre-trained model for this task! You'll remember from the section on [pre-trained models for audio classification](../chapter4/classification_models.mdx) +that [Speech Commands](https://huggingface.co/datasets/speech_commands) is a dataset of spoken words designed to +evaluate audio classification models on 15+ simple command words like `"up"`, `"down"`, `"yes"` and `"no"`, as well as a +`"silence"` label to classify no speech. Take a minute to listen through the samples on the datasets viewer on +the Hub and re-acquaint yourself with the Speech Commands dataset: [datasets viewer](https://huggingface.co/datasets/speech_commands/viewer/v0.01/train). + +We can take an audio classification model pre-trained on the Speech Commands dataset and pick one of these simple command +words to be our chosen wake word. 
Out of the 15+ possible command words, if the model predicts our chosen wake word with the +highest probability, we can be fairly certain that the wake word has been said. + +Let's head to the Hugging Face Hub and click on the "Models" tab: https://huggingface.co/models + +This is going to bring up all the models on the Hugging Face Hub, sorted by downloads in the past 30 days: + +
*(Screenshot: the Hugging Face Hub models page, sorted by downloads)*
You'll notice on the left-hand side that we have a selection of filters that we can use to narrow the models down by task, library, dataset, and more. Scroll down and select the task "Audio Classification" from the list of audio tasks:
*(Screenshot: filtering models by the Audio Classification task)*
We're now presented with the subset of 500+ audio classification models on the Hub. To refine this selection further, we can filter models by dataset. Click on the "Datasets" tab, and in the search box type "speech_commands". As you begin typing, you'll see the selection for `speech_commands` appear underneath the search tab. You can click this button to filter all audio classification models to those fine-tuned on the Speech Commands dataset:
*(Screenshot: filtering audio classification models by the Speech Commands dataset)*
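If you prefer searching programmatically, you can get the same list with the `huggingface_hub` library. The snippet below is a minimal sketch: the tag names ("audio-classification" and "dataset:speech_commands") and the `modelId` attribute are assumptions about the Hub API that may vary slightly between `huggingface_hub` versions:

```python
from huggingface_hub import HfApi

api = HfApi()

# list audio classification models tagged with the Speech Commands dataset,
# most downloaded first
models = api.list_models(
    filter=["audio-classification", "dataset:speech_commands"],
    sort="downloads",
    direction=-1,
)

for model in models:
    print(model.modelId)
```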
+ +Great! We see that we have six pre-trained models available to us for this specific dataset and task (although there may +be new models added if you're reading at a later date!). You'll recognise the first of these models as the [Audio Spectrogram Transformer checkpoint](https://huggingface.co/MIT/ast-finetuned-speech-commands-v2) +that we used in Unit 4 example. We'll use this checkpoint again for our wake word detection task. + +Let's go ahead and load the checkpoint using the `pipeline` class: + +```python +from transformers import pipeline +import torch + +device = "cuda:0" if torch.cuda.is_available() else "cpu" + +classifier = pipeline( + "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device +) +``` + +We can check what labels the model was trained on by checking the `id2label` attribute in the model config: +```python +classifier.model.config.id2label +``` + +Alright! We see that the model was trained on 35 class labels, including some simple command words that we described above, +as well as some particular objects like `"bed"`, `"house"` and `"cat"`. We see that there is one name in these class labels: +id 27 corresponds to the label **"marvin"**: + +```python +classifier.model.config.id2label[27] +``` + +``` +'marvin' +``` + +Perfect! We can use this name as our wake word for our voice assistant, similar to how "Alexa" is used for Amazon's Alexa, +or "Hey Siri" is used for Apple's Siri. Of all the possible labels, if the model predicts `"marvin"` with the highest class +probability, we can be fairly sure that our chosen wake word has been said. + +Now we need to define a function that is constantly listening to our device's microphone input, and continuously +passes the audio to the classification model for inference. To do this, we'll use a handy helper function that comes +with πŸ€— Transformers called [`ffmpeg_microphone_live`](https://github.com/huggingface/transformers/blob/fb78769b9c053876ed7ae152ee995b0439a4462a/src/transformers/pipelines/audio_utils.py#L98). + +This function forwards small chunks of audio of specified length `chunk_length_s` to the model to be classified. To ensure that +we get smooth boundaries across chunks of audio, we run a sliding window across our audio with stride `chunk_length_s / 6`. +So that we don't have to wait for the entire first chunk to be recorded before we start inferring, we also define a minimal +temporary audio input length `stream_chunk_s` that is forwarded to the model before `chunk_length_s` time is reached. + +The function `ffmpeg_microphone_live` returns a *generator* object, yielding a sequence of audio chunks that can each +be passed to the classification model to make a prediction. We can pass this generator directly to the `pipeline`, +which in turn returns a sequence of output predictions, one for each chunk of audio input. We can inspect the class +label probabilities for each audio chunk, and stop our wake word detection loop when we detect that the wake word +has been spoken. + +We'll use a very simple criteria for classifying whether our wake word was spoken: if the class label with the highest +probability was our wake word, and this probability exceeds a threshold `prob_threshold`, we declare that the wake word +as having been spoken. Using a probability threshold to gate our classifier this way ensures that the wake word is not +erroneously predicted if the audio input is noise, which is typically when the model is very uncertain and all the class +label probabilities low. 
You might want to tune this probability threshold, or explore more sophisticated means for +the wake word decision through an [*entropy*](https://en.wikipedia.org/wiki/Entropy_(information_theory)) (or uncertainty) based metric. + +```python +from transformers.pipelines.audio_utils import ffmpeg_microphone_live + + +def launch_fn( + wake_word="marvin", + prob_threshold=0.5, + chunk_length_s=2.0, + stream_chunk_s=0.25, + debug=False, +): + if wake_word not in classifier.model.config.label2id.keys(): + raise ValueError( + f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}." + ) + + sampling_rate = classifier.feature_extractor.sampling_rate + + mic = ffmpeg_microphone_live( + sampling_rate=sampling_rate, + chunk_length_s=chunk_length_s, + stream_chunk_s=stream_chunk_s, + ) + + print("Listening for wake word...") + for prediction in classifier(mic): + prediction = prediction[0] + if debug: + print(prediction) + if prediction["label"] == wake_word: + if prediction["score"] > prob_threshold: + return True +``` + +Let's give this function a try to see how it works! We'll set the flag `debug=True` to print out the prediction for each +chunk of audio. Let the model run for a few seconds to see the kinds of predictions that it makes when there is no speech +input, then clearly say the wake word `"marvin"` and watch the class label prediction for `"marvin"` spike to near 1: + +```python +launch_fn(debug=True) +``` + +```text +Listening for wake word... +{'score': 0.055326107889413834, 'label': 'one'} +{'score': 0.05999856814742088, 'label': 'off'} +{'score': 0.1282748430967331, 'label': 'five'} +{'score': 0.07310110330581665, 'label': 'follow'} +{'score': 0.06634809821844101, 'label': 'follow'} +{'score': 0.05992642417550087, 'label': 'tree'} +{'score': 0.05992642417550087, 'label': 'tree'} +{'score': 0.999913215637207, 'label': 'marvin'} +``` + +Awesome! As we expect, the model generates garbage predictions for the first few seconds. There is no speech input, so the +model makes close to random predictions, but with very low probability. As soon as we say the wake word, the model predicts +`"marvin"` with probability close to 1 and terminates the loop, signalling that the wake word has been detected and that the +ASR system should be activated! + +## Speech transcription + +Once again, we'll use the Whisper model for our speech transcription system. Specifically, we'll load the [Whisper Base English](https://huggingface.co/openai/whisper-base.en) +checkpoint, since it's small enough to give good inference speed with reasonable transcription accuracy. We'll use a trick +to get near real-time transcription by being clever with how we forward our audio inputs to the model. As before, feel +free to use any speech recognition checkpoint on [the Hub](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&library=transformers&sort=trending), +including Wav2Vec2, MMS ASR or other Whisper checkpoints: + +```python +transcriber = pipeline( + "automatic-speech-recognition", model="openai/whisper-base.en", device=device +) +``` + + + If you're using a GPU, you can increase the checkpoint size to use the Whisper Small English + checkpoint, which will return better transcription accuracy and still be within the required latency threshold. Simply swap the + model id to: "openai/whisper-small.en". + + +We can now define a function to record our microphone input and transcribe the corresponding text. 
With the `ffmpeg_microphone_live` +helper function, we can control how 'real-time' our speech recognition model is. Using a smaller `stream_chunk_s` lends +itself to more real-time speech recognition, since we divide our input audio into smaller chunks and transcribe them on +the fly. However, this comes at the expense of poorer accuracy, since there's less context for the model to infer from. + +As we're transcribing the speech, we also need to have an idea of when the user **stops** speaking, so that we can terminate +the recording. For simplicity, we'll terminate our microphone recording after the first `chunk_length_s` (which is set to +5 seconds by default), but you can experiment with using a [voice activity detection (VAD)](https://huggingface.co/models?pipeline_tag=voice-activity-detection&sort=trending) +model to predict when the user has stopped speaking. + +```python +import sys + + +def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0): + sampling_rate = transcriber.feature_extractor.sampling_rate + + mic = ffmpeg_microphone_live( + sampling_rate=sampling_rate, + chunk_length_s=chunk_length_s, + stream_chunk_s=stream_chunk_s, + ) + + print("Start speaking...") + for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}): + sys.stdout.write("\033[K") + print(item["text"], end="\r") + if not item["partial"][0]: + break + + return item["text"] +``` + +Let's give this a go and see how we get on! Once the microphone is live, start speaking and watch your transcription +appear in semi real-time: +```python +transcribe() +``` + +```text +Start speaking... + Hey, this is a test with the whisper model. +``` + +Nice! You can adjust the maximum audio length `chunk_length_s` based on how fast or slow you speak (increase it if you +felt like you didn't have enough time to speak, decrease it if you were left waiting at the end), and the +`stream_chunk_s` for the real-time factor. Just pass these as arguments to the `transcribe` function. + +## Language model query + +Now that we have our spoken query transcribed, we want to generate a meaningful response. To do this, we'll use an LLM +hosted on the Cloud. Specifically, we'll pick an LLM on the Hugging Face Hub and use the [Inference API](https://huggingface.co/inference-api) +to easily query the model. + +First, let's head over to the Hugging Face Hub. To find our LLM, we'll use the [πŸ€— Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), +a Space that ranks LLM models by performance over four generation tasks. We'll search by "instruct" to filter out models +that have been instruction fine-tuned, since these should work better for our querying task: + +
*(Screenshot: searching for "instruct" models on the Open LLM Leaderboard)*
We'll use the [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct) checkpoint by [TII](https://www.tii.ae/), a 7B parameter decoder-only LM fine-tuned on a mixture of chat and instruction datasets. You can use any LLM on the Hugging Face Hub that has the "Hosted inference API" enabled; just look out for the widget on the right-hand side of the model card:
*(Screenshot: the Hosted inference API widget on a model card)*
+ +The Inference API allows us to send a HTTP request from our local machine to the LLM hosted on the Hub, and returns the +response as a `json` file. All we need to provide is our Hugging Face Hub token (which we retrieve directly from our Hugging Face +Hub folder) and the model id of the LLM we wish to query: + +```python +from huggingface_hub import HfFolder +import requests + + +def query(text, model_id="tiiuae/falcon-7b-instruct"): + api_url = f"https://api-inference.huggingface.co/models/{model_id}" + headers = {"Authorization": f"Bearer {HfFolder().get_token()}"} + payload = {"inputs": text} + + print(f"Querying...: {text}") + response = requests.post(api_url, headers=headers, json=payload) + return response.json()[0]["generated_text"][len(text) + 1 :] +``` + +Let's give it a try with a test input! +```python +query("What does Hugging Face do?") +``` + +``` +'Hugging Face is a company that provides natural language processing and machine learning tools for developers. They' +``` + +You'll notice just how fast inference is using the Inference API - we only have to send a small number of text tokens +from our local machine to the hosted model, so the communication cost is very low. The LLM is hosted on GPU accelerators, +so inference runs very quickly. Finally, the generated response is transferred back from the model to our local machine, +again with low communication overhead. + +## Synthesise speech + +And now we're ready to get the final spoken output! Once again, we'll use the Microsoft [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) +model for English TTS, but you can use any TTS model of your choice. Let's go ahead and load the processor and model: + +```python +from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan + +processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") + +model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device) +vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device) +``` + +And also the speaker embeddings: +```python +from datasets import load_dataset + +embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") +speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) +``` + +We'll re-use the `synthesise` function that we defined in the previous chapter on [Speech-to-speech translation](speech-to-speech.mdx): + +```python +def synthesise(text): + inputs = processor(text=text, return_tensors="pt") + speech = model.generate_speech( + inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder + ) + return speech.cpu() +``` + +Let's quickly verify this works as expected: + +```python +from IPython.display import Audio + +audio = synthesise( + "Hugging Face is a company that provides natural language processing and machine learning tools for developers." +) + +Audio(audio, rate=16000) +``` + +Nice job πŸ‘ + +## Marvin πŸ€– + +Now that we've defined a function for each of the four stages of the voice assistant pipeline, all that's left to do is +piece them together to get our end-to-end voice assistant. We'll simply concatenate the four stages, starting with +wake word detection (`launch_fn`), speech transcription, querying the LLM, and finally speech synthesis. + +```python +launch_fn() +transcription = transcribe() +response = query(transcription) +audio = synthesise(response) + +Audio(audio, rate=16000, autoplay=True) +``` + +Try it out with a few prompts! 
Here are some examples to get you started: +* *What is the hottest country in the world?* +* *How do Transformer models work?* +* *Do you know Spanish?* + +And with that, we have our end-to-end voice assistant complete, made using the πŸ€— audio tools you've learnt throughout +this course, with a sprinkling of LLM magic at the end. There are several extensions that we could make to improve the +voice assistant. Firstly, the audio classification model classifies 35 different labels. We could use a smaller, more +lightweight binary classification model that only predicts whether the wake word was spoken or not. Secondly, we pre-load +all the models ahead and keep them running on our device. If we wanted to save power, we would only load each model at +the time it was required, and subsequently un-load them afterwards. Thirdly, we're missing a voice activity detection model +in our transcription function, transcribing for a fixed amount of time, which in some cases is too long, and in others too +short. + +## Generalise to anything πŸͺ„ + +So far, we've seen how we can generate speech outputs with our voice assistant Marvin. To finish, we'll demonstrate how +we can generalise these speech outputs to text, audio and image. + +We'll use [Transformers Agents](https://huggingface.co/docs/transformers/transformers_agents) to build our assistant. +Transformers Agents provides a natural language API on top of the πŸ€— Transformers and Diffusers libraries, interpreting +a natural language input using an LLM with carefully crafted prompts, and using a set of curated tools to provide +multimodal outputs. + +Let's go ahead and instantiate an agent. There are [three LLMs available](https://huggingface.co/docs/transformers/transformers_agents#quickstart) +for Transformers Agents, two of which are open-source and free on the Hugging Face Hub. The third is a model from OpenAI +that requires an OpenAI API key. We'll use the free [Bigcode Starcoder](https://huggingface.co/bigcode/starcoder) model +in this example, but you can also try either of the other LLMs available: + +```python +from transformers import HfAgent + +agent = HfAgent( + url_endpoint="https://api-inference.huggingface.co/models/bigcode/starcoder" +) +``` + +To use the agent, we simply have to call `agent.run` with our text prompt. As an example, we'll get it to generate an +image of a cat 🐈 (that hopefully looks a bit better than this emoji): + +```python +agent.run("Generate an image of a cat") +``` + +
*(Image: the generated image of a cat)*
+ + + Note that the first time calling this will trigger the model weights to be downloaded, which might take + some time depending on your Hub download speed. + + +Easy as that! The Agent interpreted our prompt, and used [Stable Diffusion](https://huggingface.co/docs/diffusers/using-diffusers/conditional_image_generation) +under the hood to generate the image, without us having to worry about loading the model, writing the function or executing +the code. + +We can now replace our LLM query function and text synthesis step with our Transformers Agent in our voice assistant, +since the Agent is going to take care of both of these steps for us: + +```python +launch_fn() +transcription = transcribe() +agent.run(transcription) +``` + +Try speaking the same prompt "Generate an image of a cat" and see how the system gets on. If you ask the Agent a simple +question / answer query, the Agent will respond with a text answer. You can encourage it to generate multimodal outputs +by asking it to return an image or speech. For example, you can ask it to: "Generate an image of a cat, caption it, and +speak the caption". + +While the Agent is more flexible than our first iteration Marvin πŸ€– assistant, generalising the voice assistant task in this way +may lead to inferior performance on standard voice assistant queries. To recover performance, you can try using a +more performant LLM checkpoint, such as the one from OpenAI, or define a set of [custom tools](https://huggingface.co/docs/transformers/transformers_agents#custom-tools) +that are specific to the voice assistant task. diff --git a/chapters/unpublished/chapter7/choosing_dataset.mdx b/chapters/unpublished/chapter7/choosing_dataset.mdx deleted file mode 100644 index e2a0fdbb..00000000 --- a/chapters/unpublished/chapter7/choosing_dataset.mdx +++ /dev/null @@ -1 +0,0 @@ -# Choosing a dataset \ No newline at end of file diff --git a/chapters/unpublished/chapter7/evaluation.mdx b/chapters/unpublished/chapter7/evaluation.mdx deleted file mode 100644 index 2058b987..00000000 --- a/chapters/unpublished/chapter7/evaluation.mdx +++ /dev/null @@ -1 +0,0 @@ -# Evaluation metrics for audio-to-audio \ No newline at end of file diff --git a/chapters/unpublished/chapter7/fine-tuning.mdx b/chapters/unpublished/chapter7/fine-tuning.mdx deleted file mode 100644 index 85bc0244..00000000 --- a/chapters/unpublished/chapter7/fine-tuning.mdx +++ /dev/null @@ -1 +0,0 @@ -# Fine-tuning the model \ No newline at end of file diff --git a/chapters/unpublished/chapter7/hands_on.mdx b/chapters/unpublished/chapter7/hands_on.mdx deleted file mode 100644 index 75cb5a68..00000000 --- a/chapters/unpublished/chapter7/hands_on.mdx +++ /dev/null @@ -1 +0,0 @@ -# Hands-on exercise \ No newline at end of file diff --git a/chapters/unpublished/chapter7/introduction.mdx b/chapters/unpublished/chapter7/introduction.mdx deleted file mode 100644 index 538dcf0d..00000000 --- a/chapters/unpublished/chapter7/introduction.mdx +++ /dev/null @@ -1 +0,0 @@ -# What you'll learn and what you'll build \ No newline at end of file diff --git a/chapters/unpublished/chapter7/preprocessing.mdx b/chapters/unpublished/chapter7/preprocessing.mdx deleted file mode 100644 index 0e3982a9..00000000 --- a/chapters/unpublished/chapter7/preprocessing.mdx +++ /dev/null @@ -1 +0,0 @@ -# Preprocessing data \ No newline at end of file diff --git a/chapters/unpublished/chapter7/quiz.mdx b/chapters/unpublished/chapter7/quiz.mdx deleted file mode 100644 index 62962841..00000000 --- 
a/chapters/unpublished/chapter7/quiz.mdx +++ /dev/null @@ -1,3 +0,0 @@ - - -# Check your understanding of the course material \ No newline at end of file diff --git a/chapters/unpublished/chapter7/supplemental_reading.mdx b/chapters/unpublished/chapter7/supplemental_reading.mdx deleted file mode 100644 index 88ff75ab..00000000 --- a/chapters/unpublished/chapter7/supplemental_reading.mdx +++ /dev/null @@ -1 +0,0 @@ -# Supplemental reading and resources \ No newline at end of file diff --git a/chapters/unpublished/chapter7/tasks.mdx b/chapters/unpublished/chapter7/tasks.mdx deleted file mode 100644 index c55d9323..00000000 --- a/chapters/unpublished/chapter7/tasks.mdx +++ /dev/null @@ -1 +0,0 @@ -# Examples of tasks: noise removal, source separation \ No newline at end of file