From 9a68af922c6ad0b18908935c61b5924fb29f9a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 14 Jan 2025 22:32:24 +0100 Subject: [PATCH] chore: remove librosa dependency in favor of torchaudio --- .github/workflows/test_requirements.txt | 1 - README.md | 4 ++++ environment.yml | 1 - scripts/demo.py | 18 ++++++++-------- scripts/measure_convolve_execution_time.py | 9 ++++---- setup.py | 1 - tests/test_background_noise.py | 15 +++----------- tests/test_convolution.py | 11 +++++----- tests/test_impulse_response.py | 14 +++---------- tests/test_mix.py | 24 ++++------------------ torch_audiomentations/utils/dsp.py | 8 -------- torch_audiomentations/utils/file.py | 16 --------------- torch_audiomentations/utils/io.py | 15 +------------- 13 files changed, 33 insertions(+), 104 deletions(-) diff --git a/.github/workflows/test_requirements.txt b/.github/workflows/test_requirements.txt index 74845fe6..1f407f34 100644 --- a/.github/workflows/test_requirements.txt +++ b/.github/workflows/test_requirements.txt @@ -4,7 +4,6 @@ torch==1.11.0 torchaudio==0.11.0 audioread>=2.1.8 julius>=0.2.3,<0.3 -librosa==0.9.1 py-cpuinfo>=7.0.0 pytest==5.3.4 pytest-cov==2.8.1 diff --git a/README.md b/README.md index efde6126..721b1f82 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,10 @@ classification. It was successfully applied in the paper * Add new transforms: `Mix`, `Padding`, `RandomCrop` and `SpliceOut` +### Changes + +* Remove `librosa` dependency in favor of `torchaudio` + ## [v0.11.2] - 2025-01-09 ### Fixed diff --git a/environment.yml b/environment.yml index 3247566f..253dc42e 100644 --- a/environment.yml +++ b/environment.yml @@ -19,7 +19,6 @@ dependencies: - black==23.12.1 - coverage==5.3 - julius>=0.2.3,<0.3 - - librosa==0.9.1 - pandas==1.1.4 - py-cpuinfo==7.0.0 - pytest==7.4.4 diff --git a/scripts/demo.py b/scripts/demo.py index 4c489925..d2671781 100644 --- a/scripts/demo.py +++ b/scripts/demo.py @@ -3,7 +3,6 @@ import time from pathlib import Path -import librosa import numpy as np import torch from scipy.io import wavfile @@ -30,6 +29,7 @@ from torch_audiomentations.augmentations.splice_out import SpliceOut from torch_audiomentations.core.transforms_interface import ModeNotSupportedException from torch_audiomentations.utils.object_dict import ObjectDict +from torch_audiomentations.utils.io import Audio SAMPLE_RATE = 44100 @@ -88,14 +88,14 @@ def __exit__(self, type, value, traceback): random.seed(43) filenames = ["perfect-alley1.ogg", "perfect-alley2.ogg"] - samples1, _ = librosa.load( - os.path.join(TEST_FIXTURES_DIR, filenames[0]), sr=SAMPLE_RATE, mono=False - ) - samples2, _ = librosa.load( - os.path.join(TEST_FIXTURES_DIR, filenames[1]), sr=SAMPLE_RATE, mono=False - ) - samples = np.stack((samples1, samples2), axis=0) - samples = torch.from_numpy(samples) + audio = Audio(SAMPLE_RATE, mono=True) + samples1 = audio(os.path.join(TEST_FIXTURES_DIR, filenames[0])) + _, num_samples1 = samples1.shape + samples2 = audio(os.path.join(TEST_FIXTURES_DIR, filenames[1])) + _, num_samples2 = samples2.shape + num_samples = min(num_samples1, num_samples2) + samples = torch.stack([samples1[:, :num_samples], samples2[:, :num_samples]], dim=0) + modes = ["per_batch", "per_example", "per_channel"] for mode in modes: diff --git a/scripts/measure_convolve_execution_time.py b/scripts/measure_convolve_execution_time.py index 34d31b49..29922be9 100644 --- a/scripts/measure_convolve_execution_time.py +++ b/scripts/measure_convolve_execution_time.py @@ -1,4 +1,3 @@ -import librosa import numpy as np import torch from cpuinfo import get_cpu_info @@ -8,14 +7,14 @@ from scripts.demo import TEST_FIXTURES_DIR, timer from scripts.plot import show_horizontal_bar_chart from torch_audiomentations.utils.convolution import convolve as torch_convolve +from torch_audiomentations.utils.io import Audio if __name__ == "__main__": file_path = TEST_FIXTURES_DIR / "acoustic_guitar_0.wav" sample_rate = 48000 - samples, _ = librosa.load(file_path, sr=sample_rate) - ir_samples, _ = librosa.load( - TEST_FIXTURES_DIR / "ir" / "impulse_response_0.wav", sr=sample_rate - ) + audio = Audio(sample_rate, mono=True) + samples = audio(file_path).numpy() + ir_samples = audio(TEST_FIXTURES_DIR / "ir" / "impulse_response_0.wav").numpy() is_cuda_available = torch.cuda.is_available() print("Is torch CUDA available:", is_cuda_available) diff --git a/setup.py b/setup.py index 0d1a85e2..a12c23ff 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,6 @@ def find_version(*file_paths): ), install_requires=[ "julius>=0.2.3,<0.3", - "librosa>=0.6.0", "torch>=1.7.0", "torchaudio>=0.9.0", "torch-pitch-shift>=1.2.2", diff --git a/tests/test_background_noise.py b/tests/test_background_noise.py index ca74c819..86ff12a1 100644 --- a/tests/test_background_noise.py +++ b/tests/test_background_noise.py @@ -13,9 +13,8 @@ from torch_audiomentations import AddBackgroundNoise from torch_audiomentations.utils.dsp import calculate_rms -from torch_audiomentations.utils.file import load_audio from .utils import TEST_FIXTURES_DIR - +from torch_audiomentations.utils.io import Audio class TestAddBackgroundNoise(unittest.TestCase): def setUp(self): @@ -23,17 +22,9 @@ def setUp(self): self.batch_size = 16 self.empty_input_audio = torch.empty(0, 1, 16000) # TODO: use utils.io.Audio - self.input_audio = ( - torch.from_numpy( - load_audio( - TEST_FIXTURES_DIR / "acoustic_guitar_0.wav", - sample_rate=self.sample_rate, - ) - ) - .unsqueeze(0) - .unsqueeze(0) - ) + audio = Audio(self.sample_rate, mono=True) + self.input_audio = audio(TEST_FIXTURES_DIR / "acoustic_guitar_0.wav")[None] self.input_audios = torch.cat([self.input_audio] * self.batch_size, dim=0) self.bg_path = TEST_FIXTURES_DIR / "bg" diff --git a/tests/test_convolution.py b/tests/test_convolution.py index b83ad393..7de2798b 100644 --- a/tests/test_convolution.py +++ b/tests/test_convolution.py @@ -1,21 +1,20 @@ -import librosa import torch from numpy.testing import assert_almost_equal from scipy.signal import convolve as scipy_convolve from tests.utils import TEST_FIXTURES_DIR from torch_audiomentations.utils.convolution import convolve as torch_convolve - +from torch_audiomentations.utils.io import Audio class TestConvolution: def test_convolve(self): sample_rate = 16000 file_path = TEST_FIXTURES_DIR / "acoustic_guitar_0.wav" - samples, _ = librosa.load(file_path, sr=sample_rate) - ir_samples, _ = librosa.load( - TEST_FIXTURES_DIR / "ir" / "impulse_response_0.wav", sr=sample_rate - ) + audio = Audio(sample_rate, mono=True) + samples = audio(file_path).numpy() + ir_samples = audio(TEST_FIXTURES_DIR / "ir" / "impulse_response_0.wav").numpy() + expected_output = scipy_convolve(samples, ir_samples) actual_output = torch_convolve( diff --git a/tests/test_impulse_response.py b/tests/test_impulse_response.py index c84dc050..77d3090f 100644 --- a/tests/test_impulse_response.py +++ b/tests/test_impulse_response.py @@ -4,7 +4,7 @@ import torch from torch_audiomentations import ApplyImpulseResponse -from torch_audiomentations.utils.file import load_audio +from torch_audiomentations.utils.io import Audio from .utils import TEST_FIXTURES_DIR @@ -15,16 +15,8 @@ def sample_rate(): @pytest.fixture def input_audio(sample_rate): - return ( - torch.from_numpy( - load_audio( - os.path.join(TEST_FIXTURES_DIR, "acoustic_guitar_0.wav"), - sample_rate=sample_rate, - ) - ) - .unsqueeze(0) - .unsqueeze(0) - ) + audio = Audio(sample_rate, mono=True) + return audio(os.path.join(TEST_FIXTURES_DIR, "acoustic_guitar_0.wav"))[None] @pytest.fixture diff --git a/tests/test_mix.py b/tests/test_mix.py index b43ad41b..f5161eda 100644 --- a/tests/test_mix.py +++ b/tests/test_mix.py @@ -4,32 +4,16 @@ from torch_audiomentations.augmentations.mix import Mix from torch_audiomentations.utils.dsp import calculate_rms -from torch_audiomentations.utils.file import load_audio +from torch_audiomentations.utils.io import Audio from .utils import TEST_FIXTURES_DIR class TestMix(unittest.TestCase): def setUp(self): self.sample_rate = 16000 - self.guitar = ( - torch.from_numpy( - load_audio( - TEST_FIXTURES_DIR / "acoustic_guitar_0.wav", - sample_rate=self.sample_rate, - ) - ) - .unsqueeze(0) - .unsqueeze(0) - ) - self.noise = ( - torch.from_numpy( - load_audio( - TEST_FIXTURES_DIR / "bg" / "bg.wav", sample_rate=self.sample_rate - ) - ) - .unsqueeze(0) - .unsqueeze(0) - ) + audio = Audio(self.sample_rate, mono=True) + self.guitar = audio(TEST_FIXTURES_DIR / "acoustic_guitar_0.wav")[None] + self.noise = audio(TEST_FIXTURES_DIR / "bg" / "bg.wav")[None] common_num_samples = min(self.guitar.shape[-1], self.noise.shape[-1]) self.guitar = self.guitar[:, :, :common_num_samples] diff --git a/torch_audiomentations/utils/dsp.py b/torch_audiomentations/utils/dsp.py index ffc98982..297078ef 100644 --- a/torch_audiomentations/utils/dsp.py +++ b/torch_audiomentations/utils/dsp.py @@ -24,14 +24,6 @@ def calculate_desired_noise_rms(clean_rms, snr): return noise_rms -def resample_audio(audio, orig_sr, target_sr): - # TODO: We can probably remove this function and call resample directly where needed - """Resamples the audio to a new sampling rate.""" - import librosa - - return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) - - def convert_decibels_to_amplitude_ratio(decibels): return 10 ** (decibels / 20) diff --git a/torch_audiomentations/utils/file.py b/torch_audiomentations/utils/file.py index f271cc56..88cdfbd1 100644 --- a/torch_audiomentations/utils/file.py +++ b/torch_audiomentations/utils/file.py @@ -2,9 +2,6 @@ from pathlib import Path from typing import List, Union -import soundfile - -from .dsp import resample_audio SUPPORTED_EXTENSIONS = (".wav",) @@ -64,16 +61,3 @@ def find_audio_files( break return file_paths - - -def load_audio(audio_file_path, sample_rate=None, start=0, stop=None): - # TODO: Clarify whether start/stop is in samples or in seconds, and whether or not it - # relates to the original or the resampled audio. - """Loads the audio given the path of an audio file.""" - audio, source_sample_rate = soundfile.read(audio_file_path, start=start, stop=stop) - - if sample_rate: - audio = resample_audio(audio, source_sample_rate, sample_rate) - - # TODO: return sample rate as well - return audio diff --git a/torch_audiomentations/utils/io.py b/torch_audiomentations/utils/io.py index 80b9cdb0..3ce075fb 100644 --- a/torch_audiomentations/utils/io.py +++ b/torch_audiomentations/utils/io.py @@ -1,8 +1,6 @@ -import warnings from pathlib import Path from typing import Text, Union -import librosa import torch import torchaudio from torch import Tensor @@ -155,18 +153,7 @@ def downmix_and_resample(self, samples: Tensor, sample_rate: int) -> Tensor: # resample if self.sample_rate != sample_rate: - samples = samples.numpy() - if self.mono: - # librosa expects mono audio to be of shape (n,), but we have (1, n). - samples = librosa.core.resample( - samples[0], orig_sr=sample_rate, target_sr=self.sample_rate - )[None] - else: - samples = librosa.core.resample( - samples.T, orig_sr=sample_rate, target_sr=self.sample_rate - ).T - - samples = torch.tensor(samples) + samples = torchaudio.functional.resample(samples, sample_rate, self.sample_rate) return samples