Audio Files with Librosa
Loading audio files
Librosa is a Python libray created for working with audio data. It's both easy to understand and has an extensive feuture list.
By default Librosa support all the popular audio file extensions, like WAV, OGG, MP3, and FLAC.
import librosa
import librosa.display
filename = librosa.ex('trumpet')
y, sr = librosa.load(filename)
y is a NumPy matrix that contains audio time series. If audio file is mono it will be one-dimensional vector, if audio file is stereo it will be two-dimensional, and so on. sr is a sampling rate of an audio file.
Playing audio in Jupyter notebook
Using the code bellow you will be able to play the audio inside of your Jupyter notebook. The player is very basic, but will be enough for simpler projects.
from IPython.display import Audio
Audio(data=y, rate=sr)
Vizualising audio files
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 4))
librosa.display.waveshow(y, sr=sr)
plt.show()
fig, ax = plt.subplots()
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,
fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)
img = librosa.display.specshow(S_dB, x_axis='time',
y_axis='mel', sr=sr,
fmax=8000, ax=ax)
fig.colorbar(img, ax=ax, format='%+2.0f dB')
ax.set(title='Mel-frequency spectrogram')
Working with multiple audio files
from glob import glob
audio_filepaths = []
for filename in glob('./audio/*.wav'):
audio_filepaths += [filename]
audio_filepaths
padded_audio_files = []
max_allowed_length = 32000
for audio_filepath in audio_filepaths:
y_voice, sr_voice = librosa.load(audio_filepath)
if len(y_voice) > max_allowed_length:
raise ValueError("data length cannot exceed padding length.")
elif len(y_voice) < max_allowed_length:
embedded_data = np.zeros(max_allowed_length)
offset = np.random.randint(low = 0, high = max_allowed_length - len(y_voice))
embedded_data[offset:offset+len(y_voice)] = y_voice
elif len(y_voice) == max_allowed_length:
embedded_data = y_voice
pass
padded_audio_files += [embedded_data]
padded_audio_files = np.array(padded_audio_files)
padded_audio_files.shape
Advanced Audio Analysis
S = np.abs(librosa.stft(y))
S.shape
D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
D.shape
S = np.abs(librosa.stft(y))
times = librosa.times_like(S)
fig, ax = plt.subplots(nrows=2, sharex=True)
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
y_axis='log', x_axis='time', ax=ax[0])
ax[0].set(title='Power spectrogram')
ax[0].label_outer()
onset_env = librosa.onset.onset_strength(y=y, sr=sr)
ax[1].plot(times, 2 + onset_env / onset_env.max(), alpha=0.8,
label='Mean (mel)')
onset_env = librosa.onset.onset_strength(y=y, sr=sr,
aggregate=np.median,
fmax=8000, n_mels=256)
ax[1].plot(times, 1 + onset_env / onset_env.max(), alpha=0.8,
label='Median (custom mel)')
C = np.abs(librosa.cqt(y=y, sr=sr))
onset_env = librosa.onset.onset_strength(sr=sr, S=librosa.amplitude_to_db(C, ref=np.max))
ax[1].plot(times, onset_env / onset_env.max(), alpha=0.8,
label='Mean (CQT)')
ax[1].legend()
ax[1].set(ylabel='Normalized strength', yticks=[])
Filters
fig, ax = plt.subplots(nrows=2, figsize=(12, 4), constrained_layout=True)
ax[0].set(title='Normal waveform')
librosa.display.waveshow(y, sr=sr, ax=ax[0])
sos = signal.butter(17, 150, 'hp', fs=1000, output='sos')
filtered = signal.sosfilt(sos, y)
ax[1].set(title='Filtered waveform')
librosa.display.waveshow(filtered, sr=sr, ax=ax[1])