import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import IPython

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from tqdm import tqdm

!wget https://github.com/karolpiczak/ESC-50/archive/master.zip
!unzip master.zip

# Path to the ESC-50 metadata CSV; later cells read its 'filename' and 'category' columns.
fn_csv = 'ESC-50-master/meta/esc50.csv'


### START CODING
df = pd. ...  # `pd` is the pandas module alias (not a DataFrame); load the CSV at fn_csv into the DataFrame `df` here
...

unique_classes = df...  # collect the distinct class names; you may use unique() on the class column
print(f'...')
### END CODING

# Setup some filepaths
path = 'ESC-50-master/audio/'
example_rows = [2, 3, 4]  # Row labels of the clips to inspect; feel free to choose other files
files = [path + df['filename'][row] for row in example_rows]
categories = [df['category'][row] for row in example_rows]
file0, file1, file2 = files  # Individual names kept for convenience (requires exactly three example rows)

# Show the category name and an audio player for each file
for category, fn in zip(categories, files):
  print(category)
  IPython.display.display(IPython.display.Audio(data=fn))

# Plot dB-scaled STFT spectrograms on a linear frequency axis.
# Note: these are NOT mel spectrograms; no mel filterbank is applied here.
fig, ax = plt.subplots(nrows=1, ncols=len(files), figsize=(18, 6))
for axis, fn, category in zip(np.atleast_1d(ax), files, categories):
  y, sr = librosa.load(fn)  # librosa's default load resamples to 22050 Hz mono
  D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)  # dB relative to the clip's peak magnitude
  img = librosa.display.specshow(D, y_axis='linear', x_axis='time', sr=sr, ax=axis)
  axis.set(title=f'Power spec: {category}')  # Include the class so the subplots can be told apart
  axis.label_outer()  # Only keep axis labels on the outer edge of the subplot grid
fig.colorbar(img, ax=ax, format="%+2.f dB")  # One shared colorbar for all subplots

# The five ESC-50 classes kept for the reduced "ESC-5" task.
our_classes = ['crying_baby', 'dog', 'rain', 'rooster', 'sneezing']  # Note: This list also serves as the class map later (class name -> class index).
esc5_X = []  # File list
esc5_y = []  # Class list
fn_csv = 'ESC-50-master/meta/esc50.csv'  # Same metadata path as before (re-declared here, presumably so this cell can run on its own)


### START CODING ###
df = ...

for ... in df.values:  # One simple (not the most efficient) way: walk every metadata row and keep the rows whose class is in our_classes.
  if ... :
    esc5_X.append( ... )  # filename column of df
    ...                   # class column of df (append it to esc5_y)

# Sanity checks: show the first five file/class entries, then the lengths of both lists.
print( ...(esc5_X[:5], esc5_y[:5])... )
print(f'Lengths: ...')
### END CODING ###

### START CODING HERE ###
# Split the curated file list and class list into train and test subsets.
# Hint: the splitting helper is among the sklearn.model_selection imports at the top.
# The fixed random_state makes the split reproducible between runs.
X_train, X_test, ... = ...(..., random_state=1337)

# Sanity checks: inspect the split and report the sizes of the X and y parts.
print(...)
print(f'X: ...; y: ...')
### END CODING ###

### START CODING HERE ###
# Build frame-level features and targets for a list of audio files:
# every file contributes one mel-spectrogram frame per row of X and one class index per entry of y.
# (Parameters are left for the student to define; the calls below pass the file and class lists.)
def extract_mel_spec(...):
  X = []  # per-file feature matrices (stacked into one feature tensor at the end)
  y = []  # per-file target vectors (stacked into one target tensor at the end)

  mel_bands = 128  # number of mel bands; the first dimension of the spectrogram shape noted below
  # NOTE(review): tqdm cannot read a length from enumerate(), so the bar shows no total;
  # wrapping the other way round (enumerate(tqdm(...))) would show one.
  for ..., ... in tqdm(enumerate(...)):  # tqdm simply displays a progress bar.
    wav_data, sr = librosa...  # Use the wav file's sample rate.

    # Features (2D)
    # NOTE(review): 216 frames corresponds to a 5 s clip at 22050 Hz with librosa's default hop of 512.
    # Loading at the file's native rate (ESC-50 is presumably 44.1 kHz) would give a different frame
    # count — confirm which is intended. The stacking below works with either.
    mel_spec = librosa...  # Create mel spectrogram. Output shape: (128, 216) (n_mels, frames)
    mel_spec = mel_spec/... # Normalization
    mel_spec = mel_spec...  # Transposition. Output shape: (216, 128), i.e. one row per frame
    X.append( ... )  # Append to feature tensor

    # Targets == class_name
    targets = np.ones( mel_spec... )  # Create a PLACEHOLDER target vector with one entry per frame. Output shape: (216) (Note: silent frames are not going to be labeled as "silent")
    targets = targets * our_classes...  # Convert values to actual class-index from 'our_classes'
    ...  # Append to target tensor

  # Stack tensors
  X = np.vstack(X)  # concatenate the per-file matrices row-wise: (total_frames, n_mels) given the shapes above
  y = np.hstack(y)  # concatenate the per-file target vectors: (total_frames,) given the shapes above

  # Return the tensors
  return ...


# Call the function on our previously generated lists (once for the train split, once for the test split)
X_train_ready, y_train_ready = extract_mel_spec(...)
X_test_ready, y_test_ready = ...
### END CODING HERE ###


# Report the resulting array shapes; X and y of the same split should agree in their first dimension.
print(f'\nShapes: X_train_ready: {X_train_ready.shape}, y_train_ready: {y_train_ready.shape}')
print(f'Shapes: X_test_ready: {X_test_ready.shape}, y_test_ready: {y_test_ready.shape}')

### START CODING HERE ###
# Feature scaling / Data standardization / Normalization
print('Scaling...')
scaler = StandardScaler()
scaler.fit(X_train_ready)  # Fit on the training features only, so the test set does not influence the scaling statistics
X_train_ready = scaler.transform(...)  # Normalize the features here for both train and test
X_test_ready = ...  # Reuse the already-fitted scaler; do not fit again on the test data

print('Fitting...')
model = ...(n_neighbors=..., weights=...)  # Call the kNN-classifier. Look at your imports again for a hint.
model...  # Fit/Train the classifier using our generated tensors.

# Accuracy on the data the model was trained on versus held-out data.
print('Evaluating...')
print(f'Train score: {np.round(model.score(...), decimals=...)}')
print(f'Test score: ...')
### END CODING HERE ###

### START CODING HERE ###
# NOTE(review): recent scikit-learn versions no longer provide plot_confusion_matrix; the imported
# ConfusionMatrixDisplay offers a from_estimator(...) constructor taking the same kind of arguments — confirm against the installed version.
...(model, ..., ..., normalize=...)  # Call the confusion matrix plot function. Look at your imports again for a hint.
plt.xticks(ticks=np.arange(...), labels=...)  # hint: np.arange(5) = (0, 1, 2, 3, 4); use the class names as tick labels
plt.yticks(...)  # same ticks and labels as for the x axis
plt...  # finally render the figure
### END CODING HERE ###

Before you start

Machine Listening Seminar 3: Sound event classification

1. Import libraries

2. Fetch the Dataset

3. Metadata and analysis I

4. Metadata and analysis II

5. ESC-5: Curation

6. ESC-5: Dataset splitting

7. ESC-5: Create mel spectrograms

8. ESC-5: Train a nearest neighbor classifier

9. ESC-5: Plot the confusion matrix