import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm


# TensorFlow: list every physical device (CPU, GPU, ...) visible to this runtime
all_devices = tf.config.list_physical_devices()
print(f'Found {len(all_devices)} devices: {all_devices}')

# Download the ESC-50 repository archive from GitHub and unpack it next to this notebook.
# The leading "!" hands each line to the system shell (IPython/Jupyter syntax), so these
# two lines only work inside a notebook, not in a plain Python interpreter.
!wget https://github.com/karolpiczak/ESC-50/archive/master.zip
!unzip master.zip

# Path of the ESC-50 metadata table inside the unpacked archive
fn_csv = 'ESC-50-master/meta/esc50.csv'

# Parse the CSV into a dataframe and show its first five rows
df = pd.read_csv(filepath_or_buffer=fn_csv)
print(df.head(n=5))

# Distinct values of the 'category' column, plus how many there are
unique_classes = df['category'].unique()
print(f'\nUnique classes: {unique_classes}\nCount: {len(unique_classes)}')

# The five ESC-50 categories that form the "ESC-5" subset used in this seminar.
# A class's position in this list doubles as its numeric label later on
# (extract_mel_spec looks labels up via our_classes.index).
our_classes = ["crying_baby", "dog", "rain", "rooster", "sneezing"]  # Note: This is also our class map for later.
esc5_X = []  # File list
esc5_y = []  # Class list
fn_csv = "ESC-50-master/meta/esc50.csv"  # Same metadata path as in the metadata-analysis step


### START CODING HERE ###
# Exercise: load the metadata table from fn_csv into a dataframe.
df = pd...

# Exercise: walk over the metadata rows and keep only those whose category is one of
# our_classes, collecting the file name in esc5_X and the class name in esc5_y.
for row in df.values:  # (This particular algorithm here works, but is not ideal)
  if any(... == ... for ... in our_classes):
    esc5_X.append( ... )  # Filename column
    ....append( ... )  # Class column

# Sanity check: preview the first five (file, class) pairs and report both list lengths.
# Both lists are appended to in the same branch, so the lengths should match.
print( list(zip(esc5_X[:5], esc5_y[:5])) )
print(f"Lengths: esc5_X: {...}, esc5_y: {...}")
### END CODING HERE ###

### START CODING HERE ###
# Exercise: split the curated file list and its labels into a training and a test subset.
# The split is done on whole files (before feature extraction), so all frames of one
# recording end up in the same subset.
# NOTE(review): the shapes quoted later in this file (34560 vs. 8640 rows) suggest an
# 80/20 split - confirm when choosing test_size.
X_train, X_test, y_train, ... = ...(esc5_X, ..., test_size=..., random_state=...)  # Look at your imports for a hint which function to use.

# Sanity check: inspect the training file list and the sizes of all four subsets.
print(X_train...)
print(f"X: {...}, {...}; y: {...}, {...}")
### END CODING HERE ###

### START CODING HERE ###
def extract_mel_spec(...):
  """Compute frame-wise mel-spectrogram features and per-frame class labels.

  Exercise template: every `...` is a blank to be filled in. The body iterates
  over `data_X` (audio file names) and indexes `data_y` (class names) by
  position, so the signature is expected to supply those two sequences.

  Returns:
    X: 2-D array with one row per spectrogram frame and `mel_bands` (128)
      columns, built by stacking the per-file (frames, 128) matrices.
    y: 1-D array holding, for every row of X, the index of that file's class
      in `our_classes`.
  """
  X = []  # feature tensor
  y = []  # target tensor

  mel_bands = 128
  for i, filename in tqdm(...(data_X)):  # tqdm displays a progress bar.
    wav_data, sr = ...(f"ESC-50-master/audio/{filename}")  # Use the wav file's sample rate.

    # Features
    # NOTE(review): the (128, 216) shape quoted below depends on clip length, sample rate
    # and hop size; loading at the file's native sample rate may give a different frame
    # count than loading with a resampled rate - confirm against the printed shapes.
    mel_spec = librosa.feature.melspectrogram(y=wav_data, sr=sr, n_mels=mel_bands)  # Create mel spectrogram. Output shape: (128, 216) (n_mels, frames)
    mel_spec = mel_spec...  # Normalization
    mel_spec = mel_spec...  # Transposition. Output shape: (216, 128)
    X ... ( mel_spec )  # Append to feature tensor

    # Targets == class_name
    # Every frame of a file receives the same label: a vector of ones, one entry per
    # frame, is scaled by the file's class index.
    targets = np.ones( mel_spec.shape[0] )  # Create a PLACEHOLDER target vector. Output shape: (216) (Note: silent frames are NOT going to be labeled as "silent")
    targets = targets * our_classes.index( data_y[i] )  # Convert values to actual class-index from 'our_classes'
    ...  # Append to target tensor

  # Stack tensors
  # vstack joins the per-file matrices along the frame axis; hstack joins the per-file
  # label vectors into one long vector, so X and y stay aligned row by row.
  X = np.vstack(X)
  y = np.hstack(y)

  # Return the tensors
  return X, y


# Call the function on our previously generated lists
# Exercise: run the feature extraction once for the training files and once for the
# test files; each call returns a (features, labels) pair.
X_train_ready, y_train_ready = ...(X_train, y_train)
X_test_ready, y_test_ready = extract_mel_spec(...)
### END CODING HERE ###


# Sanity check: features are 2-D (frames, mel bands), labels are 1-D (frames).
print(f"\nShapes: X_train_ready: {X_train_ready.shape}, y_train_ready: {y_train_ready.shape}")
print(f"Shapes: X_test_ready: {X_test_ready.shape}, y_test_ready: {y_test_ready.shape}")

### START CODING HERE ###
# Feature scaling/standardization: removes mean & scales to unit variance
# The scaler's statistics are estimated on the training features only and then applied
# unchanged to both subsets, so no information from the test set enters the fit.
print("Scaling...")
scaler = StandardScaler()
scaler.fit(X_train_ready)
X_train_ready = scaler.transform(...)  # Normalize the features here for both train and test
X_test_ready = scaler.transform(...)

# k-nearest-neighbour baseline operating on single frames (one feature vector per sample).
# Note: the name `model` is rebound to a Keras network further down in this file.
print("Fitting...")
model = KNeighborsClassifier(n_neighbors=..., weights=...)  # Call the kNN-classifier. Look at your imports again for a hint.
model.fit(..., ...)  # Fit/Train the classifier using our generated tensors.

# Exercise: report the classifier's score on the training and on the test subset,
# rounded to four decimals (the outer blank is a rounding call taking `decimals`).
print("Evaluating...")
print(f"Train score: {...(model.score(...), decimals=4)}")
print(f"Test score: {...(model.score(...), decimals=4)}")
### END CODING HERE ###

### START CODING HERE ###
# Exercise: predicted class indices for the test frames (presumably from the kNN model
# fitted in the previous step).
y_test_pred = ...
# normalize='true' scales each row by its number of true samples, so every row sums to 1.
cm = confusion_matrix(..., normalize='true')  # Call the confusion matrix function. Look at your imports again for a hint.
# Exercise: wrap the matrix in a display object (see the sklearn.metrics import) and draw it.
disp = ...
disp.plot()
# Exercise: replace the numeric tick positions on both axes with the class names.
plt.xticks(ticks=..., labels=...)  # hint: np.arange(5) = (0, 1, 2, 3, 4)
plt.yticks(...)
...show()
### END CODING HERE ###

from keras.models import Sequential, Model
from keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from keras.optimizers import SGD
from keras.losses import CategoricalCrossentropy
from keras.utils import to_categorical

print(f"y_train_ready.shape: {y_train_ready.shape}, y_test_ready.shape: {y_test_ready.shape}")  # These are our shapes so far.

### START CODING HERE ###
y_train_ready_OHE = to_categorical(y=..., num_classes=...)
y_test_ready_OHE = ....

print(f"y_train_ready_OHE.shape: {...}, y_test_ready_OHE.shape: {...}")  # These are our new shapes.
### END CODING HERE ###

# Sequential API: the network is declared as one ordered stack of layers
model_seq = Sequential([
    Dense(128, input_dim=128),       # input layer, 128 input features due to our feature extraction process (Dense == Fully Connected)
    Dense(256, activation="relu"),   # hidden layer 1
    Dense(64, activation="relu"),    # hidden layer 2
    Dense(5, activation="softmax"),  # output layer. Often this is sigmoid (each index can be between 0 and 1), or softmax (all indices sum up to 1)
])

# Functional API (node-like): every layer is called on the tensor produced by the previous one
model_func_in = Input(shape=(128,))  # input layer

# Hidden layers 1-3 are identical (10 units, ReLU), so they are chained in a loop
model_func_x = model_func_in
for hidden_layer in [Dense(10, activation="relu") for _ in range(3)]:
    model_func_x = hidden_layer(model_func_x)

model_func_out = Dense(5, activation="softmax")(model_func_x)  # output layer

# Model instance, specified by its input and output tensors
model_func = Model(inputs=model_func_in, outputs=model_func_out)

# To simplify the following cells, select a desired model here
# (swap the comment markers to train the Functional-API network instead)
model = model_seq
#model = model_func

# Compiles the model: plain SGD, categorical cross-entropy on one-hot targets, accuracy as metric.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in TF2-era Keras
# and rejected by Keras 3.
model.compile(optimizer=SGD(learning_rate=0.001), loss=CategoricalCrossentropy(), metrics=["accuracy"])
# -> Given this loss function, "accuracy" means "Categorical Accuracy". Notice how "Categorical" relates to OHE

model.summary()  # Prints the architecture

### START CODING HERE ###
# Exercise: train the compiled DNN for 10 epochs. Because the model was compiled with
# CategoricalCrossentropy, y has to be the one-hot encoded targets.
model...(x=..., y=..., epochs=10)
### END CODING HERE ###

### START CODING HERE ###
# Exercise: compute loss and metrics of the trained model (presumably on the test subset).
# return_dict=True requests the results as a dict instead of a plain list.
score = model...(x=..., y=..., return_dict=True, verbose=1)
print(f"Scores: ...")
### END CODING HERE ###

### START CODING HERE ###
# Predict the test set
# The output layer has 5 softmax units, so each prediction is a row of 5 class scores.
predictions = model...(...)
print(f"Shape of predictions: {...}")

# Plot confusion matrix
# Exercise: confusion_matrix compares two label vectors, so rows of class scores
# (and one-hot targets, if used) have to be reduced to class indices first.
confmat = confusion_matrix(y_true=..., y_pred=..., normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=..., display_labels=...)
disp.plot(xticks_rotation=45)  # rotate labels on x-axis for readability
...show()
### END CODING HERE ###

# Input layer
# Each example is one spectrogram frame laid out as (height=1, width=128 mel bands, channels=1).
model_cnn_in = Input(shape=(1, 128, 1))  # Why 3 dims here...? "num_examples" can be left "unknown". Note "(None)" in the summary. (See next cell for more info.)

# Hidden layers
# With Keras' default "valid" padding every (1, 3) convolution shortens the width by 2 and
# every (1, 2) pooling halves it (rounding down): 128 -> 126 -> 63 -> 61 -> 30 -> 28.
model_cnn_conv1 = Conv2D(filters=32, kernel_size=(1, 3), activation="relu")(model_cnn_in)  # Input is a vector (1, 128), so (1, 3) spans the kernel only over the feature dimension
model_cnn_pool1 = MaxPooling2D(pool_size=(1, 2))(model_cnn_conv1)  # Same as above: We pool only in frequency dimension
model_cnn_conv2 = Conv2D(filters=64, kernel_size=(1, 3), activation="relu")(model_cnn_pool1)
model_cnn_pool2 = MaxPooling2D(pool_size=(1, 2))(model_cnn_conv2)
# The third convolution gets its own name instead of rebinding model_cnn_conv2,
# so every intermediate tensor stays addressable.
model_cnn_conv3 = Conv2D(filters=128, kernel_size=(1, 3), activation="relu")(model_cnn_pool2)

# Towards backend
model_cnn_flat = Flatten()(model_cnn_conv3)  # (1, 28, 128) feature maps -> one flat vector per example
model_cnn_drop1 = Dropout(0.2)(model_cnn_flat)  # randomly zeroes 20% of the inputs during training
model_cnn_dense1 = Dense(64, activation="relu")(model_cnn_drop1)

# Output layer
model_cnn_out = Dense(5, activation="softmax")(model_cnn_dense1)  # one probability per class

# Actual model creation
model_cnn = Model(inputs=model_cnn_in, outputs=model_cnn_out)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in TF2-era Keras
# and rejected by Keras 3.
model_cnn.compile(optimizer=SGD(learning_rate=0.1), loss=CategoricalCrossentropy(), metrics=["accuracy"])
model_cnn.summary()

# Convert tensors to correct input shape (for both training and test sets)
# The CNN input layer is declared as Input(shape=(1, 128, 1)), so every 128-dim frame
# needs an extra height axis and an extra channel axis.
print(f"Shapes: X_train_ready {X_train_ready.shape}, y_train_ready_OHE {y_train_ready_OHE.shape}")
# Shape X_train_ready: (34560, 128)    <-- needs format adjustment
# Shape y_train_ready_OHE: (34560, 5)  <-- already in correct format

# potentially: feed more "height" (time) in by switching dimensions to (1, 34560, 128, 1), then slicing the tensor into 160 tensors (with 216 frames each, essentially per-file training then)

### START CODING HERE ###
# Add channel dimension
# axis=-1 appends the new axis at the end of the shape.
X_train_cnn = np.expand_dims(X_train_ready, axis=-1)  # (34560, 128, 1)
X_test_cnn = ...                                      # (8640, 128, 1)

# Add height dimension
# Exercise: insert a length-1 axis right after the example axis.
X_train_cnn = ...(X_train_cnn, axis=...)  # (34560, 1, 128, 1)
X_test_cnn = ...                          # (8640, 1, 128, 1)

# New shapes
print(f"Final shapes: X_train_cnn {...}, X_test_cnn {...}")
### END CODING HERE ###

### START CODING HERE ###
# Exercise: train the CNN for 10 epochs in mini-batches of 16 examples. x must be the
# reshaped 4-D features; y the one-hot targets (the loss is CategoricalCrossentropy).
model_cnn...(x=..., y=..., batch_size=16, epochs=10)
### END CODING HERE ###

### START CODING HERE ###
# Exercise: compute loss and metrics of the trained CNN (presumably on the test subset).
# return_dict=True requests the results as a dict instead of a plain list.
scores = model_cnn...(..., return_dict=True, verbose=1)
print(f"Scores: ...")
### END CODING HERE ###

### START CODING HERE ####
# Predict test set
predictions = model_cnn...(...)
print(f"Shape of predictions: {...}")

# Plot confusion matrix
confmat = confusion_matrix(y_true=..., y_pred=, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=..., display_labels=...)
disp.plot(xticks_rotation=45)
...show()  # display plot
### END CODING HERE ###

Before you start

Machine Listening Seminar 3: Sound/Music classification with simple Neural Networks

1. Import libraries

2. Fetch the Dataset

3. Metadata and analysis

4. ESC-5: Curation

5. Splitting the dataset into train and test subsets

6. ESC-5: Create mel spectrograms

7. Train a nearest neighbor classifier

8. ESC-5: Plot the confusion matrix

Getting to know Tensorflow for Neural Networks

9. Import Tensorflow modules

10. One-Hot Encoding (OHE) of targets (binary class matrix)

11. Build a simple DNN (Sequential API & Functional API)

12. Build a simple CNN (Functional API)

13. Hyper-parameter tuning