I am trying to use wav2vec 2.0 in conjunction with a CNN for speech emotion recognition. Four classes have been defined. All the audio files have been preprocessed, adequately truncated/padded, and resampled as required by the wav2vec 2.0 model. This is how the model has been defined:
class SimpleNN(nn.Module):
    """Two-layer fully connected classification head.

    Computes ``Linear -> ReLU -> Linear`` and returns one unnormalized score
    (logit) per class.

    Args:
        input_size: Number of input features per example.
        hidden_size: Width of the hidden layer.
        output_size: Number of output classes.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        # Kept so existing callers can still obtain probabilities at inference
        # time via ``net.softmax(net(x))``; deliberately NOT used in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw class logits for ``x``.

        Args:
            x: Input features whose last dimension equals ``input_size``.

        Returns:
            Tensor of unnormalized class scores whose last dimension equals
            ``output_size``.
        """
        x = self.fc1(x)
        x = self.relu(x)
        # No softmax here: nn.CrossEntropyLoss applies log-softmax internally
        # and expects unnormalized logits. Feeding it probabilities squashes
        # the scores twice, which flattens the gradients and stalls training.
        return self.fc2(x)
# Load the pretrained wav2vec 2.0 tokenizer and base model from the same checkpoint.
# NOTE(review): recent transformers releases appear to deprecate
# Wav2Vec2Tokenizer in favour of Wav2Vec2Processor -- confirm for the
# installed version.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Emotion label -> class index; the class count is derived from this mapping.
label_mapping = {'OAF_Fear': 0, 'OAF_angry': 1, 'OAF_happy': 2, 'OAF_neutral': 3}
num_classes = len(label_mapping)

# Attach a linear head with one output per emotion class.
# NOTE(review): this only sets an attribute named `lm_head` on the model;
# presumably Wav2Vec2Model.forward never calls it, in which case the head
# (and unfreezing it below) has no effect on the outputs -- verify.
model.lm_head = nn.Linear(
    in_features=model.config.hidden_size,
    out_features=num_classes,
    bias=True,
)

# Freeze every parameter of the model (including the head just attached),
# then re-enable gradients for the head's parameters only.
model.requires_grad_(False)
model.lm_head.requires_grad_(True)
# Location of the dataset on the mounted drive.
root_dir = "/content/drive/MyDrive/BTP_hanan_dataset/Dataset/TESS"

# Classifier dimensions.
input_size = 768  # Size of features extracted from pre-trained model
hidden_size = 256
output_size = num_classes  # Number of emotion classes

# Optimisation settings.
learning_rate = 0.001
batch_size = 32
num_epochs = 50
class FullModel(nn.Module):
    """Chain a feature-extracting backbone with a classification head.

    Args:
        wav2vec_model: Callable backbone; element ``[0]`` of its output is
            used as the hidden states.
        simple_nn_model: Head applied to the pooled hidden states.
    """

    def __init__(self, wav2vec_model, simple_nn_model):
        super().__init__()
        self.wav2vec_model = wav2vec_model
        self.simple_nn_model = simple_nn_model

    def forward(self, x):
        """Run the backbone, average its hidden states over dim 1, apply the head.

        Args:
            x: Input forwarded unchanged to the backbone.

        Returns:
            Whatever the head returns for the pooled hidden states.
        """
        backbone_out = self.wav2vec_model(x)
        # The first element of the backbone output holds the hidden states.
        # assumes layout (batch, frames, features) -- TODO confirm
        frames = backbone_out[0]
        # Collapse dim 1 by averaging so the head sees one vector per example.
        pooled = frames.mean(dim=1)
        return self.simple_nn_model(pooled)
# Build the classification head and make sure all of its parameters are trainable.
simple_nn = SimpleNN(input_size, hidden_size, output_size)
simple_nn.requires_grad_(True)

# Wrap the pre-trained model and the head in a single end-to-end module.
full_model = FullModel(model, simple_nn)

# Training objective and optimiser.
# nn.CrossEntropyLoss expects unnormalized scores (logits) as its input.
criterion = nn.CrossEntropyLoss()
# Every parameter of full_model is handed to Adam, including those whose
# requires_grad flag is False.
optimizer = optim.Adam(full_model.parameters(), lr=learning_rate)
The last layer of the pretrained model is trained, and its parameters are passed on to the simple CNN. The model stagnates at 35 percent accuracy.
I tried it on two different datasets, yet nothing improves. Early stopping is triggered after 7–10 epochs with patience = 5. What am I doing wrong?