I have a neural network that tries to separate noise from people's voices, and I want to test it on a file. I used the code below, but what I got back is just a buzzing sound that bears no relation to the input. I think the problem is that I am using only the magnitude to reconstruct the signal, whereas I need to use the phase as well, as stated in my requirements.
How do I do that?
import os
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
# Define the model architecture
# NOTE(review): the class body is elided ("...") in this snippet; the real
# definition has to be pasted here. It presumably accepts `input_channels`
# and `start_neurons`, since the model is instantiated with those keyword
# arguments further down — confirm against the training code.
class AudioUNet(nn.Module):
    ...
# Create an instance of the model
model = AudioUNet(input_channels=1, start_neurons=16)
# Load the trained model weights.
# map_location='cpu' lets a checkpoint that was written from a GPU run be
# restored on a machine without CUDA; load_state_dict() then copies the
# values into the parameters of the model created above.
# NOTE: torch.load() unpickles the file, so only load checkpoints you trust.
checkpoint_path = 'models/checkpoint_epoch.pth'
checkpoint = torch.load(checkpoint_path, map_location='cpu')
# The checkpoint is indexed like a dict; the weights are stored under
# the 'model_state_dict' key.
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # Set the model to evaluation mode
# Load the noisy audio file.
# Raw string: the Windows path contains backslashes that must not be parsed
# as escape sequences.
noisy_audio_file = r'F:\IRRI\combinate\mixed_audio_90.wav'
# always_2d=True makes sf.read() return a (frames, channels) array even for
# mono files.
noisy_waveform, sample_rate = sf.read(noisy_audio_file, always_2d=True)
# Mix all channels down to one 1-D signal of length `frames`. For a mono file
# this is the same as dropping the channel axis; for a multi-channel file it
# prevents torch.stft() from treating every frame as a separate batch item.
noisy_waveform = torch.from_numpy(noisy_waveform).to(torch.float32).mean(dim=1)
# Apply STFT to the noisy audio file
noisy_specgram = torch.stft(
    noisy_waveform,
    n_fft=1024,
    hop_length=512,
    win_length=1024,
    window=torch.hann_window(1024),
    return_complex=True,  # Complex output keeps the phase needed for istft
)
# Magnitude of every complex STFT bin; abs() is equivalent to
# sqrt(real**2 + imag**2) without forming the intermediate squares.
noisy_magnitude = noisy_specgram.abs()
# Forward pass through the model
with torch.no_grad():
    # NOTE(review): unsqueeze(0) adds one leading axis to the magnitude
    # spectrogram. Whether the network expects an additional batch/channel
    # axis, and whether it was trained on linear magnitudes (rather than
    # log-magnitudes, normalised inputs or masks), cannot be seen here —
    # confirm against the training code; a mismatch there would explain
    # a noisy/buzzing result.
    output_magnitude = model(noisy_magnitude.unsqueeze(0))
# Drop the singleton axes so the estimate lines up with the STFT grid.
# (No NumPy round-trip is needed: the result is already a tensor.)
output_magnitude = output_magnitude.squeeze()
# Fail loudly instead of silently broadcasting when the network changed the
# spectrogram size.
if output_magnitude.shape != noisy_specgram.shape:
    raise ValueError(
        f"Model output shape {tuple(output_magnitude.shape)} does not match "
        f"STFT shape {tuple(noisy_specgram.shape)}; "
        "cannot combine the estimated magnitude with the noisy phase."
    )
# Reconstruct the denoised audio waveform.
# The phase is taken from the noisy STFT and combined with the estimated
# magnitude: S = |S_est| * exp(j * phase_noisy).
noisy_phase = torch.angle(noisy_specgram)
output_specgram = output_magnitude * torch.exp(1j * noisy_phase)
# Same STFT parameters as the analysis step; `length` trims/pads the result
# to the number of samples of the input signal.
denoised_waveform = torch.istft(
    output_specgram,
    n_fft=1024,
    hop_length=512,
    win_length=1024,
    window=torch.hann_window(1024),
    length=noisy_waveform.shape[0],
)
# Destination of the denoised result
denoised_audio_file = 'denoised.wav'
# soundfile expects array data, so hand the waveform over as a NumPy array
denoised_waveform = denoised_waveform.numpy()
# Write the WAV file using the sample rate of the input file
sf.write(
    file=denoised_audio_file,
    data=denoised_waveform,
    samplerate=sample_rate,
)
print(f"Denoised audio saved at: {denoised_audio_file}")