I'm making small pet-project. It's voice assistant. Now, development is on speech recognition part, and i need to make silent detection, to react on user's phrase end. I use vosk for speech recognition, pyaudio to record audio, numpy for energy calculations, and wave to save output file. Here is a part of my code:
import vosk
import json
import pyaudio
import numpy as np
import wave
def is_silence(audio_chunk, silence_thresh=-50, energy_thresh=0.001):
# Convert the audio chunk to a NumPy array of 16-bit integers
audio_chunk = np.frombuffer(audio_chunk, dtype=np.int16)
# Calculate the energy of the audio chunk
energy = np.sum(audio_chunk.astype(np.float32) ** 2) / float(len(audio_chunk))
# Check if both energy and maximum absolute value are below thresholds
return energy < energy_thresh and np.max(np.abs(audio_chunk)) < silence_thresh
def record_audio(output_file, max_silence_duration=1500, sample_rate=16000, chunk_size=1024, silence_thresh=-50):
# Initialize PyAudio
p = pyaudio.PyAudio()
# Open a stream for recording audio
stream = p.open(format=pyaudio.paInt16,
channels=1,
rate=sample_rate,
input=True,
frames_per_buffer=chunk_size)
# Initialize variables for storing audio frames and counting consecutive silence
frames = []
consecutive_silence_counter = 0
print("Recording...")
while True:
# Read a chunk of audio data from the stream
data = stream.read(chunk_size)
frames.append(data)
# Convert the audio chunk to a NumPy array
audio_chunk = np.frombuffer(data, dtype=np.int16)
# Check if the audio chunk is silence
if is_silence(audio_chunk, silence_thresh=silence_thresh):
consecutive_silence_counter += 1
else:
consecutive_silence_counter = 0
# Break the loop if consecutive silence reaches the specified duration
if consecutive_silence_counter * chunk_size >= (max_silence_duration // chunk_size):
break
print("Recording finished.")
# Stop and close the audio stream, and terminate PyAudio
stream.stop_stream()
stream.close()
p.terminate()
# Write the recorded frames to a WAV file
with wave.open(output_file, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
wf.setframerate(sample_rate)
wf.writeframes(b''.join(frames))
# recognize_speech function
if __name__ == "__main__":
# Path to the Vosk model
model_path = "path/to/vosk-model-en-us-0.22-lgraph"
# Path to the output audio file
output_file_path = "path to audio file"
# Record audio and save it to the specified output file
audio_data = record_audio(output_file=output_file_path)
# Recognize speech from the recorded audio using the Vosk model
recognized_text = recognize_speech(model_path, audio_data)
# Print the recognized text
print("Recognized text:", recognized_text)
My question is why is_silence method returns False, even if i saying nothing. I have background noises, but they are not so loud. I already tried using pydub and webrtcvad, but they didn't work. It's not a problem with file saving, or speech recognition. I know that i need to edit is_silence method or while cycle in record_audio function, but i just haven't got any ideas how to make this. I run it on MacOS and in python 3.12 virtual environment.
I already tried using pydub, webrtcvad, pyttsx3, and pocketsphinx, but they don't work. I tried to edit some arguments to a methods like energy_thresh and silence_thresh, too. Any ideas to do this?