The following NodeJS code handles Text-to-Speech successfully.
However, calling the /transcribe end-point for Speech-to-Text returns a successful result, yet it also throws an error:
STT ERROR: Error: 7 PERMISSION_DENIED: The caller does not have permission
at callErrorFromStatus (C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\call.js:31:19)
at Object.onReceiveStatus (C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\client.js:192:76)
at Object.onReceiveStatus (C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\client-interceptors.js:360:141)
at Object.onReceiveStatus (C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\client-interceptors.js:323:181)
at C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\resolving-call.js:99:78
at process.processTicksAndRejections (node:internal/process/task_queues:77:11)
for call at
at ServiceClientImpl.makeUnaryRequest (C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\client.js:160:32)
at ServiceClientImpl.<anonymous> (C:\DevEnv\Projects\my-dream-team-tts\node_modules\@grpc\grpc-js\build\src\make-client.js:105:19)
at C:\DevEnv\Projects\my-dream-team-tts\node_modules\@google-cloud\speech\build\src\v1\speech_client.js:229:29
at C:\DevEnv\Projects\my-dream-team-tts\node_modules\google-gax\build\src\normalCalls\timeout.js:44:16
at LongrunningApiCaller._wrapOperation (C:\DevEnv\Projects\my-dream-team-tts\node_modules\google-gax\build\src\longRunningCalls\longRunningApiCaller.js:55:16)
at C:\DevEnv\Projects\my-dream-team-tts\node_modules\google-gax\build\src\longRunningCalls\longRunningApiCaller.js:46:25
at OngoingCallPromise.call (C:\DevEnv\Projects\my-dream-team-tts\node_modules\google-gax\build\src\call.js:67:27)
at LongrunningApiCaller.call (C:\DevEnv\Projects\my-dream-team-tts\node_modules\google-gax\build\src\longRunningCalls\longRunningApiCaller.js:45:19)
}
],
totalBilledTime: { seconds: '4', nanos: 0 },
speechAdaptationInfo: null,
requestId: '5550510368443781709'
}
About to send back transribed text: are you listening to me or not
This is the NodeJS code:
// Load environment variables and third-party modules.
require('dotenv').config();
const fs = require('fs');
const path = require('path');
const express = require('express');
const bodyParser = require('body-parser');
const { TextToSpeechClient } = require('@google-cloud/text-to-speech');
const { SpeechClient } = require('@google-cloud/speech');
const app = express();
const port = 3005;
// Path to the service-account JSON key file (set in .env as GOOGLE_APP_CREDENTIALS).
const credentialsPath = process.env.GOOGLE_APP_CREDENTIALS;
console.log('credentialsPath: ', credentialsPath);
if (!credentialsPath) {
  // Fail fast with an actionable message instead of a cryptic readFileSync error.
  throw new Error('GOOGLE_APP_CREDENTIALS is not set; point it at your service-account JSON key file.');
}
const credentials = JSON.parse(fs.readFileSync(credentialsPath, 'utf8'));
// NOTE(review): passing only client_email/private_key discards the project id
// that lives in the key file. Explicitly pinning projectId keeps every call —
// including the long-running operation polling done by google-gax — resolved
// against the key's own project, which is a common cause of a request
// succeeding while a follow-up poll fails with PERMISSION_DENIED.
//TTS
const client = new TextToSpeechClient({
  projectId: credentials.project_id,
  credentials: {
    client_email: credentials.client_email,
    private_key: credentials.private_key,
  },
});
//STT
const speechClient = new SpeechClient({
  projectId: credentials.project_id,
  credentials: {
    client_email: credentials.client_email,
    private_key: credentials.private_key,
  },
});
// Recordings at or below this length (ms) use the synchronous STT `recognize`
// method; anything longer falls through to the long-running API.
const LONG_RECORDING_THRESHOLD_MS = 59000; //Up to 1 min - use the Google's STT recognize method
app.use(bodyParser.json());
// Permissive CORS: any origin, plus the headers the Angular client sends.
const CORS_HEADERS = {
  'Access-Control-Allow-Origin': '*',
  'Access-Control-Allow-Headers': 'Origin, X-Requested-With, Content-Type, Accept',
};
app.use((req, res, next) => {
  res.set(CORS_HEADERS);
  next();
});
/* TEXT-TO-SPEECH */
/* SPEECH-To-TEXT */
// POST /transcribe — body: { audio: <base64 string>, duration: <ms> }.
// Responds with { transcription } on success, { error } with 400/500 otherwise.
app.post('/transcribe', (req, res) => {
  console.log('**** Received a TRANSCRIBE request: ', req.body);
  const audioFile = req.body.audio;
  const duration = Number(req.body.duration); //In milliseconds
  if (!audioFile) {
    res.status(400).send({ error: 'Missing "audio" (base64 string) in request body.' });
    return;
  }
  // 'BASE64' is NOT a valid RecognitionConfig.AudioEncoding value — base64 is
  // merely how `audio.content` is transported in JSON, not a codec. The
  // browser's MediaRecorder produces WebM/Opus (regardless of the 'audio/wav'
  // label the client puts on the Blob), so WEBM_OPUS at 48 kHz is the match.
  const request = {
    audio: { content: audioFile },
    config: {
      encoding: 'WEBM_OPUS',
      sampleRateHertz: 48000,
      languageCode: 'en-US',
    },
  };
  console.log('*** About to call speechClient with request: ', request);
  // Check audio length to decide which method to use. A missing/invalid
  // duration (NaN) deliberately falls into the long-running branch, matching
  // the original `NaN <= threshold === false` behavior.
  if (Number.isFinite(duration) && duration <= LONG_RECORDING_THRESHOLD_MS) {
    speechClient.recognize(request)
      .then(handleResponse)
      .catch(handleError);
  } else {
    // WILL NOT WORK for inline content of this length — longRunningRecognize
    // requires the audio to be uploaded to a GCS bucket first (audio.uri).
    speechClient.longRunningRecognize(request)
      .then(data => {
        const operation = data[0];
        return operation.promise();
      })
      .then(handleResponse)
      .catch(handleError);
  }
  // Joins the top alternative of every result into one newline-separated
  // string and returns it to the client.
  function handleResponse([response]) {
    console.log('Handling successful transcribe result: ', response);
    const transcription = response.results
      .map(result => result.alternatives[0].transcript)
      .join('\n');
    console.log('About to send back transcribed text: ', transcription);
    res.send({ transcription: transcription });
  }
  // Logs and returns a 500 — but only if nothing has been sent yet, so a
  // late error can't trigger an ERR_HTTP_HEADERS_SENT crash after success.
  function handleError(error) {
    console.log('STT ERROR: ', error);
    if (!res.headersSent) {
      res.status(500).send({ error: error.message });
    }
  }
});
// Boot the HTTP server on the configured port.
const announceStartup = () => {
  console.log(`Server started on http://localhost:${port}`);
};
app.listen(port, announceStartup);
The client (Angular) web app passes a base64-encoded audio file that is captured here:
/**
 * Starts microphone capture via MediaRecorder; on stop, base64-encodes the
 * recording and sends it (with its duration in ms) to the /transcribe service.
 */
startRecording() {
  this.audioChunks = [];
  const stream = navigator.mediaDevices.getUserMedia({ audio: true });
  stream.then(audioStream => {
    this.mediaRecorder = new MediaRecorder(audioStream);
    this.mediaRecorder.ondataavailable = (event: { data: any; }) => {
      this.audioChunks.push(event.data);
    };
    this.mediaRecorder.onstop = async () => {
      const recordingDuration = Date.now() - this.recordingStartTime;
      // BUG FIX: MediaRecorder does not produce WAV — Chrome/Firefox emit
      // WebM/Opus. Labelling the Blob 'audio/wav' misdescribes the payload
      // and misleads server-side encoding choices; use the recorder's real
      // container type instead.
      const mimeType = this.mediaRecorder.mimeType || 'audio/webm';
      const audioBlob = new Blob(this.audioChunks, { type: mimeType });
      const audioBase64 = await this.textToSpeech.blobToBase64(audioBlob);
      const payload = {
        audio: audioBase64,
        duration: recordingDuration
      };
      this.textToSpeech.transcribeAudio(payload).pipe(takeUntilDestroyed(this.destroyRef)).subscribe(response => {
        console.log('Transcribed text upon FINISH:', response.transcription);
        this.chatService.chtPrompt.set(response.transcription);
        setTimeout(() => {
          //this.promptEnterClicked();
        }, 500);
      });
    };
    this.mediaRecorder.start();
    this.audioIcon = "pi pi-stop-circle";
    this.promptPlaceholder = "Recording";
    this.recordingStartTime = Date.now();
  });
}
(When played-back in the browser for debugging, the captured audio file sounds great).
This is how the NodeJS service is called:
/**
 * POSTs the { audio, duration } payload to the Node /transcribe endpoint.
 * Emits once (take(1)); errors are routed through handleError.
 */
transcribeAudio(payload: any): Observable<any> {
  return this.http.post(`${this.baseUrl}/transcribe`, payload)
    .pipe(
      // BUG FIX: passing `this.handleError` unbound loses `this` when RxJS
      // invokes it; wrap in an arrow to preserve the service instance.
      // (retry(0) was removed — retrying zero times is a no-op.)
      catchError((err) => this.handleError(err)),
      take(1)
    );
}
I followed the instructions on setting up the account, billing, credentials, service account - and using the json file as the GOOGLE_APP_CREDENTIALS.
The service account inherits permissions from my single project, where it has the Owner role. I also tried granting the service account the Cloud Speech Client and Cloud Speech Administrator roles specifically:
Is there any specific permission I'm missing in the service account or something?
How is it possible that I receive both a successful response containing the transcribed text string and a PERMISSION_DENIED error?
