- The transcription quality is not as good as it should be.
- The transcription only works when the language code is "en-US".
- I don't get any results when I set the language code to "en-IN".
Here is my Java code:
package com.amazonaws.transcribestreaming;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import com.amazonaws.kvstranscribestreaming.KVSStreamTrackObject;
import com.amazonaws.kvstranscribestreaming.KVSUtils;
import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig;
import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult;
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest;
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse;
import com.google.protobuf.ByteString;
import com.google.protobuf.Duration;
public class InfiniteStreamRecognize {
private static final int STREAMING_LIMIT = 290000; // restart just under the API's ~5-minute streaming limit
public static final String RED = "\033[0;31m";
public static final String GREEN = "\033[0;32m";
public static final String YELLOW = "\033[0;33m";
// Queue shared between the KVS reader thread (producer) and the streaming send loop (consumer)
private static volatile BlockingQueue<byte[]> sharedQueue = new LinkedBlockingQueue<byte[]>();
private static int restartCounter = 0;
private static ArrayList<ByteString> audioInput = new ArrayList<ByteString>();
private static ArrayList<ByteString> lastAudioInput = new ArrayList<ByteString>();
private static int resultEndTimeInMS = 0;
private static int isFinalEndTime = 0;
private static int finalRequestEndTime = 0;
private static boolean newStream = true;
private static double bridgingOffset = 0;
private static boolean lastTranscriptWasFinal = false;
private static StreamController referenceToStreamController;
private static ByteString tempByteString;
private static volatile boolean breakTranscription = false; // written by the KVS thread, read by the send loop
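/** Formats elapsed milliseconds as a "minutes:seconds /" timestamp for transcript output. */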
public static String convertMillisToDate(double milliSeconds) {
long millis = (long) milliSeconds;
DecimalFormat format = new DecimalFormat();
format.setMinimumIntegerDigits(2);
return String.format(
"%s:%s /",
format.format(TimeUnit.MILLISECONDS.toMinutes(millis)),
format.format(
TimeUnit.MILLISECONDS.toSeconds(millis)
- TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))));
}
/** Performs infinite streaming speech recognition */
public static void infiniteStreamingRecognize(String languageCode, KVSStreamTrackObject kvsStreamTrackObject, String contactId) throws Exception {
// AWS Input buffering
class AWSAudioBuffer implements Runnable {
@Override
public void run() {
try {
byte[] data;
ByteBuffer audioBuffer = KVSUtils.getByteBufferFromStream(kvsStreamTrackObject.getStreamingMkvReader(),
kvsStreamTrackObject.getFragmentVisitor(), kvsStreamTrackObject.getTagProcessor(), contactId, kvsStreamTrackObject.getTrackName());
do {
data = new byte[audioBuffer.remaining()];
audioBuffer.get(data);
sharedQueue.put(data.clone());
audioBuffer = KVSUtils.getByteBufferFromStream(kvsStreamTrackObject.getStreamingMkvReader(),
kvsStreamTrackObject.getFragmentVisitor(), kvsStreamTrackObject.getTagProcessor(), contactId, kvsStreamTrackObject.getTrackName());
// expensive op: also persist the raw audio chunk to the output stream
kvsStreamTrackObject.getOutputStream().write(data);
} while (data.length > 0);
} catch (Exception e) {
System.out.println("kvs interrupted, "+e.getMessage());
} finally {
// break transcription if no audio available
breakTranscription = true;
try {
System.out.println("Closing the audiofile.");
kvsStreamTrackObject.getOutputStream().close();
} catch (IOException e2) {
System.out.println("Error while closing file:-- "+e2.getMessage());
}
}
}
}
// Creating the KVS audio input buffer thread
AWSAudioBuffer awsRunnable = new AWSAudioBuffer();
Thread awsAudioThread = new Thread(awsRunnable);
// Response observer
ResponseObserver<StreamingRecognizeResponse> responseObserver = null;
try (SpeechClient client = SpeechClient.create()) {
ArrayList<String> languageList = new ArrayList<>();
languageList.add("en-IN");
languageList.add("hi-IN");
ClientStream<StreamingRecognizeRequest> clientStream;
responseObserver =
new ResponseObserver<StreamingRecognizeResponse>() {
ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>();
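// Keep a handle to the stream controller so the restart logic can cancel the stream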
public void onStart(StreamController controller) {
referenceToStreamController = controller;
}
public void onResponse(StreamingRecognizeResponse response) {
responses.add(response);
if (response.getResultsCount() == 0) {
return; // some responses carry no results (e.g. speech event notifications)
}
StreamingRecognitionResult result = response.getResultsList().get(0);
Duration resultEndTime = result.getResultEndTime();
resultEndTimeInMS =
(int)
((resultEndTime.getSeconds() * 1000) + (resultEndTime.getNanos() / 1000000));
double correctedTime =
resultEndTimeInMS - bridgingOffset + (STREAMING_LIMIT * restartCounter);
SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
if (result.getIsFinal()) {
System.out.print(GREEN);
System.out.print("\033[2K\r");
System.out.printf(
"%s: %s [confidence: %.2f]\n",
convertMillisToDate(correctedTime),
alternative.getTranscript(),
alternative.getConfidence());
isFinalEndTime = resultEndTimeInMS;
lastTranscriptWasFinal = true;
} else {
System.out.print(RED);
System.out.print("\033[2K\r");
System.out.printf(
"%s: %s", convertMillisToDate(correctedTime), alternative.getTranscript());
lastTranscriptWasFinal = false;
}
}
public void onComplete() {}
public void onError(Throwable t) {
// An empty handler hides stream failures (e.g. an invalid config); log them instead
System.out.println("Speech API error: " + t);
}
};
clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);
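// With alternativeLanguageCodes set, the service auto-detects one language per
// result from the primary language plus this list, and transcribes in that language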
RecognitionConfig recognitionConfig =
RecognitionConfig.newBuilder()
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
.setLanguageCode(languageCode)
.addAllAlternativeLanguageCodes(languageList)
.setSampleRateHertz(8000)
.build();
StreamingRecognitionConfig streamingRecognitionConfig =
StreamingRecognitionConfig.newBuilder()
.setConfig(recognitionConfig)
.setInterimResults(false)
.build();
StreamingRecognizeRequest request =
StreamingRecognizeRequest.newBuilder()
.setStreamingConfig(streamingRecognitionConfig)
.build(); // The first request in a streaming call has to be a config
clientStream.send(request);
try {
awsAudioThread.start();
long startTime = System.currentTimeMillis();
while (true) {
long estimatedTime = System.currentTimeMillis() - startTime;
if (estimatedTime >= STREAMING_LIMIT) {
clientStream.closeSend();
referenceToStreamController.cancel(); // cancel the current stream before restarting
if (resultEndTimeInMS > 0) {
finalRequestEndTime = isFinalEndTime;
}
resultEndTimeInMS = 0;
lastAudioInput = audioInput;
audioInput = new ArrayList<ByteString>();
restartCounter++;
if (!lastTranscriptWasFinal) {
System.out.print('\n');
}
newStream = true;
clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);
request =
StreamingRecognizeRequest.newBuilder()
.setStreamingConfig(streamingRecognitionConfig)
.build();
System.out.println(YELLOW);
System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_LIMIT);
startTime = System.currentTimeMillis();
} else {
if ((newStream) && (lastAudioInput.size() > 0)) {
// if this is the first audio from a new request
// calculate amount of unfinalized audio from last request
// resend the audio to the speech client before incoming audio
double chunkTime = (double) STREAMING_LIMIT / lastAudioInput.size();
// ms length of each chunk in previous request audio arrayList
if (chunkTime != 0) {
if (bridgingOffset < 0) {
// bridging Offset accounts for time of resent audio
// calculated from last request
bridgingOffset = 0;
}
if (bridgingOffset > finalRequestEndTime) {
bridgingOffset = finalRequestEndTime;
}
int chunksFromMs =
(int) Math.floor((finalRequestEndTime - bridgingOffset) / chunkTime);
// chunks from MS is number of chunks to resend
bridgingOffset =
(int) Math.floor((lastAudioInput.size() - chunksFromMs) * chunkTime);
// set bridging offset for next request
for (int i = chunksFromMs; i < lastAudioInput.size(); i++) {
request =
StreamingRecognizeRequest.newBuilder()
.setAudioContent(lastAudioInput.get(i))
.build();
clientStream.send(request);
}
}
newStream = false;
}
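// take() blocks until the KVS thread enqueues audio; the producer's final empty
// chunk unblocks it so the breakTranscription check below can run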
tempByteString = ByteString.copyFrom(sharedQueue.take());
request =
StreamingRecognizeRequest.newBuilder().setAudioContent(tempByteString).build();
audioInput.add(tempByteString);
}
clientStream.send(request);
if (breakTranscription) {
clientStream.closeSend();
break;
}
}
} catch (Exception e) {
System.out.println(e);
}
}
}
}
When I set the language code to "en-IN", only partial transcripts are produced, and they are poor. I also tried alternative languages by passing languageCode="en-US" and adding "en-IN" and "hi-IN" to the alternative language list, but that started producing transcripts in Hindi even when I was speaking in English. What am I doing wrong? Can someone help?
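For reference, here is a minimal sketch of the two configurations I tried (the variable names are just for illustration); only the language fields differ from the code above, everything else in the request is unchanged:

// Attempt 1: "en-IN" as the primary language, no alternatives
RecognitionConfig enInConfig =
RecognitionConfig.newBuilder()
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
.setLanguageCode("en-IN")
.setSampleRateHertz(8000)
.build();
// Attempt 2: "en-US" primary with "en-IN" and "hi-IN" as alternatives --
// this is the variant that started returning Hindi transcripts
RecognitionConfig withAlternatives =
RecognitionConfig.newBuilder()
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
.setLanguageCode("en-US")
.addAllAlternativeLanguageCodes(java.util.Arrays.asList("en-IN", "hi-IN"))
.setSampleRateHertz(8000)
.build();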