2020import com .google .api .gax .rpc .ClientStream ;
2121import com .google .api .gax .rpc .ResponseObserver ;
2222import com .google .api .gax .rpc .StreamController ;
23- import com .google .cloud .speech .v1 .RecognitionConfig ;
24- import com .google .cloud .speech .v1 .SpeechClient ;
25- import com .google .cloud .speech .v1 .SpeechRecognitionAlternative ;
26- import com .google .cloud .speech .v1 .StreamingRecognitionConfig ;
27- import com .google .cloud .speech .v1 .StreamingRecognitionResult ;
28- import com .google .cloud .speech .v1 .StreamingRecognizeRequest ;
29- import com .google .cloud .speech .v1 .StreamingRecognizeResponse ;
23+ import com .google .cloud .speech .v1p1beta1 .RecognitionConfig ;
24+ import com .google .cloud .speech .v1p1beta1 .SpeechClient ;
25+ import com .google .cloud .speech .v1p1beta1 .SpeechRecognitionAlternative ;
26+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognitionConfig ;
27+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognitionResult ;
28+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognizeRequest ;
29+ import com .google .cloud .speech .v1p1beta1 .StreamingRecognizeResponse ;
3030import com .google .protobuf .ByteString ;
31+ import com .google .protobuf .Duration ;
32+ import java .lang .Math ;
33+ import java .text .DecimalFormat ;
3134import java .util .ArrayList ;
3235import java .util .concurrent .BlockingQueue ;
3336import java .util .concurrent .LinkedBlockingQueue ;
3942
4043public class InfiniteStreamRecognize {
4144
45+ private static final int STREAMING_LIMIT = 10000 ; // 10 seconds
46+
47+ public static final String RED = "\033 [0;31m" ;
48+ public static final String GREEN = "\033 [0;32m" ;
49+ public static final String YELLOW = "\033 [0;33m" ;
50+
4251 // Creating shared object
4352 private static volatile BlockingQueue <byte []> sharedQueue = new LinkedBlockingQueue ();
4453 private static TargetDataLine targetDataLine ;
4554 private static int BYTES_PER_BUFFER = 6400 ; // buffer size in bytes
4655
56+ private static int restartCounter = 0 ;
57+ private static ArrayList <ByteString > audioInput = new ArrayList <ByteString >();
58+ private static ArrayList <ByteString > lastAudioInput = new ArrayList <ByteString >();
59+ private static int resultEndTimeInMS = 0 ;
60+ private static int isFinalEndTime = 0 ;
61+ private static int finalRequestEndTime = 0 ;
62+ private static boolean newStream = true ;
63+ private static double bridgingOffset = 0 ;
64+ private static boolean lastTranscriptWasFinal = false ;
65+ private static StreamController referenceToStreamController ;
66+ private static ByteString tempByteString ;
67+
4768 public static void main (String ... args ) {
4869 try {
4970 infiniteStreamingRecognize ();
@@ -60,6 +81,7 @@ class MicBuffer implements Runnable {
6081
6182 @ Override
6283 public void run () {
84+ System .out .println (YELLOW );
6385 System .out .println ("Start speaking...Press Ctrl-C to stop" );
6486 targetDataLine .start ();
6587 byte [] data = new byte [BYTES_PER_BUFFER ];
@@ -88,24 +110,48 @@ public void run() {
88110
89111 ArrayList <StreamingRecognizeResponse > responses = new ArrayList <>();
90112
91- public void onStart (StreamController controller ) {}
113+ public void onStart (StreamController controller ) {
114+ referenceToStreamController = controller ;
115+ }
92116
93117 public void onResponse (StreamingRecognizeResponse response ) {
118+
94119 responses .add (response );
120+
95121 StreamingRecognitionResult result = response .getResultsList ().get (0 );
96- // There can be several alternative transcripts for a given chunk of speech. Just
97- // use the first (most likely) one here.
122+
123+ Duration resultEndTime = result .getResultEndTime ();
124+
125+ resultEndTimeInMS = (int ) ((resultEndTime .getSeconds () * 1000 )
126+ + (resultEndTime .getNanos () / 1000000 ));
127+
128+ double correctedTime = resultEndTimeInMS - bridgingOffset
129+ + (STREAMING_LIMIT * restartCounter );
130+ DecimalFormat format = new DecimalFormat ("0.#" );
131+
98132 SpeechRecognitionAlternative alternative = result .getAlternativesList ().get (0 );
99- System .out .printf ("Transcript : %s\n " , alternative .getTranscript ());
100- }
133+ if (result .getIsFinal ()) {
134+ System .out .print (GREEN );
135+ System .out .print ("\033 [2K\r " );
136+ System .out .printf ("%s: %s\n " , format .format (correctedTime ),
137+ alternative .getTranscript ());
101138
102- public void onComplete () {
103- System .out .println ("Done" );
104- }
139+ isFinalEndTime = resultEndTimeInMS ;
140+ lastTranscriptWasFinal = true ;
141+ } else {
142+ System .out .print (RED );
143+ System .out .print ("\033 [2K\r " );
144+ System .out .printf ("%s: %s" , format .format (correctedTime ),
145+ alternative .getTranscript ());
105146
106- public void onError ( Throwable t ) {
107- System . out . println ( t );
147+ lastTranscriptWasFinal = false ;
148+ }
108149 }
150+
151+ public void onComplete () {}
152+
153+ public void onError (Throwable t ) {}
154+
109155 };
110156
111157 clientStream = client .streamingRecognizeCallable ().splitCall (responseObserver );
@@ -116,8 +162,12 @@ public void onError(Throwable t) {
116162 .setLanguageCode ("en-US" )
117163 .setSampleRateHertz (16000 )
118164 .build ();
165+
119166 StreamingRecognitionConfig streamingRecognitionConfig =
120- StreamingRecognitionConfig .newBuilder ().setConfig (recognitionConfig ).build ();
167+ StreamingRecognitionConfig .newBuilder ()
168+ .setConfig (recognitionConfig )
169+ .setInterimResults (true )
170+ .build ();
121171
122172 StreamingRecognizeRequest request =
123173 StreamingRecognizeRequest .newBuilder ()
@@ -151,23 +201,84 @@ public void onError(Throwable t) {
151201
152202 long estimatedTime = System .currentTimeMillis () - startTime ;
153203
154- if (estimatedTime >= 55000 ) {
204+ if (estimatedTime >= STREAMING_LIMIT ) {
155205
156206 clientStream .closeSend ();
207+ referenceToStreamController .cancel (); // remove Observer
208+
209+ if (resultEndTimeInMS > 0 ) {
210+ finalRequestEndTime = isFinalEndTime ;
211+ }
212+ resultEndTimeInMS = 0 ;
213+
214+ lastAudioInput = null ;
215+ lastAudioInput = audioInput ;
216+ audioInput = new ArrayList <ByteString >();
217+
218+ restartCounter ++;
219+
220+ if (!lastTranscriptWasFinal ) {
221+ System .out .print ('\n' );
222+ }
223+
224+ newStream = true ;
225+
157226 clientStream = client .streamingRecognizeCallable ().splitCall (responseObserver );
158227
159228 request =
160229 StreamingRecognizeRequest .newBuilder ()
161230 .setStreamingConfig (streamingRecognitionConfig )
162231 .build ();
163232
233+ System .out .println (YELLOW );
234+ System .out .printf ("%d: RESTARTING REQUEST\n " , restartCounter * STREAMING_LIMIT );
235+
164236 startTime = System .currentTimeMillis ();
165237
166238 } else {
239+
240+ if ((newStream ) && (lastAudioInput .size () > 0 )) {
241+ // if this is the first audio from a new request
242+ // calculate amount of unfinalized audio from last request
243+ // resend the audio to the speech client before incoming audio
244+ double chunkTime = STREAMING_LIMIT / lastAudioInput .size ();
245+ // ms length of each chunk in previous request audio arrayList
246+ if (chunkTime != 0 ) {
247+ if (bridgingOffset < 0 ) {
248+ // bridging Offset accounts for time of resent audio
249+ // calculated from last request
250+ bridgingOffset = 0 ;
251+ }
252+ if (bridgingOffset > finalRequestEndTime ) {
253+ bridgingOffset = finalRequestEndTime ;
254+ }
255+ int chunksFromMS = (int ) Math .floor ((finalRequestEndTime
256+ - bridgingOffset ) / chunkTime );
257+ // chunks from MS is number of chunks to resend
258+ bridgingOffset = (int ) Math .floor ((lastAudioInput .size ()
259+ - chunksFromMS ) * chunkTime );
260+ // set bridging offset for next request
261+ for (int i = chunksFromMS ; i < lastAudioInput .size (); i ++) {
262+
263+ request =
264+ StreamingRecognizeRequest .newBuilder ()
265+ .setAudioContent (lastAudioInput .get (i ))
266+ .build ();
267+ clientStream .send (request );
268+ }
269+ }
270+ newStream = false ;
271+ }
272+
273+ tempByteString = ByteString .copyFrom (sharedQueue .take ());
274+
167275 request =
168276 StreamingRecognizeRequest .newBuilder ()
169- .setAudioContent (ByteString . copyFrom ( sharedQueue . take ()) )
277+ .setAudioContent (tempByteString )
170278 .build ();
279+
280+ audioInput .add (tempByteString );
281+
171282 }
172283
173284 clientStream .send (request );
0 commit comments