@@ -19,59 +19,82 @@ const MultimodalLiveResponseType = {
1919/**
2020 * Parses response messages from the Gemini Live API
2121 */
22- class MultimodalLiveResponseMessage {
23- constructor ( data ) {
24- this . data = "" ;
25- this . type = "" ;
26- this . endOfTurn = false ;
22+ /**
23+ * Parses ALL response types from a single server message.
24+ * The server can now bundle multiple fields (e.g. audio + transcription)
25+ * in the same message. Returns an array of response objects.
26+ */
27+ function parseResponseMessages ( data ) {
28+ const responses = [ ] ;
29+ const serverContent = data ?. serverContent ;
30+ const parts = serverContent ?. modelTurn ?. parts ;
31+
32+ try {
33+ // Setup complete (exclusive — no other fields expected)
34+ if ( data ?. setupComplete ) {
35+ console . log ( "🏁 SETUP COMPLETE response" , data ) ;
36+ responses . push ( { type : MultimodalLiveResponseType . SETUP_COMPLETE , data : "" , endOfTurn : false } ) ;
37+ return responses ;
38+ }
2739
28- console . log ( "raw message data: " , data ) ;
40+ // Tool call (exclusive)
41+ if ( data ?. toolCall ) {
42+ console . log ( "🎯 🛠️ TOOL CALL response" , data ?. toolCall ) ;
43+ responses . push ( { type : MultimodalLiveResponseType . TOOL_CALL , data : data . toolCall , endOfTurn : false } ) ;
44+ return responses ;
45+ }
2946
30- const serverContent = data ?. serverContent ;
31- this . endOfTurn = serverContent ?. turnComplete ;
32- const parts = serverContent ?. modelTurn ?. parts ;
47+ // Audio data from model turn parts
48+ if ( parts ?. length ) {
49+ for ( const part of parts ) {
50+ if ( part . inlineData ) {
51+ responses . push ( { type : MultimodalLiveResponseType . AUDIO , data : part . inlineData . data , endOfTurn : false } ) ;
52+ } else if ( part . text ) {
53+ console . log ( "💬 TEXT response" , part . text ) ;
54+ responses . push ( { type : MultimodalLiveResponseType . TEXT , data : part . text , endOfTurn : false } ) ;
55+ }
56+ }
57+ }
3358
34- try {
35- if ( data ?. setupComplete ) {
36- console . log ( "🏁 SETUP COMPLETE response" , data ) ;
37- this . type = MultimodalLiveResponseType . SETUP_COMPLETE ;
38- } else if ( serverContent ?. turnComplete ) {
39- console . log ( "🏁 TURN COMPLETE response" ) ;
40- this . type = MultimodalLiveResponseType . TURN_COMPLETE ;
41- } else if ( serverContent ?. interrupted ) {
42- console . log ( "🗣️ INTERRUPTED response" ) ;
43- this . type = MultimodalLiveResponseType . INTERRUPTED ;
44- } else if ( serverContent ?. inputTranscription ) {
45- console . log ( "📝 INPUT TRANSCRIPTION:" , serverContent . inputTranscription ) ;
46- this . type = MultimodalLiveResponseType . INPUT_TRANSCRIPTION ;
47- this . data = {
59+ // Transcriptions — checked independently, NOT in else-if with audio
60+ if ( serverContent ?. inputTranscription ) {
61+ responses . push ( {
62+ type : MultimodalLiveResponseType . INPUT_TRANSCRIPTION ,
63+ data : {
4864 text : serverContent . inputTranscription . text || "" ,
4965 finished : serverContent . inputTranscription . finished || false ,
50- } ;
51- } else if ( serverContent ?. outputTranscription ) {
52- console . log ( "📝 OUTPUT TRANSCRIPTION:" , serverContent . outputTranscription ) ;
53- this . type = MultimodalLiveResponseType . OUTPUT_TRANSCRIPTION ;
54- this . data = {
66+ } ,
67+ endOfTurn : false ,
68+ } ) ;
69+ }
70+
71+ if ( serverContent ?. outputTranscription ) {
72+ responses . push ( {
73+ type : MultimodalLiveResponseType . OUTPUT_TRANSCRIPTION ,
74+ data : {
5575 text : serverContent . outputTranscription . text || "" ,
5676 finished : serverContent . outputTranscription . finished || false ,
57- } ;
58- } else if ( data ?. toolCall ) {
59- console . log ( "🎯 🛠️ TOOL CALL response" , data ?. toolCall ) ;
60- this . type = MultimodalLiveResponseType . TOOL_CALL ;
61- this . data = data ?. toolCall ;
62- } else if ( parts ?. length && parts [ 0 ] . text ) {
63- console . log ( "💬 TEXT response" , parts [ 0 ] . text ) ;
64- this . data = parts [ 0 ] . text ;
65- this . type = MultimodalLiveResponseType . TEXT ;
66- } else if ( parts ?. length && parts [ 0 ] . inlineData ) {
67- console . log ( "🔊 AUDIO response" ) ;
68- this . data = parts [ 0 ] . inlineData . data ;
69- this . type = MultimodalLiveResponseType . AUDIO ;
70- }
71- } catch ( err ) {
72- console . log ( "⚠️ Error parsing response data: " , err , data ) ;
77+ } ,
78+ endOfTurn : false ,
79+ } ) ;
7380 }
81+
82+ // Interrupted
83+ if ( serverContent ?. interrupted ) {
84+ console . log ( "🗣️ INTERRUPTED response" ) ;
85+ responses . push ( { type : MultimodalLiveResponseType . INTERRUPTED , data : "" , endOfTurn : false } ) ;
86+ }
87+
88+ // Turn complete
89+ if ( serverContent ?. turnComplete ) {
90+ console . log ( "🏁 TURN COMPLETE response" ) ;
91+ responses . push ( { type : MultimodalLiveResponseType . TURN_COMPLETE , data : "" , endOfTurn : true } ) ;
92+ }
93+ } catch ( err ) {
94+ console . log ( "⚠️ Error parsing response data: " , err , data ) ;
7495 }
96+
97+ return responses ;
7598}
7699
77100/**
@@ -105,7 +128,7 @@ class FunctionCallDefinition {
105128 parameters
106129 ) } `
107130 ) ;
108- this . functionToCall ( parameters ) ;
131+ return this . functionToCall ( parameters ) ;
109132 }
110133}
111134
@@ -121,10 +144,8 @@ class GeminiLiveAPI {
121144 this . responseModalities = [ "AUDIO" ] ;
122145 this . systemInstructions = "" ;
123146 this . googleGrounding = false ;
124- this . enableAffectiveDialog = false ; // Default affective dialog
125147 this . voiceName = "Puck" ; // Default voice
126148 this . temperature = 1.0 ; // Default temperature
127- this . proactivity = { proactiveAudio : false } ; // Proactivity config
128149 this . inputAudioTranscription = false ;
129150 this . outputAudioTranscription = false ;
130151 this . enableFunctionCalls = false ;
@@ -195,10 +216,7 @@ class GeminiLiveAPI {
195216 this . voiceName = voiceName ;
196217 }
197218
198- setProactivity ( proactivity ) {
199- console . log ( "setting proactivity: " , proactivity ) ;
200- this . proactivity = proactivity ;
201- }
219+
202220
203221 setInputAudioTranscription ( enabled ) {
204222 console . log ( "setting input audio transcription: " , enabled ) ;
@@ -223,7 +241,7 @@ class GeminiLiveAPI {
223241
224242 callFunction ( functionName , parameters ) {
225243 const functionToCall = this . functionsMap [ functionName ] ;
226- functionToCall . runFunction ( parameters ) ;
244+ return functionToCall . runFunction ( parameters ) ;
227245 }
228246
229247 connect ( ) {
@@ -238,15 +256,12 @@ class GeminiLiveAPI {
238256 }
239257
240258 sendMessage ( message ) {
241- console . log ( "🟩 Sending message: " , message ) ;
242259 if ( this . webSocket && this . webSocket . readyState === WebSocket . OPEN ) {
243260 this . webSocket . send ( JSON . stringify ( message ) ) ;
244261 }
245262 }
246263
247264 async onReceiveMessage ( messageEvent ) {
248- console . log ( "Message received: " , messageEvent ) ;
249-
250265 let jsonData ;
251266 if ( messageEvent . data instanceof Blob ) {
252267 jsonData = await messageEvent . data . text ( ) ;
@@ -258,8 +273,11 @@ class GeminiLiveAPI {
258273
259274 try {
260275 const messageData = JSON . parse ( jsonData ) ;
261- const message = new MultimodalLiveResponseMessage ( messageData ) ;
262- this . onReceiveResponse ( message ) ;
276+ // Parse all response types from this message (audio + transcription can coexist)
277+ const responses = parseResponseMessages ( messageData ) ;
278+ for ( const response of responses ) {
279+ this . onReceiveResponse ( response ) ;
280+ }
263281 } catch ( err ) {
264282 console . error ( "Error parsing JSON message:" , err , jsonData ) ;
265283 }
@@ -322,8 +340,8 @@ class GeminiLiveAPI {
322340 } ,
323341 } ,
324342 systemInstruction : { parts : [ { text : this . systemInstructions } ] } ,
325- tools : { functionDeclarations : tools } ,
326- proactivity : this . proactivity ,
343+ tools : [ { functionDeclarations : tools } ] ,
344+
327345
328346 realtimeInputConfig : {
329347 automaticActivityDetection : {
@@ -334,6 +352,7 @@ class GeminiLiveAPI {
334352 startOfSpeechSensitivity : this . automaticActivityDetection . start_of_speech_sensitivity ,
335353 } ,
336354 activityHandling : this . activityHandling ,
355+ turnCoverage : "TURN_INCLUDES_ONLY_ACTIVITY" ,
337356 } ,
338357 } ,
339358 } ;
@@ -347,18 +366,14 @@ class GeminiLiveAPI {
347366 }
348367
349368 if ( this . googleGrounding ) {
350- sessionSetupMessage . setup . tools . googleSearch = { } ;
351369 // Currently can't have both Google Search with custom tools.
352370 console . log (
353371 "Google Grounding enabled, removing custom function calls if any."
354372 ) ;
355- delete sessionSetupMessage . setup . tools . functionDeclarations ;
373+ sessionSetupMessage . setup . tools = [ { googleSearch : { } } ] ;
356374 }
357375
358- // Add affective dialog if enabled
359- if ( this . enableAffectiveDialog ) {
360- sessionSetupMessage . setup . generationConfig . enableAffectiveDialog = true ;
361- }
376+
362377
363378 // Store the setup message for later access
364379 this . lastSetupMessage = sessionSetupMessage ;
@@ -376,11 +391,10 @@ class GeminiLiveAPI {
376391 this . sendMessage ( message ) ;
377392 }
378393
379- sendToolResponse ( toolCallId , response ) {
394+ sendToolResponse ( functionResponses ) {
380395 const message = {
381396 toolResponse : {
382- id : toolCallId ,
383- response : response ,
397+ functionResponses : functionResponses ,
384398 } ,
385399 } ;
386400 console . log ( "🔧 Sending tool response:" , message ) ;