@@ -19,59 +19,82 @@ const MultimodalLiveResponseType = {
/**
 * Parses response messages from the Gemini Live API.
 *
 * A single server message can bundle several fields (e.g. audio together
 * with a transcription), so each recognised field becomes its own response
 * object and all of them are returned in one array.
 *
 * Emission order: model-turn parts (audio/text, in part order), input
 * transcription, output transcription, INTERRUPTED, TURN_COMPLETE.
 * `setupComplete` and `toolCall` are handled as exclusive: when either is
 * present, only that single response is returned.
 *
 * @param {object} data - Parsed JSON message received from the server.
 * @returns {Array<object>} Zero or more `{ type, data, endOfTurn }` objects,
 *   where `type` is a MultimodalLiveResponseType value. Only the
 *   TURN_COMPLETE response carries `endOfTurn: true`.
 */
function parseResponseMessages(data) {
  const responses = [];
  const serverContent = data?.serverContent;
  const parts = serverContent?.modelTurn?.parts;

  // Normalises a transcription payload to the { text, finished } shape.
  const toTranscription = (transcription) => ({
    text: transcription.text || "",
    finished: transcription.finished || false,
  });

  try {
    // Setup complete (exclusive — no other fields expected)
    if (data?.setupComplete) {
      console.log("🏁 SETUP COMPLETE response", data);
      responses.push({ type: MultimodalLiveResponseType.SETUP_COMPLETE, data: "", endOfTurn: false });
      return responses;
    }

    // Tool call (exclusive)
    if (data?.toolCall) {
      console.log("🎯 🛠️ TOOL CALL response", data?.toolCall);
      responses.push({ type: MultimodalLiveResponseType.TOOL_CALL, data: data.toolCall, endOfTurn: false });
      return responses;
    }

    // Audio / text from model turn parts. Each part is accessed with `?.`
    // so that one null or malformed entry cannot throw and, via the shared
    // catch below, discard the other fields bundled in the same message.
    if (Array.isArray(parts)) {
      for (const part of parts) {
        if (part?.inlineData) {
          responses.push({ type: MultimodalLiveResponseType.AUDIO, data: part.inlineData.data, endOfTurn: false });
        } else if (part?.text) {
          console.log("💬 TEXT response", part.text);
          responses.push({ type: MultimodalLiveResponseType.TEXT, data: part.text, endOfTurn: false });
        }
      }
    }

    // Transcriptions — checked independently, NOT in else-if with audio
    if (serverContent?.inputTranscription) {
      responses.push({
        type: MultimodalLiveResponseType.INPUT_TRANSCRIPTION,
        data: toTranscription(serverContent.inputTranscription),
        endOfTurn: false,
      });
    }

    if (serverContent?.outputTranscription) {
      responses.push({
        type: MultimodalLiveResponseType.OUTPUT_TRANSCRIPTION,
        data: toTranscription(serverContent.outputTranscription),
        endOfTurn: false,
      });
    }

    // Interrupted
    if (serverContent?.interrupted) {
      console.log("🗣️ INTERRUPTED response");
      responses.push({ type: MultimodalLiveResponseType.INTERRUPTED, data: "", endOfTurn: false });
    }

    // Turn complete — emitted last so consumers see the content first
    if (serverContent?.turnComplete) {
      console.log("🏁 TURN COMPLETE response");
      responses.push({ type: MultimodalLiveResponseType.TURN_COMPLETE, data: "", endOfTurn: true });
    }
  } catch (err) {
    // Responses collected before the failure are still returned below.
    console.error("⚠️ Error parsing response data: ", err, data);
  }

  return responses;
}
7699
77100/**
@@ -238,15 +261,12 @@ class GeminiLiveAPI {
238261 }
239262
240263 sendMessage ( message ) {
241- console . log ( "🟩 Sending message: " , message ) ;
242264 if ( this . webSocket && this . webSocket . readyState === WebSocket . OPEN ) {
243265 this . webSocket . send ( JSON . stringify ( message ) ) ;
244266 }
245267 }
246268
247269 async onReceiveMessage ( messageEvent ) {
248- console . log ( "Message received: " , messageEvent ) ;
249-
250270 let jsonData ;
251271 if ( messageEvent . data instanceof Blob ) {
252272 jsonData = await messageEvent . data . text ( ) ;
@@ -258,8 +278,11 @@ class GeminiLiveAPI {
258278
259279 try {
260280 const messageData = JSON . parse ( jsonData ) ;
261- const message = new MultimodalLiveResponseMessage ( messageData ) ;
262- this . onReceiveResponse ( message ) ;
281+ // Parse all response types from this message (audio + transcription can coexist)
282+ const responses = parseResponseMessages ( messageData ) ;
283+ for ( const response of responses ) {
284+ this . onReceiveResponse ( response ) ;
285+ }
263286 } catch ( err ) {
264287 console . error ( "Error parsing JSON message:" , err , jsonData ) ;
265288 }
0 commit comments