Skip to content

Commit 8bc190a

Browse files
authored
Merge pull request #5 from google-gemini/thor/gemini-3-eap
feat: Gemini 3.1 Flash Live
2 parents 80fbc26 + 251aa78 commit 8bc190a

File tree

13 files changed

+171
-129
lines changed

13 files changed

+171
-129
lines changed

command-line/node/main.mts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const ai = new GoogleGenAI({});
88
// More information at: https://ai.google.dev/gemini-api/docs/ephemeral-tokens
99

1010
// --- Live API config ---
11-
const model = 'gemini-2.5-flash-native-audio-preview-12-2025';
11+
const model = 'gemini-3.1-flash-live-preview';
1212
const config = {
1313
responseModalities: [Modality.AUDIO],
1414
systemInstruction: "You are a helpful and friendly AI assistant.",

command-line/python/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
pya = pyaudio.PyAudio()
1515

1616
# --- Live API config ---
17-
MODEL = "gemini-2.5-flash-native-audio-preview-12-2025"
17+
MODEL = "gemini-3.1-flash-live-preview"
1818
CONFIG = {
1919
"response_modalities": ["AUDIO"],
2020
"system_instruction": "You are a helpful and friendly AI assistant.",

gemini-live-ephemeral-tokens-websocket/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ class MyTool extends FunctionCallDefinition {
9292

9393
## Configuration Options
9494

95-
- **Model**: `gemini-2.5-flash-native-audio-preview-12-2025` (default)
95+
- **Model**: `gemini-3.1-flash-live-preview` (default)
9696
- **Voice**: Puck, Charon, Kore, Fenrir, Aoede
9797
- **Response**: Audio, text, or both
9898
- **Tools**: Custom functions or Google Search grounding

gemini-live-ephemeral-tokens-websocket/frontend/audio-processors/capture.worklet.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
class AudioCaptureProcessor extends AudioWorkletProcessor {
66
constructor() {
77
super();
8-
this.bufferSize = 4096;
8+
this.bufferSize = 512; // 32ms at 16kHz — per Gemini best practices (20-40ms chunks)
99
this.buffer = new Float32Array(this.bufferSize);
1010
this.bufferIndex = 0;
1111
}

gemini-live-ephemeral-tokens-websocket/frontend/audio-processors/playback.worklet.js

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
11
/**
2-
* Audio Playback Worklet Processor for playing PCM audio
2+
* Audio Playback Worklet Processor for playing PCM audio.
3+
* Uses an offset tracker instead of slice() to avoid allocations
4+
* on the real-time audio thread.
35
*/
46

57
class PCMProcessor extends AudioWorkletProcessor {
68
constructor() {
79
super();
810
this.audioQueue = [];
11+
this.currentOffset = 0; // Track position in current buffer (avoids slice())
912

1013
this.port.onmessage = (event) => {
1114
if (event.data === "interrupt") {
1215
// Clear the queue on interrupt
1316
this.audioQueue = [];
17+
this.currentOffset = 0;
1418
} else if (event.data instanceof Float32Array) {
1519
// Add audio data to the queue
1620
this.audioQueue.push(event.data);
@@ -31,23 +35,23 @@ class PCMProcessor extends AudioWorkletProcessor {
3135

3236
if (!currentBuffer || currentBuffer.length === 0) {
3337
this.audioQueue.shift();
38+
this.currentOffset = 0;
3439
continue;
3540
}
3641

3742
const remainingOutput = channel.length - outputIndex;
38-
const remainingBuffer = currentBuffer.length;
43+
const remainingBuffer = currentBuffer.length - this.currentOffset;
3944
const copyLength = Math.min(remainingOutput, remainingBuffer);
4045

41-
// Copy audio data to output
46+
// Copy audio data to output using offset (no slice allocation)
4247
for (let i = 0; i < copyLength; i++) {
43-
channel[outputIndex++] = currentBuffer[i];
48+
channel[outputIndex++] = currentBuffer[this.currentOffset++];
4449
}
4550

46-
// Update or remove the current buffer
47-
if (copyLength < remainingBuffer) {
48-
this.audioQueue[0] = currentBuffer.slice(copyLength);
49-
} else {
51+
// If we've consumed the entire buffer, move to the next one
52+
if (this.currentOffset >= currentBuffer.length) {
5053
this.audioQueue.shift();
54+
this.currentOffset = 0;
5155
}
5256
}
5357

gemini-live-ephemeral-tokens-websocket/frontend/geminilive.js

Lines changed: 82 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -19,59 +19,82 @@ const MultimodalLiveResponseType = {
1919
/**
2020
* Parses response messages from the Gemini Live API
2121
*/
22-
class MultimodalLiveResponseMessage {
23-
constructor(data) {
24-
this.data = "";
25-
this.type = "";
26-
this.endOfTurn = false;
22+
/**
23+
* Parses ALL response types from a single server message.
24+
* The server can now bundle multiple fields (e.g. audio + transcription)
25+
* in the same message. Returns an array of response objects.
26+
*/
27+
function parseResponseMessages(data) {
28+
const responses = [];
29+
const serverContent = data?.serverContent;
30+
const parts = serverContent?.modelTurn?.parts;
31+
32+
try {
33+
// Setup complete (exclusive — no other fields expected)
34+
if (data?.setupComplete) {
35+
console.log("🏁 SETUP COMPLETE response", data);
36+
responses.push({ type: MultimodalLiveResponseType.SETUP_COMPLETE, data: "", endOfTurn: false });
37+
return responses;
38+
}
2739

28-
console.log("raw message data: ", data);
40+
// Tool call (exclusive)
41+
if (data?.toolCall) {
42+
console.log("🎯 🛠️ TOOL CALL response", data?.toolCall);
43+
responses.push({ type: MultimodalLiveResponseType.TOOL_CALL, data: data.toolCall, endOfTurn: false });
44+
return responses;
45+
}
2946

30-
const serverContent = data?.serverContent;
31-
this.endOfTurn = serverContent?.turnComplete;
32-
const parts = serverContent?.modelTurn?.parts;
47+
// Audio data from model turn parts
48+
if (parts?.length) {
49+
for (const part of parts) {
50+
if (part.inlineData) {
51+
responses.push({ type: MultimodalLiveResponseType.AUDIO, data: part.inlineData.data, endOfTurn: false });
52+
} else if (part.text) {
53+
console.log("💬 TEXT response", part.text);
54+
responses.push({ type: MultimodalLiveResponseType.TEXT, data: part.text, endOfTurn: false });
55+
}
56+
}
57+
}
3358

34-
try {
35-
if (data?.setupComplete) {
36-
console.log("🏁 SETUP COMPLETE response", data);
37-
this.type = MultimodalLiveResponseType.SETUP_COMPLETE;
38-
} else if (serverContent?.turnComplete) {
39-
console.log("🏁 TURN COMPLETE response");
40-
this.type = MultimodalLiveResponseType.TURN_COMPLETE;
41-
} else if (serverContent?.interrupted) {
42-
console.log("🗣️ INTERRUPTED response");
43-
this.type = MultimodalLiveResponseType.INTERRUPTED;
44-
} else if (serverContent?.inputTranscription) {
45-
console.log("📝 INPUT TRANSCRIPTION:", serverContent.inputTranscription);
46-
this.type = MultimodalLiveResponseType.INPUT_TRANSCRIPTION;
47-
this.data = {
59+
// Transcriptions — checked independently, NOT in else-if with audio
60+
if (serverContent?.inputTranscription) {
61+
responses.push({
62+
type: MultimodalLiveResponseType.INPUT_TRANSCRIPTION,
63+
data: {
4864
text: serverContent.inputTranscription.text || "",
4965
finished: serverContent.inputTranscription.finished || false,
50-
};
51-
} else if (serverContent?.outputTranscription) {
52-
console.log("📝 OUTPUT TRANSCRIPTION:", serverContent.outputTranscription);
53-
this.type = MultimodalLiveResponseType.OUTPUT_TRANSCRIPTION;
54-
this.data = {
66+
},
67+
endOfTurn: false,
68+
});
69+
}
70+
71+
if (serverContent?.outputTranscription) {
72+
responses.push({
73+
type: MultimodalLiveResponseType.OUTPUT_TRANSCRIPTION,
74+
data: {
5575
text: serverContent.outputTranscription.text || "",
5676
finished: serverContent.outputTranscription.finished || false,
57-
};
58-
} else if (data?.toolCall) {
59-
console.log("🎯 🛠️ TOOL CALL response", data?.toolCall);
60-
this.type = MultimodalLiveResponseType.TOOL_CALL;
61-
this.data = data?.toolCall;
62-
} else if (parts?.length && parts[0].text) {
63-
console.log("💬 TEXT response", parts[0].text);
64-
this.data = parts[0].text;
65-
this.type = MultimodalLiveResponseType.TEXT;
66-
} else if (parts?.length && parts[0].inlineData) {
67-
console.log("🔊 AUDIO response");
68-
this.data = parts[0].inlineData.data;
69-
this.type = MultimodalLiveResponseType.AUDIO;
70-
}
71-
} catch (err) {
72-
console.log("⚠️ Error parsing response data: ", err, data);
77+
},
78+
endOfTurn: false,
79+
});
7380
}
81+
82+
// Interrupted
83+
if (serverContent?.interrupted) {
84+
console.log("🗣️ INTERRUPTED response");
85+
responses.push({ type: MultimodalLiveResponseType.INTERRUPTED, data: "", endOfTurn: false });
86+
}
87+
88+
// Turn complete
89+
if (serverContent?.turnComplete) {
90+
console.log("🏁 TURN COMPLETE response");
91+
responses.push({ type: MultimodalLiveResponseType.TURN_COMPLETE, data: "", endOfTurn: true });
92+
}
93+
} catch (err) {
94+
console.log("⚠️ Error parsing response data: ", err, data);
7495
}
96+
97+
return responses;
7598
}
7699

77100
/**
@@ -105,7 +128,7 @@ class FunctionCallDefinition {
105128
parameters
106129
)}`
107130
);
108-
this.functionToCall(parameters);
131+
return this.functionToCall(parameters);
109132
}
110133
}
111134

@@ -121,10 +144,8 @@ class GeminiLiveAPI {
121144
this.responseModalities = ["AUDIO"];
122145
this.systemInstructions = "";
123146
this.googleGrounding = false;
124-
this.enableAffectiveDialog = false; // Default affective dialog
125147
this.voiceName = "Puck"; // Default voice
126148
this.temperature = 1.0; // Default temperature
127-
this.proactivity = { proactiveAudio: false }; // Proactivity config
128149
this.inputAudioTranscription = false;
129150
this.outputAudioTranscription = false;
130151
this.enableFunctionCalls = false;
@@ -195,10 +216,7 @@ class GeminiLiveAPI {
195216
this.voiceName = voiceName;
196217
}
197218

198-
setProactivity(proactivity) {
199-
console.log("setting proactivity: ", proactivity);
200-
this.proactivity = proactivity;
201-
}
219+
202220

203221
setInputAudioTranscription(enabled) {
204222
console.log("setting input audio transcription: ", enabled);
@@ -223,7 +241,7 @@ class GeminiLiveAPI {
223241

224242
callFunction(functionName, parameters) {
225243
const functionToCall = this.functionsMap[functionName];
226-
functionToCall.runFunction(parameters);
244+
return functionToCall.runFunction(parameters);
227245
}
228246

229247
connect() {
@@ -238,15 +256,12 @@ class GeminiLiveAPI {
238256
}
239257

240258
sendMessage(message) {
241-
console.log("🟩 Sending message: ", message);
242259
if (this.webSocket && this.webSocket.readyState === WebSocket.OPEN) {
243260
this.webSocket.send(JSON.stringify(message));
244261
}
245262
}
246263

247264
async onReceiveMessage(messageEvent) {
248-
console.log("Message received: ", messageEvent);
249-
250265
let jsonData;
251266
if (messageEvent.data instanceof Blob) {
252267
jsonData = await messageEvent.data.text();
@@ -258,8 +273,11 @@ class GeminiLiveAPI {
258273

259274
try {
260275
const messageData = JSON.parse(jsonData);
261-
const message = new MultimodalLiveResponseMessage(messageData);
262-
this.onReceiveResponse(message);
276+
// Parse all response types from this message (audio + transcription can coexist)
277+
const responses = parseResponseMessages(messageData);
278+
for (const response of responses) {
279+
this.onReceiveResponse(response);
280+
}
263281
} catch (err) {
264282
console.error("Error parsing JSON message:", err, jsonData);
265283
}
@@ -322,8 +340,8 @@ class GeminiLiveAPI {
322340
},
323341
},
324342
systemInstruction: { parts: [{ text: this.systemInstructions }] },
325-
tools: { functionDeclarations: tools },
326-
proactivity: this.proactivity,
343+
tools: [{ functionDeclarations: tools }],
344+
327345

328346
realtimeInputConfig: {
329347
automaticActivityDetection: {
@@ -334,6 +352,7 @@ class GeminiLiveAPI {
334352
startOfSpeechSensitivity: this.automaticActivityDetection.start_of_speech_sensitivity,
335353
},
336354
activityHandling: this.activityHandling,
355+
turnCoverage: "TURN_INCLUDES_ONLY_ACTIVITY",
337356
},
338357
},
339358
};
@@ -347,18 +366,14 @@ class GeminiLiveAPI {
347366
}
348367

349368
if (this.googleGrounding) {
350-
sessionSetupMessage.setup.tools.googleSearch = {};
351369
// Currently can't have both Google Search with custom tools.
352370
console.log(
353371
"Google Grounding enabled, removing custom function calls if any."
354372
);
355-
delete sessionSetupMessage.setup.tools.functionDeclarations;
373+
sessionSetupMessage.setup.tools = [{ googleSearch: {} }];
356374
}
357375

358-
// Add affective dialog if enabled
359-
if (this.enableAffectiveDialog) {
360-
sessionSetupMessage.setup.generationConfig.enableAffectiveDialog = true;
361-
}
376+
362377

363378
// Store the setup message for later access
364379
this.lastSetupMessage = sessionSetupMessage;
@@ -376,11 +391,10 @@ class GeminiLiveAPI {
376391
this.sendMessage(message);
377392
}
378393

379-
sendToolResponse(toolCallId, response) {
394+
sendToolResponse(functionResponses) {
380395
const message = {
381396
toolResponse: {
382-
id: toolCallId,
383-
response: response,
397+
functionResponses: functionResponses,
384398
},
385399
};
386400
console.log("🔧 Sending tool response:", message);

gemini-live-ephemeral-tokens-websocket/frontend/index.html

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,7 @@ <h2>API Configuration</h2>
8585

8686
<div>
8787
<label for="model">Model ID:</label><br />
88-
<input type="text" id="model" value="gemini-2.5-flash-native-audio-preview-12-2025"
89-
placeholder="Enter model ID" />
88+
<input type="text" id="model" value="gemini-3.1-flash-live-preview" placeholder="Enter model ID" />
9089
</div>
9190
</details>
9291

@@ -119,24 +118,12 @@ <h2>API Configuration</h2>
119118
Lower = more predictable/focused</small>
120119
</div>
121120

122-
<div>
123-
<input type="checkbox" id="enableProactiveAudio" checked />
124-
<label for="enableProactiveAudio">Enable proactive audio (Gemini will ignore speech based on
125-
instructions)</label>
126-
</div>
127-
128121
<div>
129122
<input type="checkbox" id="enableGrounding" />
130123
<label for="enableGrounding">Enable Google grounding (Enabling Google grounding will disable
131124
custom tools)
132125
</label>
133126
</div>
134-
135-
<div>
136-
<input type="checkbox" id="enableAffectiveDialog" checked />
137-
<label for="enableAffectiveDialog">Enable affective dialog (emotion detection and empathetic
138-
responses)</label>
139-
</div>
140127
</details>
141128

142129
<!-- Custom Tools -->

gemini-live-ephemeral-tokens-websocket/frontend/mediaUtils.js

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -462,10 +462,11 @@ class AudioPlayer {
462462
await this.audioContext.resume();
463463
}
464464

465-
// Convert base64 to Float32Array
465+
// Efficient base64 → binary decode
466466
const binaryString = atob(base64Audio);
467-
const bytes = new Uint8Array(binaryString.length);
468-
for (let i = 0; i < binaryString.length; i++) {
467+
const len = binaryString.length;
468+
const bytes = new Uint8Array(len);
469+
for (let i = 0; i < len; i++) {
469470
bytes[i] = binaryString.charCodeAt(i);
470471
}
471472

0 commit comments

Comments (0)