Skip to content

Commit 21949a5

Browse files
committed
refactor: optimize audio processing and enhance API response parsing to support multiple types per message.
1 parent 0d213e6 commit 21949a5

5 files changed

Lines changed: 89 additions & 64 deletions

File tree

gemini-live-ephemeral-tokens-websocket/frontend/audio-processors/capture.worklet.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
class AudioCaptureProcessor extends AudioWorkletProcessor {
66
constructor() {
77
super();
8-
this.bufferSize = 4096;
8+
this.bufferSize = 512; // 32ms at 16kHz — per Gemini best practices (20-40ms chunks)
99
this.buffer = new Float32Array(this.bufferSize);
1010
this.bufferIndex = 0;
1111
}

gemini-live-ephemeral-tokens-websocket/frontend/audio-processors/playback.worklet.js

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,20 @@
11
/**
2-
* Audio Playback Worklet Processor for playing PCM audio
2+
* Audio Playback Worklet Processor for playing PCM audio.
3+
* Uses an offset tracker instead of slice() to avoid allocations
4+
* on the real-time audio thread.
35
*/
46

57
class PCMProcessor extends AudioWorkletProcessor {
68
constructor() {
79
super();
810
this.audioQueue = [];
11+
this.currentOffset = 0; // Track position in current buffer (avoids slice())
912

1013
this.port.onmessage = (event) => {
1114
if (event.data === "interrupt") {
1215
// Clear the queue on interrupt
1316
this.audioQueue = [];
17+
this.currentOffset = 0;
1418
} else if (event.data instanceof Float32Array) {
1519
// Add audio data to the queue
1620
this.audioQueue.push(event.data);
@@ -31,23 +35,23 @@ class PCMProcessor extends AudioWorkletProcessor {
3135

3236
if (!currentBuffer || currentBuffer.length === 0) {
3337
this.audioQueue.shift();
38+
this.currentOffset = 0;
3439
continue;
3540
}
3641

3742
const remainingOutput = channel.length - outputIndex;
38-
const remainingBuffer = currentBuffer.length;
43+
const remainingBuffer = currentBuffer.length - this.currentOffset;
3944
const copyLength = Math.min(remainingOutput, remainingBuffer);
4045

41-
// Copy audio data to output
46+
// Copy audio data to output using offset (no slice allocation)
4247
for (let i = 0; i < copyLength; i++) {
43-
channel[outputIndex++] = currentBuffer[i];
48+
channel[outputIndex++] = currentBuffer[this.currentOffset++];
4449
}
4550

46-
// Update or remove the current buffer
47-
if (copyLength < remainingBuffer) {
48-
this.audioQueue[0] = currentBuffer.slice(copyLength);
49-
} else {
51+
// If we've consumed the entire buffer, move to the next one
52+
if (this.currentOffset >= currentBuffer.length) {
5053
this.audioQueue.shift();
54+
this.currentOffset = 0;
5155
}
5256
}
5357

gemini-live-ephemeral-tokens-websocket/frontend/geminilive.js

Lines changed: 72 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -19,59 +19,82 @@ const MultimodalLiveResponseType = {
1919
/**
2020
* Parses response messages from the Gemini Live API
2121
*/
22-
class MultimodalLiveResponseMessage {
23-
constructor(data) {
24-
this.data = "";
25-
this.type = "";
26-
this.endOfTurn = false;
22+
/**
23+
* Parses ALL response types from a single server message.
24+
* The server can now bundle multiple fields (e.g. audio + transcription)
25+
* in the same message. Returns an array of response objects.
26+
*/
27+
function parseResponseMessages(data) {
28+
const responses = [];
29+
const serverContent = data?.serverContent;
30+
const parts = serverContent?.modelTurn?.parts;
31+
32+
try {
33+
// Setup complete (exclusive — no other fields expected)
34+
if (data?.setupComplete) {
35+
console.log("🏁 SETUP COMPLETE response", data);
36+
responses.push({ type: MultimodalLiveResponseType.SETUP_COMPLETE, data: "", endOfTurn: false });
37+
return responses;
38+
}
2739

28-
console.log("raw message data: ", data);
40+
// Tool call (exclusive)
41+
if (data?.toolCall) {
42+
console.log("🎯 🛠️ TOOL CALL response", data?.toolCall);
43+
responses.push({ type: MultimodalLiveResponseType.TOOL_CALL, data: data.toolCall, endOfTurn: false });
44+
return responses;
45+
}
2946

30-
const serverContent = data?.serverContent;
31-
this.endOfTurn = serverContent?.turnComplete;
32-
const parts = serverContent?.modelTurn?.parts;
47+
// Audio data from model turn parts
48+
if (parts?.length) {
49+
for (const part of parts) {
50+
if (part.inlineData) {
51+
responses.push({ type: MultimodalLiveResponseType.AUDIO, data: part.inlineData.data, endOfTurn: false });
52+
} else if (part.text) {
53+
console.log("💬 TEXT response", part.text);
54+
responses.push({ type: MultimodalLiveResponseType.TEXT, data: part.text, endOfTurn: false });
55+
}
56+
}
57+
}
3358

34-
try {
35-
if (data?.setupComplete) {
36-
console.log("🏁 SETUP COMPLETE response", data);
37-
this.type = MultimodalLiveResponseType.SETUP_COMPLETE;
38-
} else if (serverContent?.turnComplete) {
39-
console.log("🏁 TURN COMPLETE response");
40-
this.type = MultimodalLiveResponseType.TURN_COMPLETE;
41-
} else if (serverContent?.interrupted) {
42-
console.log("🗣️ INTERRUPTED response");
43-
this.type = MultimodalLiveResponseType.INTERRUPTED;
44-
} else if (serverContent?.inputTranscription) {
45-
console.log("📝 INPUT TRANSCRIPTION:", serverContent.inputTranscription);
46-
this.type = MultimodalLiveResponseType.INPUT_TRANSCRIPTION;
47-
this.data = {
59+
// Transcriptions — checked independently, NOT in else-if with audio
60+
if (serverContent?.inputTranscription) {
61+
responses.push({
62+
type: MultimodalLiveResponseType.INPUT_TRANSCRIPTION,
63+
data: {
4864
text: serverContent.inputTranscription.text || "",
4965
finished: serverContent.inputTranscription.finished || false,
50-
};
51-
} else if (serverContent?.outputTranscription) {
52-
console.log("📝 OUTPUT TRANSCRIPTION:", serverContent.outputTranscription);
53-
this.type = MultimodalLiveResponseType.OUTPUT_TRANSCRIPTION;
54-
this.data = {
66+
},
67+
endOfTurn: false,
68+
});
69+
}
70+
71+
if (serverContent?.outputTranscription) {
72+
responses.push({
73+
type: MultimodalLiveResponseType.OUTPUT_TRANSCRIPTION,
74+
data: {
5575
text: serverContent.outputTranscription.text || "",
5676
finished: serverContent.outputTranscription.finished || false,
57-
};
58-
} else if (data?.toolCall) {
59-
console.log("🎯 🛠️ TOOL CALL response", data?.toolCall);
60-
this.type = MultimodalLiveResponseType.TOOL_CALL;
61-
this.data = data?.toolCall;
62-
} else if (parts?.length && parts[0].text) {
63-
console.log("💬 TEXT response", parts[0].text);
64-
this.data = parts[0].text;
65-
this.type = MultimodalLiveResponseType.TEXT;
66-
} else if (parts?.length && parts[0].inlineData) {
67-
console.log("🔊 AUDIO response");
68-
this.data = parts[0].inlineData.data;
69-
this.type = MultimodalLiveResponseType.AUDIO;
70-
}
71-
} catch (err) {
72-
console.log("⚠️ Error parsing response data: ", err, data);
77+
},
78+
endOfTurn: false,
79+
});
80+
}
81+
82+
// Interrupted
83+
if (serverContent?.interrupted) {
84+
console.log("🗣️ INTERRUPTED response");
85+
responses.push({ type: MultimodalLiveResponseType.INTERRUPTED, data: "", endOfTurn: false });
7386
}
87+
88+
// Turn complete
89+
if (serverContent?.turnComplete) {
90+
console.log("🏁 TURN COMPLETE response");
91+
responses.push({ type: MultimodalLiveResponseType.TURN_COMPLETE, data: "", endOfTurn: true });
92+
}
93+
} catch (err) {
94+
console.log("⚠️ Error parsing response data: ", err, data);
7495
}
96+
97+
return responses;
7598
}
7699

77100
/**
@@ -238,15 +261,12 @@ class GeminiLiveAPI {
238261
}
239262

240263
sendMessage(message) {
241-
console.log("🟩 Sending message: ", message);
242264
if (this.webSocket && this.webSocket.readyState === WebSocket.OPEN) {
243265
this.webSocket.send(JSON.stringify(message));
244266
}
245267
}
246268

247269
async onReceiveMessage(messageEvent) {
248-
console.log("Message received: ", messageEvent);
249-
250270
let jsonData;
251271
if (messageEvent.data instanceof Blob) {
252272
jsonData = await messageEvent.data.text();
@@ -258,8 +278,11 @@ class GeminiLiveAPI {
258278

259279
try {
260280
const messageData = JSON.parse(jsonData);
261-
const message = new MultimodalLiveResponseMessage(messageData);
262-
this.onReceiveResponse(message);
281+
// Parse all response types from this message (audio + transcription can coexist)
282+
const responses = parseResponseMessages(messageData);
283+
for (const response of responses) {
284+
this.onReceiveResponse(response);
285+
}
263286
} catch (err) {
264287
console.error("Error parsing JSON message:", err, jsonData);
265288
}

gemini-live-ephemeral-tokens-websocket/frontend/mediaUtils.js

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -462,10 +462,11 @@ class AudioPlayer {
462462
await this.audioContext.resume();
463463
}
464464

465-
// Convert base64 to Float32Array
465+
// Efficient base64 → binary decode
466466
const binaryString = atob(base64Audio);
467-
const bytes = new Uint8Array(binaryString.length);
468-
for (let i = 0; i < binaryString.length; i++) {
467+
const len = binaryString.length;
468+
const bytes = new Uint8Array(len);
469+
for (let i = 0; i < len; i++) {
469470
bytes[i] = binaryString.charCodeAt(i);
470471
}
471472

gemini-live-ephemeral-tokens-websocket/frontend/script.js

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,17 +234,14 @@ function disconnect() {
234234

235235
// Handle messages
236236
function handleMessage(message) {
237-
console.log("Message:", message);
238237
updateStatus("debugInfo", `Message: ${message.type}`);
239238

240239
switch (message.type) {
241240
case MultimodalLiveResponseType.TEXT:
242-
console.log("Text message:");
243241
addMessage(message.data, "assistant");
244242
break;
245243

246244
case MultimodalLiveResponseType.AUDIO:
247-
console.log("Audio message:");
248245
if (state.audio.player) {
249246
state.audio.player.play(message.data);
250247
}

0 commit comments

Comments (0)