From c50c69dc1112ac1a07b603c4c56a3eec3f840ba6 Mon Sep 17 00:00:00 2001 From: STC Date: Wed, 20 May 2026 23:02:54 +0900 Subject: [PATCH] Support gemini-3.1-flash-live and gpt-realtime-2 --- contributed/conversationalAI/assets.js | 6 +++-- .../network/services/chatAudioIO/readme.md | 2 +- .../workers/googleGeminiLiveModel.js | 23 +++++++++++++------ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/contributed/conversationalAI/assets.js b/contributed/conversationalAI/assets.js index 0ab6fa796..9544714fe 100644 --- a/contributed/conversationalAI/assets.js +++ b/contributed/conversationalAI/assets.js @@ -262,7 +262,8 @@ const assets = { providers: [ { name:"Google", id:"GOOGLE", models: [ - { name:"Gemini 2.5 Flash Live", id:"gemini-2.5-flash-native-audio-preview-12-2025" } + { name:"Gemini 2.5 Flash Live", id:"gemini-2.5-flash-native-audio-preview-12-2025" }, + { name:"Gemini 3.1 Flash Live", id:"gemini-3.1-flash-live-preview" } ], } ], @@ -435,7 +436,8 @@ const assets = { { name:"OpenAI", id:"OPEN_AI", models: [ { name:"gpt-realtime", id:"gpt-realtime" }, - { name:"gpt-realtime-mini", id:"gpt-realtime-mini" } + { name:"gpt-realtime-mini", id:"gpt-realtime-mini" }, + { name:"gpt-realtime-2", id:"gpt-realtime-2" }, ], } ], diff --git a/modules/network/services/chatAudioIO/readme.md b/modules/network/services/chatAudioIO/readme.md index 66c524220..16e43c2c8 100644 --- a/modules/network/services/chatAudioIO/readme.md +++ b/modules/network/services/chatAudioIO/readme.md @@ -16,7 +16,7 @@ The conversation library implements support for various AI cloud services using - [Google Gemini Live](https://ai.google.dev/api/multimodal-live) - [Hume Empathic Voice Interface](https://dev.hume.ai/docs/empathic-voice-interface-evi/overview) - [Eleven Labs Conversational AI](https://elevenlabs.io/docs/conversational-ai/overview) -- [Deepgram Voice Agent](https://elevenlabs.io/docs/conversational-ai/overview) +- [Deepgram Voice Agent](https://deepgram.com/product/voice-agent-api) ## Programming Interface diff --git a/modules/network/services/chatAudioIO/workers/googleGeminiLiveModel.js b/modules/network/services/chatAudioIO/workers/googleGeminiLiveModel.js index c0e4291d6..ea918502d 100644 --- a/modules/network/services/chatAudioIO/workers/googleGeminiLiveModel.js +++ b/modules/network/services/chatAudioIO/workers/googleGeminiLiveModel.js @@ -21,16 +21,16 @@ import config from "mc/config" import ChatWebSocketWorker from "ChatWebSocketWorker"; -const audioPrefix = Object.freeze(new Uint8Array(ArrayBuffer.fromString('{"realtimeInput":{"mediaChunks":[{"mimeType":"audio/pcm;rate=24000","data":"')), true); -const audioSuffix = Object.freeze(new Uint8Array(ArrayBuffer.fromString('"}]}}')), true); +const audioPrefixOld = Object.freeze(new Uint8Array(ArrayBuffer.fromString('{"realtimeInput":{"mediaChunks":[{"mimeType":"audio/pcm;rate=24000","data":"')), true); +const audioSuffixOld = Object.freeze(new Uint8Array(ArrayBuffer.fromString('"}]}}')), true); +const audioPrefix = Object.freeze(new Uint8Array(ArrayBuffer.fromString('{"realtimeInput":{"audio":{"mimeType":"audio/pcm;rate=24000","data":"')), true); +const audioSuffix = Object.freeze(new Uint8Array(ArrayBuffer.fromString('"}}}')), true); export default class GoogleGeminiLiveModel extends ChatWebSocketWorker { constructor(options) { super(options); this.host = "generativelanguage.googleapis.com"; this.headers = null; - this.audioPrefix = audioPrefix; - this.audioSuffix = audioSuffix; this.speaking = true; } configure(message) { @@ -39,6 +39,15 @@ export default class GoogleGeminiLiveModel extends ChatWebSocketWorker { const voiceName = message.voiceID ?? "aoede"; const model = message.modelID ?? "gemini-2.5-flash-native-audio-preview-12-2025"; const apiKey = message.apiKey ?? config.geminiAPIKey; + + if(model.includes("gemini-2")) { + this.audioPrefix = audioPrefixOld; + this.audioSuffix = audioSuffixOld; + } else { + this.audioPrefix = audioPrefix; + this.audioSuffix = audioSuffix; + } + this.path = `/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent?key=${apiKey}`; this.setup = { model: `models/${model}`, @@ -63,7 +72,7 @@ export default class GoogleGeminiLiveModel extends ChatWebSocketWorker { }; } isBase64(result, current, name) { - return (current?.mimeType == "audio/pcm;rate=24000") && (name == "data"); + return (current?.mimeType === "audio/pcm;rate=24000") && (name === "data"); } onJSON(json) { for (let key in json) { @@ -113,7 +122,7 @@ export default class GoogleGeminiLiveModel extends ChatWebSocketWorker { 'serverContent'(data) { const parts = data.modelTurn?.parts; if (parts) { - const part = parts.find(part => part.inlineData?.mimeType == "audio/pcm;rate=24000"); + const part = parts.find(part => part.inlineData?.mimeType === "audio/pcm;rate=24000"); if (part) { if (this.speaking) { this.postMessage({ id:"receiveInputText", text:"" }); @@ -145,7 +154,7 @@ export default class GoogleGeminiLiveModel extends ChatWebSocketWorker { const functionCalls = data.functionCalls; if (functionCalls) { this.post("listen"); - for (let functionCall of functionCalls) { + for (const functionCall of functionCalls) { this.postMessage({ id:"receiveFunctionCall", call:functionCall.id,