Commit 239adea

Initial examples.
1 parent 7c1c5ca commit 239adea

27 files changed

Lines changed: 4751 additions & 0 deletions

README.md

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Gemini Live API Examples

The Live API enables low-latency, real-time voice and video interactions with
Gemini. It processes continuous streams of audio, video, or text to deliver
immediate, human-like spoken responses, creating a natural conversational
experience for your users.

![Live API Overview](https://ai.google.dev/gemini-api/docs/images/live-api-overview.png)

[Try the Live API in Google AI Studio](https://aistudio.google.com/live)

## Example use cases

Live API can be used to build real-time voice and video agents for a
variety of industries, including:

* **E-commerce and retail:** Shopping assistants that offer personalized
  recommendations and support agents that resolve customer issues.
* **Gaming:** Interactive non-player characters (NPCs), in-game help
  assistants, and real-time translation of in-game content.
* **Next-gen interfaces:** Voice- and video-enabled experiences in robotics,
  smart glasses, and vehicles.
* **Healthcare:** Health companions for patient support and education.
* **Financial services:** AI advisors for wealth management and investment
  guidance.
* **Education:** AI mentors and learner companions that provide personalized
  instruction and feedback.

## Key features

Live API offers a comprehensive set of features for building
robust voice and video agents:

* [**Multilingual support**](https://ai.google.dev/gemini-api/docs/live-guide#supported-languages):
  Converse in 70 supported languages.
* [**Barge-in**](https://ai.google.dev/gemini-api/docs/live-guide#interruptions):
  Users can interrupt the model at any time for responsive interactions.
* [**Tool use**](https://ai.google.dev/gemini-api/docs/live-tools):
  Integrates tools like function calling and Google Search for dynamic
  interactions.
* [**Audio transcriptions**](https://ai.google.dev/gemini-api/docs/live-guide#audio-transcription):
  Provides text transcripts of both user input and model output.
* [**Proactive audio**](https://ai.google.dev/gemini-api/docs/live-guide#proactive-audio):
  Lets you control when the model responds and in what contexts.
* [**Affective dialog**](https://ai.google.dev/gemini-api/docs/live-guide#affective-dialog):
  Adapts response style and tone to match the user's input expression.

## Technical specifications

The following table outlines the technical specifications for the
Live API:

| Category          | Details                                                                               |
| :---------------- | :------------------------------------------------------------------------------------ |
| Input modalities  | Audio (raw 16-bit PCM audio, 16kHz, little-endian), images/video (JPEG <= 1FPS), text |
| Output modalities | Audio (raw 16-bit PCM audio, 24kHz, little-endian), text                              |
| Protocol          | Stateful WebSocket connection (WSS)                                                   |

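The input audio format drives most of the client-side work: browser capture APIs produce Float32 samples in the range -1..1, while the API expects raw 16-bit little-endian PCM at 16 kHz. A minimal conversion sketch (the function name is illustrative; resampling to 16 kHz is assumed to happen upstream):

```javascript
// Convert Float32 samples (-1..1) into 16-bit little-endian PCM bytes,
// the audio input format listed in the table above. Assumes the samples
// are already at 16 kHz; resampling is out of scope here.
function floatTo16BitPCM(float32Samples) {
  const buffer = new ArrayBuffer(float32Samples.length * 2);
  const view = new DataView(buffer);
  for (let i = 0; i < float32Samples.length; i++) {
    // Clamp, then scale to the signed 16-bit range.
    const s = Math.max(-1, Math.min(1, float32Samples[i]));
    view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true); // true = little-endian
  }
  return new Uint8Array(buffer);
}
```

Base64-encoding the resulting bytes produces a chunk ready to send over the WebSocket as realtime audio input.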
## Examples

* **[Gen AI SDK Python example](./gemini-live-genai-python-sdk/README.md)**: Recommended for ease of use. Connect to the Gemini Live API using the Gen AI SDK to build a real-time multimodal application with a Python backend.
* **[Ephemeral tokens and raw WebSocket example](./gemini-live-ephemeral-tokens-websocket/README.md)**: Raw protocol control. Connect to the Gemini Live API using WebSockets to build a real-time multimodal application with a JavaScript frontend and a Python backend.

## Partner integrations

To streamline the development of real-time audio and video apps, you can use
a third-party integration that supports the Gemini Live
API over WebRTC or WebSockets.

* [LiveKit](https://docs.livekit.io/agents/models/realtime/plugins/gemini/): Use the Gemini Live API with LiveKit Agents.
* [Pipecat by Daily](https://docs.pipecat.ai/guides/features/gemini-live): Create a real-time AI chatbot using Gemini Live and Pipecat.
* [Fishjam by Software Mansion](https://docs.fishjam.io/tutorials/gemini-live-integration): Create live video and audio streaming applications with Fishjam.
* [Vision Agents by Stream](https://visionagents.ai/integrations/gemini): Build real-time voice and video AI applications with Vision Agents.
* [Voximplant](https://voximplant.com/products/gemini-client): Connect inbound and outbound calls to Live API with Voximplant.
* [Agent Development Kit (ADK)](https://google.github.io/adk-docs/streaming/): Create an agent and use the Agent Development Kit (ADK) Streaming to enable voice and video communication.
* [Firebase AI SDK](https://firebase.google.com/docs/ai-logic/live-api?api=dev): Get started with the Gemini Live API using Firebase AI Logic.
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
GEMINI_API_KEY=
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
__pycache__
.venv
.env
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
# Gemini Live API - Vanilla JS

WebSocket client for Google's Gemini Live API with audio/video streaming support using **Ephemeral Tokens**. No frameworks, just vanilla JavaScript.

## Quick Start

```bash
# 1. Install uv (if not already installed)
# curl -LsSf https://astral.sh/uv/install.sh | sh

# 2. Create a virtual environment and sync dependencies
uv venv
source .venv/bin/activate  # On Windows: .venv\Scripts\activate
uv pip install -r requirements.txt

# 3. Set your API key in a .env file
echo "GEMINI_API_KEY=your_actual_api_key_here" > .env

# 4. Start the server
uv run server.py

# 5. Open the app in a browser
open http://localhost:8000
```

## Features

- **Direct client-to-server connection**: Low-latency WebSocket connection directly to the Gemini Live API.
- **Ephemeral Tokens**: Improved security through short-lived tokens generated by the backend.
- **Real-time audio/video streaming**: High-performance streaming using standard Web APIs.
- **Custom tools**: Example implementations for browser alerts and CSS injection.
- **Device selection**: Full control over microphone and camera inputs.

## Project Structure

```
/
├── server.py            # Token provisioning + HTTP server
├── .env                 # API key configuration
├── requirements.txt     # Python dependencies
└── frontend/
    ├── index.html       # UI
    ├── geminilive.js    # Gemini API client (direct connection)
    ├── mediaUtils.js    # Audio/video streaming logic
    ├── tools.js         # Custom tool definitions
    └── script.js        # Application workflow
```

## Core APIs

### GeminiLive Client

```javascript
// Connect using an ephemeral token
const client = new GeminiLiveAPI(token, model);
client.addFunction(toolInstance);  // Add custom tools
await client.connect();            // Establish a direct WebSocket connection
client.sendTextMessage("Hello");   // Send text
client.sendAudioMessage(base64);   // Send audio
client.sendImageMessage(base64);   // Send an image
```

### Media Streaming

```javascript
// Audio streaming
const audioStreamer = new AudioStreamer(client);
await audioStreamer.start(deviceId);

// Video streaming
const videoStreamer = new VideoStreamer(client);
await videoStreamer.start({ fps: 1, width: 640, height: 480 });

// Audio playback
const player = new AudioPlayer();
await player.play(base64PCM);
```

### Custom Tools

```javascript
class MyTool extends FunctionCallDefinition {
  constructor() {
    super("tool_name", "description", parameters, required);
  }

  functionToCall(params) {
    // Tool implementation logic
  }
}
```
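As a concrete (hypothetical) instance of this pattern, here is a browser-alert tool like the one mentioned in the features list. The small `FunctionCallDefinition` stub is an assumption standing in for the real base class defined in `frontend/geminilive.js`:

```javascript
// Minimal stand-in for the real base class in frontend/geminilive.js
// (assumed shape: it simply records the declaration fields).
class FunctionCallDefinition {
  constructor(name, description, parameters, required) {
    this.name = name;
    this.description = description;
    this.parameters = parameters;
    this.required = required;
  }
}

// Hypothetical concrete tool: surfaces a message via a browser alert.
class ShowAlertTool extends FunctionCallDefinition {
  constructor() {
    super(
      "show_alert",
      "Show a message to the user in a browser alert box.",
      { message: { type: "string", description: "Text to display" } },
      ["message"],
    );
  }

  functionToCall(params) {
    // In the browser this would call alert(params.message);
    // return a result the model can read back.
    return { status: "shown", message: params.message };
  }
}
```

Registered via `client.addFunction(new ShowAlertTool())`, the model can then trigger `show_alert` during a live session.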
## Configuration Options

- **Model**: `gemini-2.5-flash-native-audio-preview-12-2025` (default)
- **Voice**: Puck, Charon, Kore, Fenrir, Aoede
- **Response**: Audio, text, or both
- **Tools**: Custom functions or Google Search grounding

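These options typically end up in the session setup message sent as the first frame over the WebSocket. A sketch of that mapping — the field names follow the public BidiGenerateContent schema, but treat the exact shape as an assumption and cross-check against `frontend/geminilive.js`:

```javascript
// Build a Live API setup message from the configuration options above.
// Field names follow the BidiGenerateContent WebSocket schema (assumed).
function buildSetupMessage({ model, voice, modalities }) {
  return {
    setup: {
      model: `models/${model}`,
      generationConfig: {
        responseModalities: modalities, // e.g. ["AUDIO"] or ["TEXT"]
        speechConfig: {
          voiceConfig: { prebuiltVoiceConfig: { voiceName: voice } },
        },
      },
    },
  };
}

// Serialized form of the first frame sent after the WebSocket opens.
const setupJSON = JSON.stringify(
  buildSetupMessage({
    model: "gemini-2.5-flash-native-audio-preview-12-2025",
    voice: "Puck",
    modalities: ["AUDIO"],
  }),
);
```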
## Security & Architecture

This demo uses the **Ephemeral Token** approach:

1. **Backend**: Uses `GEMINI_API_KEY` to request a short-lived (ephemeral) token via the `google-genai` SDK.
2. **Frontend**: Fetches this token from the backend `/api/token` endpoint.
3. **Direct Connection**: The browser establishes a WebSocket connection directly to `generativelanguage.googleapis.com` using the token, bypassing the proxy for data streaming.
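Step 2 of this flow is small enough to sketch. The `/api/token` endpoint comes from the description above; the `{ token: ... }` response shape is an assumption about what `server.py` returns:

```javascript
// Fetch a short-lived token from the backend before opening the direct
// WebSocket connection. The fetch implementation is injectable for testing.
async function fetchEphemeralToken(fetchImpl = globalThis.fetch) {
  const res = await fetchImpl("/api/token");
  if (!res.ok) {
    throw new Error(`Token request failed: ${res.status}`);
  }
  const body = await res.json(); // assumed shape: { token: "..." }
  return body.token;
}
```

The ephemeral token then replaces the long-lived API key in the browser, so `GEMINI_API_KEY` never leaves the backend.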
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
/**
 * Audio Worklet Processor for capturing and processing audio
 */

class AudioCaptureProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.bufferSize = 4096;
    this.buffer = new Float32Array(this.bufferSize);
    this.bufferIndex = 0;
  }

  process(inputs, outputs, parameters) {
    const input = inputs[0];

    if (input && input.length > 0) {
      const inputChannel = input[0];

      // Buffer the incoming audio
      for (let i = 0; i < inputChannel.length; i++) {
        this.buffer[this.bufferIndex++] = inputChannel[i];

        // When the buffer is full, send it to the main thread
        if (this.bufferIndex >= this.bufferSize) {
          this.port.postMessage({
            type: "audio",
            data: this.buffer.slice(),
          });

          // Reset the buffer
          this.bufferIndex = 0;
        }
      }
    }

    // Return true to keep the processor alive
    return true;
  }
}

// Register the processor
registerProcessor("audio-capture-processor", AudioCaptureProcessor);
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
/**
 * Audio Playback Worklet Processor for playing PCM audio
 */

class PCMProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.audioQueue = [];

    this.port.onmessage = (event) => {
      if (event.data === "interrupt") {
        // Clear the queue on interrupt
        this.audioQueue = [];
      } else if (event.data instanceof Float32Array) {
        // Add audio data to the queue
        this.audioQueue.push(event.data);
      }
    };
  }

  process(inputs, outputs, parameters) {
    const output = outputs[0];
    if (output.length === 0) return true;

    const channel = output[0];
    let outputIndex = 0;

    // Fill the output buffer from the queue
    while (outputIndex < channel.length && this.audioQueue.length > 0) {
      const currentBuffer = this.audioQueue[0];

      if (!currentBuffer || currentBuffer.length === 0) {
        this.audioQueue.shift();
        continue;
      }

      const remainingOutput = channel.length - outputIndex;
      const remainingBuffer = currentBuffer.length;
      const copyLength = Math.min(remainingOutput, remainingBuffer);

      // Copy audio data to the output
      for (let i = 0; i < copyLength; i++) {
        channel[outputIndex++] = currentBuffer[i];
      }

      // Update or remove the current buffer
      if (copyLength < remainingBuffer) {
        this.audioQueue[0] = currentBuffer.slice(copyLength);
      } else {
        this.audioQueue.shift();
      }
    }

    // Fill the remaining output with silence
    while (outputIndex < channel.length) {
      channel[outputIndex++] = 0;
    }

    return true;
  }
}

registerProcessor("pcm-processor", PCMProcessor);
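The processor above consumes `Float32Array` chunks, but the Live API delivers its 24 kHz audio output as base64-encoded 16-bit little-endian PCM. A decoding sketch bridging the two (the function name is illustrative):

```javascript
// Decode base64 16-bit little-endian PCM into Float32 samples (-1..1),
// ready to post to the pcm-processor's playback queue.
function base64PCM16ToFloat32(base64) {
  const binary = atob(base64); // one character per byte
  const out = new Float32Array(binary.length / 2);
  for (let i = 0; i < out.length; i++) {
    // Reassemble a little-endian signed 16-bit sample from two bytes.
    let sample = binary.charCodeAt(i * 2) | (binary.charCodeAt(i * 2 + 1) << 8);
    if (sample >= 0x8000) sample -= 0x10000;
    out[i] = sample / 0x8000; // normalize to -1..1
  }
  return out;
}
```

Posting the result to the worklet (e.g. `workletNode.port.postMessage(base64PCM16ToFloat32(chunk))`, where `workletNode` is a hypothetical `AudioWorkletNode` wired to `pcm-processor`) queues the audio for playback.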
