Skip to content

Commit 9f2a676

Browse files
committed
Add input/output transcription to live audio example
Enable text transcription alongside audio streaming. Output transcription is printed inline; input transcription is printed in italics. A newline is written whenever the transcript switches between input and output, and after sentence-ending punctuation. Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
1 parent 0a2e0f0 commit 9f2a676

File tree

2 files changed

+44
-6
lines changed

2 files changed

+44
-6
lines changed

command-line/node/main.mts

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ const model = 'gemini-3.1-flash-live-preview';
1212
const config = {
1313
responseModalities: [Modality.AUDIO],
1414
systemInstruction: "You are a helpful and friendly AI assistant.",
15+
outputAudioTranscription: {},
16+
inputAudioTranscription: {},
1517
};
1618

1719
async function live() {
@@ -40,22 +42,36 @@ async function live() {
4042
process.stdin.pipe(speaker);
4143
}
4244

45+
let lastWasInput = false;
46+
4347
async function messageLoop() {
4448
// Puts incoming messages in the audio queue.
4549
while (true) {
4650
const message = await waitMessage();
47-
if (message.serverContent && message.serverContent.interrupted) {
51+
const sc = message.serverContent;
52+
if (!sc) continue;
53+
if (sc.interrupted) {
4854
// Empty the queue on interruption to stop playback
4955
audioQueue.length = 0;
5056
continue;
5157
}
52-
if (message.serverContent && message.serverContent.modelTurn && message.serverContent.modelTurn.parts) {
53-
for (const part of message.serverContent.modelTurn.parts) {
54-
if (part.inlineData && part.inlineData.data) {
58+
if (sc.modelTurn?.parts) {
59+
for (const part of sc.modelTurn.parts) {
60+
if (part.inlineData?.data) {
5561
audioQueue.push(Buffer.from(part.inlineData.data, 'base64'));
5662
}
5763
}
5864
}
65+
if (sc.outputTranscription?.text) {
66+
if (lastWasInput) { process.stdout.write('\n'); lastWasInput = false; }
67+
const t = sc.outputTranscription.text;
68+
process.stdout.write(t);
69+
if (/[.!?]\s*$/.test(t)) process.stdout.write('\n');
70+
}
71+
if (sc.inputTranscription?.text) {
72+
if (!lastWasInput) { process.stdout.write('\n'); lastWasInput = true; }
73+
process.stdout.write(`\x1b[3m${sc.inputTranscription.text}\x1b[0m`);
74+
}
5975
}
6076
}
6177

command-line/python/main.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
CONFIG = {
1919
"response_modalities": ["AUDIO"],
2020
"system_instruction": "You are a helpful and friendly AI assistant.",
21+
"output_audio_transcription": {},
22+
"input_audio_transcription": {},
2123
}
2224

2325
audio_queue_output = asyncio.Queue()
@@ -50,13 +52,33 @@ async def send_realtime(session):
5052

5153
async def receive_audio(session):
5254
"""Receives responses from GenAI and puts audio data into the speaker audio queue."""
55+
last_was_input = False
5356
while True:
5457
turn = session.receive()
5558
async for response in turn:
56-
if (response.server_content and response.server_content.model_turn):
57-
for part in response.server_content.model_turn.parts:
59+
sc = response.server_content
60+
if not sc:
61+
continue
62+
if sc.model_turn:
63+
for part in sc.model_turn.parts:
5864
if part.inline_data and isinstance(part.inline_data.data, bytes):
5965
audio_queue_output.put_nowait(part.inline_data.data)
66+
if sc.output_transcription:
67+
if last_was_input:
68+
print()
69+
last_was_input = False
70+
t = sc.output_transcription.text
71+
print(t, end="", flush=True)
72+
if t.rstrip()[-1:] in '.!?':
73+
print()
74+
if sc.input_transcription:
75+
if not last_was_input:
76+
print()
77+
last_was_input = True
78+
t = sc.input_transcription.text
79+
print(f"\033[3m{t}\033[0m", end="", flush=True)
80+
if t.rstrip()[-1:] in '.!?':
81+
print()
6082

6183
# Empty the queue on interruption to stop playback
6284
while not audio_queue_output.empty():

0 commit comments

Comments
 (0)