Skip to content

Commit 02882ef

Browse files
authored
Add realtime audio mapping support and SIP session payload tests (#2225)
1 parent fd65d1a commit 02882ef

9 files changed

Lines changed: 458 additions & 46 deletions

File tree

docs/llms-full.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ The Agents SDK delivers a focused set of Python primitives—agents, tools, guar
3333
- [Voice quickstart](https://openai.github.io/openai-agents-python/voice/quickstart/): Build an end-to-end voice assistant with streaming transcription, text-to-speech, and event-driven responses.
3434
- [Voice pipeline](https://openai.github.io/openai-agents-python/voice/pipeline/): Customize audio capture, buffering, model invocation, and playback in voice-first experiences.
3535
- [Voice tracing](https://openai.github.io/openai-agents-python/voice/tracing/): Inspect voice session traces, latency breakdowns, and audio event timelines.
36-
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Launch realtime agents over WebRTC or websockets, subscribe to events, and manage low-latency execution.
36+
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Launch realtime agents over websockets (WebRTC is not available in the Python SDK), subscribe to events, and manage low-latency execution.
3737
- [Realtime guide](https://openai.github.io/openai-agents-python/realtime/guide/): Deep dive into realtime session lifecycle, event schemas, concurrency, and backpressure handling.
3838

3939
## Models and Provider Integrations

docs/llms.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ The SDK focuses on a concise set of primitives so you can orchestrate multi-agen
3636
## Modalities and Interfaces
3737
- [Voice quickstart](https://openai.github.io/openai-agents-python/voice/quickstart/): Build speech-enabled agents with streaming transcription and TTS.
3838
- [Voice pipeline](https://openai.github.io/openai-agents-python/voice/pipeline/): Customize audio ingestion, tool execution, and response rendering.
39-
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Stand up low-latency realtime agents with WebRTC and websocket transports.
39+
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Stand up low-latency realtime agents with websocket transport (WebRTC is not available in the Python SDK).
4040
- [Realtime guide](https://openai.github.io/openai-agents-python/realtime/guide/): Deep dive into session lifecycle, event formats, and concurrency patterns.
4141

4242
## API Reference Highlights

src/agents/realtime/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
)
8585
from .openai_realtime import (
8686
DEFAULT_MODEL_SETTINGS,
87+
OpenAIRealtimeSIPModel,
8788
OpenAIRealtimeWebSocketModel,
8889
get_api_key,
8990
)
@@ -176,6 +177,7 @@
176177
"RealtimeModelUserInputMessage",
177178
# OpenAI Realtime
178179
"DEFAULT_MODEL_SETTINGS",
180+
"OpenAIRealtimeSIPModel",
179181
"OpenAIRealtimeWebSocketModel",
180182
"get_api_key",
181183
# Session

src/agents/realtime/audio_formats.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from __future__ import annotations
22

3+
from collections.abc import Mapping
4+
from typing import Any, Literal
5+
36
from openai.types.realtime.realtime_audio_formats import (
47
AudioPCM,
58
AudioPCMA,
@@ -11,7 +14,7 @@
1114

1215

1316
def to_realtime_audio_format(
14-
input_audio_format: str | RealtimeAudioFormats | None,
17+
input_audio_format: str | RealtimeAudioFormats | Mapping[str, Any] | None,
1518
) -> RealtimeAudioFormats | None:
1619
format: RealtimeAudioFormats | None = None
1720
if input_audio_format is not None:
@@ -24,6 +27,27 @@ def to_realtime_audio_format(
2427
format = AudioPCMA(type="audio/pcma")
2528
else:
2629
logger.debug(f"Unknown input_audio_format: {input_audio_format}")
30+
elif isinstance(input_audio_format, Mapping):
31+
fmt_type = input_audio_format.get("type")
32+
rate = input_audio_format.get("rate")
33+
if fmt_type == "audio/pcm":
34+
pcm_rate: Literal[24000] | None
35+
if isinstance(rate, (int, float)) and int(rate) == 24000:
36+
pcm_rate = 24000
37+
elif rate is None:
38+
pcm_rate = 24000
39+
else:
40+
logger.debug(
41+
f"Unknown pcm rate in input_audio_format mapping: {input_audio_format}"
42+
)
43+
pcm_rate = 24000
44+
format = AudioPCM(type="audio/pcm", rate=pcm_rate)
45+
elif fmt_type == "audio/pcmu":
46+
format = AudioPCMU(type="audio/pcmu")
47+
elif fmt_type == "audio/pcma":
48+
format = AudioPCMA(type="audio/pcma")
49+
else:
50+
logger.debug(f"Unknown input_audio_format mapping: {input_audio_format}")
2751
else:
2852
format = input_audio_format
2953
return format

src/agents/realtime/config.py

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
from __future__ import annotations
22

3-
from typing import (
4-
Any,
5-
Literal,
6-
Union,
7-
)
3+
from collections.abc import Mapping
4+
from typing import Any, Literal, Union
85

96
from openai.types.realtime.realtime_audio_formats import (
107
RealtimeAudioFormats as OpenAIRealtimeAudioFormats,
@@ -28,13 +25,20 @@
2825
"gpt-4o-realtime-preview-2024-12-17",
2926
"gpt-4o-realtime-preview-2024-10-01",
3027
"gpt-4o-mini-realtime-preview-2024-12-17",
28+
"gpt-realtime-mini",
29+
"gpt-realtime-mini-2025-10-06",
3130
],
3231
str,
3332
]
3433
"""The name of a realtime model."""
3534

3635

37-
RealtimeAudioFormat: TypeAlias = Union[Literal["pcm16", "g711_ulaw", "g711_alaw"], str]
36+
RealtimeAudioFormat: TypeAlias = Union[
37+
Literal["pcm16", "g711_ulaw", "g711_alaw"],
38+
str,
39+
Mapping[str, Any],
40+
OpenAIRealtimeAudioFormats,
41+
]
3842
"""The audio format for realtime audio streams."""
3943

4044

@@ -96,6 +100,30 @@ class RealtimeTurnDetectionConfig(TypedDict):
96100
"""Threshold for server-vad to trigger a response if the user is idle for this duration."""
97101

98102

103+
class RealtimeAudioInputConfig(TypedDict, total=False):
104+
"""Configuration for audio input in realtime sessions."""
105+
106+
format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
107+
noise_reduction: RealtimeInputAudioNoiseReductionConfig | None
108+
transcription: RealtimeInputAudioTranscriptionConfig
109+
turn_detection: RealtimeTurnDetectionConfig
110+
111+
112+
class RealtimeAudioOutputConfig(TypedDict, total=False):
113+
"""Configuration for audio output in realtime sessions."""
114+
115+
format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
116+
voice: str
117+
speed: float
118+
119+
120+
class RealtimeAudioConfig(TypedDict, total=False):
121+
"""Audio configuration for realtime sessions."""
122+
123+
input: RealtimeAudioInputConfig
124+
output: RealtimeAudioOutputConfig
125+
126+
99127
class RealtimeSessionModelSettings(TypedDict):
100128
"""Model settings for a realtime model session."""
101129

@@ -111,6 +139,12 @@ class RealtimeSessionModelSettings(TypedDict):
111139
modalities: NotRequired[list[Literal["text", "audio"]]]
112140
"""The modalities the model should support."""
113141

142+
output_modalities: NotRequired[list[Literal["text", "audio"]]]
143+
"""The output modalities the model should support."""
144+
145+
audio: NotRequired[RealtimeAudioConfig]
146+
"""The audio configuration for the session."""
147+
114148
voice: NotRequired[str]
115149
"""The voice to use for audio output."""
116150

0 commit comments

Comments (0)