Skip to content

Commit d23d85a

Browse files
authored
fix: enable handling video_url in litellm and chat completions models (#2614)
1 parent e6f4b36 commit d23d85a

2 files changed

Lines changed: 68 additions & 3 deletions

File tree

src/agents/models/chatcmpl_converter.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,11 @@
5656
)
5757
from .fake_id import FAKE_RESPONSES_ID
5858

59-
ResponseInputContentWithAudioParam = Union[ResponseInputContentParam, ResponseInputAudioParam]
59+
# Union of accepted content-part param types for converted input items.
# ``dict[str, Any]`` admits extension parts not covered by the typed unions
# (e.g. ``{"type": "video_url", ...}`` added by this commit) — presumably for
# non-official backends such as LiteLLM; verify against callers.
ResponseInputContentWithAudioParam = Union[
    ResponseInputContentParam,
    ResponseInputAudioParam,
    dict[str, Any],
]
6064

6165

6266
class Converter:
@@ -309,10 +313,14 @@ def extract_text_content(
309313
all_content = cls.extract_all_content(content)
310314
if isinstance(all_content, str):
311315
return all_content
316+
312317
out: list[ChatCompletionContentPartTextParam] = []
313318
for c in all_content:
314-
if c.get("type") == "text":
319+
c_type = cast(dict[str, Any], c).get("type")
320+
if c_type == "text":
315321
out.append(cast(ChatCompletionContentPartTextParam, c))
322+
elif c_type == "video_url":
323+
raise UserError(f"Only text content is supported here, got: {c}")
316324
return out
317325

318326
@classmethod
@@ -352,6 +360,19 @@ def extract_all_content(
352360
},
353361
)
354362
)
363+
elif isinstance(c, dict) and c.get("type") == "video_url":
364+
video_payload = c.get("video_url")
365+
if not isinstance(video_payload, dict) or not video_payload.get("url"):
366+
raise UserError(f"Only video URLs are supported for video_url {c}")
367+
out.append(
368+
cast(
369+
Any,
370+
{
371+
"type": "video_url",
372+
"video_url": {"url": video_payload["url"]},
373+
},
374+
)
375+
)
355376
elif isinstance(c, dict) and c.get("type") == "input_audio":
356377
casted_audio_param = cast(ResponseInputAudioParam, c)
357378
audio_payload = casted_audio_param.get("input_audio")
@@ -657,7 +678,15 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
657678
if preserve_tool_output_all_content:
658679
tool_result_content = cls.extract_all_content(output_content)
659680
else:
660-
tool_result_content = cls.extract_text_content(output_content) # type: ignore[assignment]
681+
all_output_content = cls.extract_all_content(output_content)
682+
if isinstance(all_output_content, str):
683+
tool_result_content = all_output_content
684+
else:
685+
tool_result_content = [
686+
cast(ChatCompletionContentPartTextParam, c)
687+
for c in all_output_content
688+
if c.get("type") == "text"
689+
]
661690
msg: ChatCompletionToolMessageParam = {
662691
"role": "tool",
663692
"tool_call_id": func_output["call_id"],

src/agents/models/openai_chatcompletions.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from .. import _debug
2222
from ..agent_output import AgentOutputSchemaBase
23+
from ..exceptions import UserError
2324
from ..handoffs import Handoff
2425
from ..items import ModelResponse, TResponseInputItem, TResponseStreamEvent
2526
from ..logger import logger
@@ -41,6 +42,10 @@
4142

4243

4344
class OpenAIChatCompletionsModel(Model):
45+
    # Content-part ``type`` values accepted for user messages when the client
    # targets the official OpenAI backend. Anything outside this set (e.g.
    # "video_url") is rejected up front by
    # _validate_official_openai_input_content_types with a UserError.
    _OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES = frozenset(
        {"input_text", "input_image", "input_audio", "input_file"}
    )
48+
4449
def __init__(
4550
self,
4651
model: str | ChatModel,
@@ -52,6 +57,36 @@ def __init__(
5257
def _non_null_or_omit(self, value: Any) -> Any:
5358
return value if value is not None else omit
5459

60+
    def _validate_official_openai_input_content_types(
        self, request_input: str | list[TResponseInputItem]
    ) -> None:
        """Reject user-message content parts the official OpenAI backend cannot accept.

        Validation only runs when the configured client points at the official
        OpenAI API (per ``ChatCmplHelpers.is_openai``); other backends reached
        through this model class (e.g. LiteLLM proxies, per the commit intent)
        are left unvalidated so they can receive extra part types such as
        ``video_url``. A plain-string input carries no structured parts and is
        always allowed.

        Args:
            request_input: The raw model input — either a prompt string or a
                list of response input items.

        Raises:
            UserError: If any user message contains a dict content part whose
                ``"type"`` is not in
                ``_OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES``.
        """
        if not ChatCmplHelpers.is_openai(self._client) or isinstance(request_input, str):
            return

        for item in request_input:
            # Only (easy) input messages with role "user" carry the multimodal
            # content parts that need policing; everything else passes through.
            message = Converter.maybe_easy_input_message(item) or Converter.maybe_input_message(
                item
            )
            if message is None or message["role"] != "user":
                continue

            content_parts = message["content"]
            if isinstance(content_parts, str):
                # Plain-text content — always supported.
                continue

            for part in content_parts:
                if not isinstance(part, dict):
                    # Non-dict parts are presumably typed SDK objects and are
                    # not validated here — TODO confirm with callers.
                    continue

                content_type = part.get("type")
                if content_type in self._OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES:
                    continue

                raise UserError(
                    "Unsupported content type for official OpenAI Chat Completions: "
                    f"{content_type!r} in {part}"
                )
89+
5590
async def get_response(
5691
self,
5792
system_instructions: str | None,
@@ -272,6 +307,7 @@ async def _fetch_response(
272307
stream: bool = False,
273308
prompt: ResponsePromptParam | None = None,
274309
) -> ChatCompletion | tuple[Response, AsyncStream[ChatCompletionChunk]]:
310+
self._validate_official_openai_input_content_types(input)
275311
converted_messages = Converter.items_to_messages(input, model=self.model)
276312

277313
if system_instructions:

0 commit comments

Comments
 (0)