Skip to content

Commit d23d85a

Browse files
authored
fix: enable handling video_url in litellm and chat completions models (#2614)
1 parent e6f4b36 commit d23d85a

2 files changed

Lines changed: 68 additions & 3 deletions

File tree

src/agents/models/chatcmpl_converter.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,11 @@
5656
)
5757
from .fake_id import FAKE_RESPONSES_ID
5858

59-
ResponseInputContentWithAudioParam = Union[ResponseInputContentParam, ResponseInputAudioParam]
59+
# Union of accepted content-part param types for converted input items.
# ``dict[str, Any]`` admits extension parts not covered by the typed unions
# (e.g. ``{"type": "video_url", ...}`` added by this commit) — presumably for
# non-official backends such as LiteLLM; verify against callers.
ResponseInputContentWithAudioParam = Union[
    ResponseInputContentParam,
    ResponseInputAudioParam,
    dict[str, Any],
]
6064

6165

6266
class Converter:
@@ -309,10 +313,14 @@ def extract_text_content(
309313
all_content = cls.extract_all_content(content)
310314
if isinstance(all_content, str):
311315
return all_content
316+
312317
out: list[ChatCompletionContentPartTextParam] = []
313318
for c in all_content:
314-
if c.get("type") == "text":
319+
c_type = cast(dict[str, Any], c).get("type")
320+
if c_type == "text":
315321
out.append(cast(ChatCompletionContentPartTextParam, c))
322+
elif c_type == "video_url":
323+
raise UserError(f"Only text content is supported here, got: {c}")
316324
return out
317325

318326
@classmethod
@@ -352,6 +360,19 @@ def extract_all_content(
352360
},
353361
)
354362
)
363+
elif isinstance(c, dict) and c.get("type") == "video_url":
364+
video_payload = c.get("video_url")
365+
if not isinstance(video_payload, dict) or not video_payload.get("url"):
366+
raise UserError(f"Only video URLs are supported for video_url {c}")
367+
out.append(
368+
cast(
369+
Any,
370+
{
371+
"type": "video_url",
372+
"video_url": {"url": video_payload["url"]},
373+
},
374+
)
375+
)
355376
elif isinstance(c, dict) and c.get("type") == "input_audio":
356377
casted_audio_param = cast(ResponseInputAudioParam, c)
357378
audio_payload = casted_audio_param.get("input_audio")
@@ -657,7 +678,15 @@ def ensure_assistant_message() -> ChatCompletionAssistantMessageParam:
657678
if preserve_tool_output_all_content:
658679
tool_result_content = cls.extract_all_content(output_content)
659680
else:
660-
tool_result_content = cls.extract_text_content(output_content) # type: ignore[assignment]
681+
all_output_content = cls.extract_all_content(output_content)
682+
if isinstance(all_output_content, str):
683+
tool_result_content = all_output_content
684+
else:
685+
tool_result_content = [
686+
cast(ChatCompletionContentPartTextParam, c)
687+
for c in all_output_content
688+
if c.get("type") == "text"
689+
]
661690
msg: ChatCompletionToolMessageParam = {
662691
"role": "tool",
663692
"tool_call_id": func_output["call_id"],

src/agents/models/openai_chatcompletions.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from .. import _debug
2222
from ..agent_output import AgentOutputSchemaBase
23+
from ..exceptions import UserError
2324
from ..handoffs import Handoff
2425
from ..items import ModelResponse, TResponseInputItem, TResponseStreamEvent
2526
from ..logger import logger
@@ -41,6 +42,10 @@
4142

4243

4344
class OpenAIChatCompletionsModel(Model):
45+
    # Content-part ``type`` values accepted for user messages when the client
    # targets the official OpenAI backend. Anything outside this set (e.g.
    # "video_url") is rejected up front by
    # _validate_official_openai_input_content_types with a UserError.
    _OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES = frozenset(
        {"input_text", "input_image", "input_audio", "input_file"}
    )
48+
4449
def __init__(
4550
self,
4651
model: str | ChatModel,
@@ -52,6 +57,36 @@ def __init__(
5257
def _non_null_or_omit(self, value: Any) -> Any:
5358
return value if value is not None else omit
5459

60+
    def _validate_official_openai_input_content_types(
        self, request_input: str | list[TResponseInputItem]
    ) -> None:
        """Reject user-message content parts the official OpenAI backend cannot accept.

        Validation only runs when the configured client points at the official
        OpenAI API (per ``ChatCmplHelpers.is_openai``); other backends reached
        through this model class (e.g. LiteLLM proxies, per the commit intent)
        are left unvalidated so they can receive extra part types such as
        ``video_url``. A plain-string input carries no structured parts and is
        always allowed.

        Args:
            request_input: The raw model input — either a prompt string or a
                list of response input items.

        Raises:
            UserError: If any user message contains a dict content part whose
                ``"type"`` is not in
                ``_OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES``.
        """
        if not ChatCmplHelpers.is_openai(self._client) or isinstance(request_input, str):
            return

        for item in request_input:
            # Only (easy) input messages with role "user" carry the multimodal
            # content parts that need policing; everything else passes through.
            message = Converter.maybe_easy_input_message(item) or Converter.maybe_input_message(
                item
            )
            if message is None or message["role"] != "user":
                continue

            content_parts = message["content"]
            if isinstance(content_parts, str):
                # Plain-text content — always supported.
                continue

            for part in content_parts:
                if not isinstance(part, dict):
                    # Non-dict parts are presumably typed SDK objects and are
                    # not validated here — TODO confirm with callers.
                    continue

                content_type = part.get("type")
                if content_type in self._OFFICIAL_OPENAI_SUPPORTED_INPUT_CONTENT_TYPES:
                    continue

                raise UserError(
                    "Unsupported content type for official OpenAI Chat Completions: "
                    f"{content_type!r} in {part}"
                )
89+
5590
async def get_response(
5691
self,
5792
system_instructions: str | None,
@@ -272,6 +307,7 @@ async def _fetch_response(
272307
stream: bool = False,
273308
prompt: ResponsePromptParam | None = None,
274309
) -> ChatCompletion | tuple[Response, AsyncStream[ChatCompletionChunk]]:
310+
self._validate_official_openai_input_content_types(input)
275311
converted_messages = Converter.items_to_messages(input, model=self.model)
276312

277313
if system_instructions:

0 commit comments

Comments
 (0)