From adec90aa5187c2be0291e0fe80a16bee548e8192 Mon Sep 17 00:00:00 2001
From: Tobias Wochinger <tobias.wochinger@clickhouse.com>
Date: Wed, 20 May 2026 10:33:22 +0200
Subject: [PATCH 1/5] feat(scores): support text scores in v3

Backport TEXT score typing and generated API models to v3-stable.
---
 langfuse/_client/client.py                    | 24 ++---
 langfuse/_client/span.py                      | 16 ++--
 langfuse/api/__init__.py                      | 12 +++
 langfuse/api/resources/__init__.py            | 12 +++
 langfuse/api/resources/commons/__init__.py    |  8 ++
 .../api/resources/commons/types/__init__.py   | 15 ++-
 langfuse/api/resources/commons/types/score.py | 68 +++++++++++++-
 .../commons/types/score_config_data_type.py   |  4 +
 .../commons/types/score_data_type.py          |  4 +
 .../api/resources/commons/types/score_v_1.py  | 62 ++++++++++++-
 .../api/resources/commons/types/text_score.py | 48 ++++++++++
 .../resources/commons/types/text_score_v_1.py | 48 ++++++++++
 .../resources/ingestion/types/score_body.py   |  2 +-
 .../score/types/create_score_request.py       |  2 +-
 langfuse/api/resources/score_v_2/__init__.py  |  4 +
 .../api/resources/score_v_2/types/__init__.py |  4 +
 .../types/get_scores_response_data.py         | 66 +++++++++++++
 .../types/get_scores_response_data_text.py    | 46 +++++++++
 langfuse/experiment.py                        |  4 +-
 langfuse/types.py                             |  5 +-
 tests/test_core_sdk.py                        | 93 +++++++++++++++++++
 21 files changed, 519 insertions(+), 28 deletions(-)
 create mode 100644 langfuse/api/resources/commons/types/text_score.py
 create mode 100644 langfuse/api/resources/commons/types/text_score_v_1.py
 create mode 100644 langfuse/api/resources/score_v_2/types/get_scores_response_data_text.py

diff --git a/langfuse/_client/client.py b/langfuse/_client/client.py
index 3787280c5..94e5a084d 100644
--- a/langfuse/_client/client.py
+++ b/langfuse/_client/client.py
@@ -201,7 +201,7 @@ class Langfuse:
                     cost_details={"total_cost": 0.0023}
                 )
 
-                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL)
+                # Score the generation (supports NUMERIC, BOOLEAN, CATEGORICAL, TEXT)
                 generation.score(name="relevance", value=0.95, data_type="NUMERIC")
         ```
     """
@@ -1992,7 +1992,7 @@ def create_score(
         trace_id: Optional[str] = None,
         score_id: Optional[str] = None,
         observation_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         metadata: Optional[Any] = None,
@@ -2022,13 +2022,13 @@ def create_score(
 
         Args:
             name: Name of the score (e.g., "relevance", "accuracy")
-            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
+            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
             session_id: ID of the Langfuse session to associate the score with
             dataset_run_id: ID of the Langfuse dataset run to associate the score with
             trace_id: ID of the Langfuse trace to associate the score with
             observation_id: Optional ID of the specific observation to score. Trace ID must be provided too.
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             metadata: Optional metadata to be attached to the score
@@ -2152,7 +2152,7 @@ def score_current_span(
         name: str,
         value: str,
         score_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         metadata: Optional[Any] = None,
@@ -2176,9 +2176,9 @@ def score_current_span(
 
         Args:
             name: Name of the score (e.g., "relevance", "accuracy")
-            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
+            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             metadata: Optional metadata to be attached to the score
@@ -2216,7 +2216,7 @@ def score_current_span(
                 name=name,
                 value=cast(str, value),
                 score_id=score_id,
-                data_type=cast(Literal["CATEGORICAL"], data_type),
+                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
                 comment=comment,
                 config_id=config_id,
                 metadata=metadata,
@@ -2242,7 +2242,7 @@ def score_current_trace(
         name: str,
         value: str,
         score_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         metadata: Optional[Any] = None,
@@ -2267,9 +2267,9 @@ def score_current_trace(
 
         Args:
             name: Name of the score (e.g., "user_satisfaction", "overall_quality")
-            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL)
+            value: Score value (can be numeric for NUMERIC/BOOLEAN types or string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             metadata: Optional metadata to be attached to the score
@@ -2305,7 +2305,7 @@ def score_current_trace(
                 name=name,
                 value=cast(str, value),
                 score_id=score_id,
-                data_type=cast(Literal["CATEGORICAL"], data_type),
+                data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
                 comment=comment,
                 config_id=config_id,
                 metadata=metadata,
diff --git a/langfuse/_client/span.py b/langfuse/_client/span.py
index 92c1556ca..69ec83fe5 100644
--- a/langfuse/_client/span.py
+++ b/langfuse/_client/span.py
@@ -287,7 +287,7 @@ def score(
         name: str,
         value: str,
         score_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         timestamp: Optional[datetime] = None,
@@ -313,9 +313,9 @@ def score(
 
         Args:
             name: Name of the score (e.g., "relevance", "accuracy")
-            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
+            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             timestamp: Optional timestamp for the score (defaults to current UTC time)
@@ -342,7 +342,7 @@ def score(
             trace_id=self.trace_id,
             observation_id=self.id,
             score_id=score_id,
-            data_type=cast(Literal["CATEGORICAL"], data_type),
+            data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
             comment=comment,
             config_id=config_id,
             timestamp=timestamp,
@@ -370,7 +370,7 @@ def score_trace(
         name: str,
         value: str,
         score_id: Optional[str] = None,
-        data_type: Optional[Literal["CATEGORICAL"]] = "CATEGORICAL",
+        data_type: Optional[Literal["CATEGORICAL", "TEXT"]] = "CATEGORICAL",
         comment: Optional[str] = None,
         config_id: Optional[str] = None,
         timestamp: Optional[datetime] = None,
@@ -397,9 +397,9 @@ def score_trace(
 
         Args:
             name: Name of the score (e.g., "user_satisfaction", "overall_quality")
-            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
+            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL/TEXT)
             score_id: Optional custom ID for the score (auto-generated if not provided)
-            data_type: Type of score (NUMERIC, BOOLEAN, or CATEGORICAL)
+            data_type: Type of score (NUMERIC, BOOLEAN, CATEGORICAL, or TEXT)
             comment: Optional comment or explanation for the score
             config_id: Optional ID of a score config defined in Langfuse
             timestamp: Optional timestamp for the score (defaults to current UTC time)
@@ -425,7 +425,7 @@ def score_trace(
             value=cast(str, value),
             trace_id=self.trace_id,
             score_id=score_id,
-            data_type=cast(Literal["CATEGORICAL"], data_type),
+            data_type=cast(Literal["CATEGORICAL", "TEXT"], data_type),
             comment=comment,
             config_id=config_id,
             timestamp=timestamp,
diff --git a/langfuse/api/__init__.py b/langfuse/api/__init__.py
index d1a6414ed..d366774d5 100644
--- a/langfuse/api/__init__.py
+++ b/langfuse/api/__init__.py
@@ -88,10 +88,12 @@
     GetScoresResponseDataCategorical,
     GetScoresResponseDataCorrection,
     GetScoresResponseDataNumeric,
+    GetScoresResponseDataText,
     GetScoresResponseData_Boolean,
     GetScoresResponseData_Categorical,
     GetScoresResponseData_Correction,
     GetScoresResponseData_Numeric,
+    GetScoresResponseData_Text,
     GetScoresResponseTraceData,
     HealthResponse,
     IngestionError,
@@ -200,10 +202,12 @@
     ScoreV1_Boolean,
     ScoreV1_Categorical,
     ScoreV1_Numeric,
+    ScoreV1_Text,
     Score_Boolean,
     Score_Categorical,
     Score_Correction,
     Score_Numeric,
+    Score_Text,
     SdkLogBody,
     SdkLogEvent,
     ServiceProviderConfig,
@@ -211,6 +215,8 @@
     Session,
     SessionWithTraces,
     Sort,
+    TextScore,
+    TextScoreV1,
     TextPrompt,
     Trace,
     TraceBody,
@@ -349,10 +355,12 @@
     "GetScoresResponseDataCategorical",
     "GetScoresResponseDataCorrection",
     "GetScoresResponseDataNumeric",
+    "GetScoresResponseDataText",
     "GetScoresResponseData_Boolean",
     "GetScoresResponseData_Categorical",
     "GetScoresResponseData_Correction",
     "GetScoresResponseData_Numeric",
+    "GetScoresResponseData_Text",
     "GetScoresResponseTraceData",
     "HealthResponse",
     "IngestionError",
@@ -461,10 +469,12 @@
     "ScoreV1_Boolean",
     "ScoreV1_Categorical",
     "ScoreV1_Numeric",
+    "ScoreV1_Text",
     "Score_Boolean",
     "Score_Categorical",
     "Score_Correction",
     "Score_Numeric",
+    "Score_Text",
     "SdkLogBody",
     "SdkLogEvent",
     "ServiceProviderConfig",
@@ -472,6 +482,8 @@
     "Session",
     "SessionWithTraces",
     "Sort",
+    "TextScore",
+    "TextScoreV1",
     "TextPrompt",
     "Trace",
     "TraceBody",
diff --git a/langfuse/api/resources/__init__.py b/langfuse/api/resources/__init__.py
index 0de0a56a5..b54d98098 100644
--- a/langfuse/api/resources/__init__.py
+++ b/langfuse/api/resources/__init__.py
@@ -100,12 +100,16 @@
     ScoreV1_Boolean,
     ScoreV1_Categorical,
     ScoreV1_Numeric,
+    ScoreV1_Text,
     Score_Boolean,
     Score_Categorical,
     Score_Correction,
     Score_Numeric,
+    Score_Text,
     Session,
     SessionWithTraces,
+    TextScore,
+    TextScoreV1,
     Trace,
     TraceWithDetails,
     TraceWithFullDetails,
@@ -272,10 +276,12 @@
     GetScoresResponseDataCategorical,
     GetScoresResponseDataCorrection,
     GetScoresResponseDataNumeric,
+    GetScoresResponseDataText,
     GetScoresResponseData_Boolean,
     GetScoresResponseData_Categorical,
     GetScoresResponseData_Correction,
     GetScoresResponseData_Numeric,
+    GetScoresResponseData_Text,
     GetScoresResponseTraceData,
 )
 from .sessions import PaginatedSessions
@@ -369,10 +375,12 @@
     "GetScoresResponseDataCategorical",
     "GetScoresResponseDataCorrection",
     "GetScoresResponseDataNumeric",
+    "GetScoresResponseDataText",
     "GetScoresResponseData_Boolean",
     "GetScoresResponseData_Categorical",
     "GetScoresResponseData_Correction",
     "GetScoresResponseData_Numeric",
+    "GetScoresResponseData_Text",
     "GetScoresResponseTraceData",
     "HealthResponse",
     "IngestionError",
@@ -481,10 +489,12 @@
     "ScoreV1_Boolean",
     "ScoreV1_Categorical",
     "ScoreV1_Numeric",
+    "ScoreV1_Text",
     "Score_Boolean",
     "Score_Categorical",
     "Score_Correction",
     "Score_Numeric",
+    "Score_Text",
     "SdkLogBody",
     "SdkLogEvent",
     "ServiceProviderConfig",
@@ -492,6 +502,8 @@
     "Session",
     "SessionWithTraces",
     "Sort",
+    "TextScore",
+    "TextScoreV1",
     "TextPrompt",
     "Trace",
     "TraceBody",
diff --git a/langfuse/api/resources/commons/__init__.py b/langfuse/api/resources/commons/__init__.py
index 7105b22c5..e1a050ca5 100644
--- a/langfuse/api/resources/commons/__init__.py
+++ b/langfuse/api/resources/commons/__init__.py
@@ -40,12 +40,16 @@
     ScoreV1_Boolean,
     ScoreV1_Categorical,
     ScoreV1_Numeric,
+    ScoreV1_Text,
     Score_Boolean,
     Score_Categorical,
     Score_Correction,
     Score_Numeric,
+    Score_Text,
     Session,
     SessionWithTraces,
+    TextScore,
+    TextScoreV1,
     Trace,
     TraceWithDetails,
     TraceWithFullDetails,
@@ -103,12 +107,16 @@
     "ScoreV1_Boolean",
     "ScoreV1_Categorical",
     "ScoreV1_Numeric",
+    "ScoreV1_Text",
     "Score_Boolean",
     "Score_Categorical",
     "Score_Correction",
     "Score_Numeric",
+    "Score_Text",
     "Session",
     "SessionWithTraces",
+    "TextScore",
+    "TextScoreV1",
     "Trace",
     "TraceWithDetails",
     "TraceWithFullDetails",
diff --git a/langfuse/api/resources/commons/types/__init__.py b/langfuse/api/resources/commons/types/__init__.py
index df87680b7..b5a17e491 100644
--- a/langfuse/api/resources/commons/types/__init__.py
+++ b/langfuse/api/resources/commons/types/__init__.py
@@ -36,14 +36,23 @@
     Score_Categorical,
     Score_Correction,
     Score_Numeric,
+    Score_Text,
 )
 from .score_config import ScoreConfig
 from .score_config_data_type import ScoreConfigDataType
 from .score_data_type import ScoreDataType
 from .score_source import ScoreSource
-from .score_v_1 import ScoreV1, ScoreV1_Boolean, ScoreV1_Categorical, ScoreV1_Numeric
+from .score_v_1 import (
+    ScoreV1,
+    ScoreV1_Boolean,
+    ScoreV1_Categorical,
+    ScoreV1_Numeric,
+    ScoreV1_Text,
+)
 from .session import Session
 from .session_with_traces import SessionWithTraces
+from .text_score import TextScore
+from .text_score_v_1 import TextScoreV1
 from .trace import Trace
 from .trace_with_details import TraceWithDetails
 from .trace_with_full_details import TraceWithFullDetails
@@ -89,12 +98,16 @@
     "ScoreV1_Boolean",
     "ScoreV1_Categorical",
     "ScoreV1_Numeric",
+    "ScoreV1_Text",
     "Score_Boolean",
     "Score_Categorical",
     "Score_Correction",
     "Score_Numeric",
+    "Score_Text",
     "Session",
     "SessionWithTraces",
+    "TextScore",
+    "TextScoreV1",
     "Trace",
     "TraceWithDetails",
     "TraceWithFullDetails",
diff --git a/langfuse/api/resources/commons/types/score.py b/langfuse/api/resources/commons/types/score.py
index dab6eee43..cc008d6d2 100644
--- a/langfuse/api/resources/commons/types/score.py
+++ b/langfuse/api/resources/commons/types/score.py
@@ -204,6 +204,70 @@ class Config:
         json_encoders = {dt.datetime: serialize_datetime}
 
 
+class Score_Text(pydantic_v1.BaseModel):  # TEXT member of the `Score` union declared below
+    string_value: str = pydantic_v1.Field(alias="stringValue")  # required; no numeric `value` field is declared
+    id: str
+    trace_id: typing.Optional[str] = pydantic_v1.Field(alias="traceId", default=None)
+    session_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="sessionId", default=None
+    )
+    observation_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="observationId", default=None
+    )
+    dataset_run_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="datasetRunId", default=None
+    )
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    created_at: dt.datetime = pydantic_v1.Field(alias="createdAt")
+    updated_at: dt.datetime = pydantic_v1.Field(alias="updatedAt")
+    author_user_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="authorUserId", default=None
+    )
+    comment: typing.Optional[str] = None
+    metadata: typing.Any
+    config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
+    queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
+    environment: str
+    data_type: typing.Literal["TEXT"] = pydantic_v1.Field(  # only "TEXT" validates; also the default
+        alias="dataType", default="TEXT"
+    )
+
+    def json(self, **kwargs: typing.Any) -> str:  # JSON dump keyed by alias, unset fields omitted
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,  # spread last so caller-supplied options override the two defaults
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:  # alias-keyed dict built from two dumps
+        kwargs_with_defaults_exclude_unset: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,  # first dump: only fields that were explicitly set
+            **kwargs,
+        }
+        kwargs_with_defaults_exclude_none: typing.Any = {
+            "by_alias": True,
+            "exclude_none": True,  # second dump: every field whose value is not None
+            **kwargs,
+        }
+
+        return deep_union_pydantic_dicts(  # NOTE(review): helper presumably deep-merges both dumps; confirm
+            super().dict(**kwargs_with_defaults_exclude_unset),
+            super().dict(**kwargs_with_defaults_exclude_none),
+        )
+
+    class Config:
+        frozen = True  # instances cannot be mutated after construction
+        smart_union = True
+        allow_population_by_field_name = True  # accept snake_case field names as well as aliases
+        populate_by_name = True
+        extra = pydantic_v1.Extra.allow  # undeclared fields are kept instead of rejected
+        json_encoders = {dt.datetime: serialize_datetime}
+
+
 class Score_Correction(pydantic_v1.BaseModel):
     value: float
     string_value: str = pydantic_v1.Field(alias="stringValue")
@@ -269,4 +333,6 @@ class Config:
         json_encoders = {dt.datetime: serialize_datetime}
 
 
-Score = typing.Union[Score_Numeric, Score_Categorical, Score_Boolean, Score_Correction]
+Score = typing.Union[
+    Score_Numeric, Score_Categorical, Score_Boolean, Score_Text, Score_Correction
+]
diff --git a/langfuse/api/resources/commons/types/score_config_data_type.py b/langfuse/api/resources/commons/types/score_config_data_type.py
index a7c9e7251..c945db494 100644
--- a/langfuse/api/resources/commons/types/score_config_data_type.py
+++ b/langfuse/api/resources/commons/types/score_config_data_type.py
@@ -10,12 +10,14 @@ class ScoreConfigDataType(str, enum.Enum):
     NUMERIC = "NUMERIC"
     BOOLEAN = "BOOLEAN"
     CATEGORICAL = "CATEGORICAL"
+    TEXT = "TEXT"
 
     def visit(
         self,
         numeric: typing.Callable[[], T_Result],
         boolean: typing.Callable[[], T_Result],
         categorical: typing.Callable[[], T_Result],
+        text: typing.Callable[[], T_Result],
     ) -> T_Result:
         if self is ScoreConfigDataType.NUMERIC:
             return numeric()
@@ -23,3 +25,5 @@ def visit(
             return boolean()
         if self is ScoreConfigDataType.CATEGORICAL:
             return categorical()
+        if self is ScoreConfigDataType.TEXT:
+            return text()
diff --git a/langfuse/api/resources/commons/types/score_data_type.py b/langfuse/api/resources/commons/types/score_data_type.py
index 67bd9958b..aa57e3fee 100644
--- a/langfuse/api/resources/commons/types/score_data_type.py
+++ b/langfuse/api/resources/commons/types/score_data_type.py
@@ -10,6 +10,7 @@ class ScoreDataType(str, enum.Enum):
     NUMERIC = "NUMERIC"
     BOOLEAN = "BOOLEAN"
     CATEGORICAL = "CATEGORICAL"
+    TEXT = "TEXT"
     CORRECTION = "CORRECTION"
 
     def visit(
@@ -17,6 +18,7 @@ def visit(
         numeric: typing.Callable[[], T_Result],
         boolean: typing.Callable[[], T_Result],
         categorical: typing.Callable[[], T_Result],
+        text: typing.Callable[[], T_Result],
         correction: typing.Callable[[], T_Result],
     ) -> T_Result:
         if self is ScoreDataType.NUMERIC:
@@ -25,5 +27,7 @@ def visit(
             return boolean()
         if self is ScoreDataType.CATEGORICAL:
             return categorical()
+        if self is ScoreDataType.TEXT:
+            return text()
         if self is ScoreDataType.CORRECTION:
             return correction()
diff --git a/langfuse/api/resources/commons/types/score_v_1.py b/langfuse/api/resources/commons/types/score_v_1.py
index 74c3f53f9..f70e85b92 100644
--- a/langfuse/api/resources/commons/types/score_v_1.py
+++ b/langfuse/api/resources/commons/types/score_v_1.py
@@ -186,4 +186,64 @@ class Config:
         json_encoders = {dt.datetime: serialize_datetime}
 
 
-ScoreV1 = typing.Union[ScoreV1_Numeric, ScoreV1_Categorical, ScoreV1_Boolean]
+class ScoreV1_Text(pydantic_v1.BaseModel):  # TEXT member of the `ScoreV1` union declared below
+    string_value: str = pydantic_v1.Field(alias="stringValue")  # required; wire name is "stringValue"
+    id: str
+    trace_id: str = pydantic_v1.Field(alias="traceId")  # required here (not Optional)
+    name: str
+    source: ScoreSource
+    observation_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="observationId", default=None
+    )
+    timestamp: dt.datetime
+    created_at: dt.datetime = pydantic_v1.Field(alias="createdAt")
+    updated_at: dt.datetime = pydantic_v1.Field(alias="updatedAt")
+    author_user_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="authorUserId", default=None
+    )
+    comment: typing.Optional[str] = None
+    metadata: typing.Any
+    config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
+    queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
+    environment: str
+    data_type: typing.Literal["TEXT"] = pydantic_v1.Field(  # only "TEXT" validates; also the default
+        alias="dataType", default="TEXT"
+    )
+
+    def json(self, **kwargs: typing.Any) -> str:  # JSON dump keyed by alias, unset fields omitted
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,  # spread last so caller-supplied options override the two defaults
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:  # alias-keyed dict built from two dumps
+        kwargs_with_defaults_exclude_unset: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,  # first dump: only fields that were explicitly set
+            **kwargs,
+        }
+        kwargs_with_defaults_exclude_none: typing.Any = {
+            "by_alias": True,
+            "exclude_none": True,  # second dump: every field whose value is not None
+            **kwargs,
+        }
+
+        return deep_union_pydantic_dicts(  # NOTE(review): helper presumably deep-merges both dumps; confirm
+            super().dict(**kwargs_with_defaults_exclude_unset),
+            super().dict(**kwargs_with_defaults_exclude_none),
+        )
+
+    class Config:
+        frozen = True  # instances cannot be mutated after construction
+        smart_union = True
+        allow_population_by_field_name = True  # accept snake_case field names as well as aliases
+        populate_by_name = True
+        extra = pydantic_v1.Extra.allow  # undeclared fields are kept instead of rejected
+        json_encoders = {dt.datetime: serialize_datetime}
+
+
+ScoreV1 = typing.Union[
+    ScoreV1_Numeric, ScoreV1_Categorical, ScoreV1_Boolean, ScoreV1_Text
+]
diff --git a/langfuse/api/resources/commons/types/text_score.py b/langfuse/api/resources/commons/types/text_score.py
new file mode 100644
index 000000000..3fe17f06c
--- /dev/null
+++ b/langfuse/api/resources/commons/types/text_score.py
@@ -0,0 +1,48 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ....core.datetime_utils import serialize_datetime
+from ....core.pydantic_utilities import deep_union_pydantic_dicts, pydantic_v1
+from .base_score import BaseScore
+
+
+class TextScore(BaseScore):  # adds `string_value` on top of whatever BaseScore declares
+    string_value: str = pydantic_v1.Field(alias="stringValue")  # required; wire name is "stringValue"
+    """
+    The string representation of the score value. Must be between 1 and 500 characters.
+    """
+
+    def json(self, **kwargs: typing.Any) -> str:  # JSON dump keyed by alias, unset fields omitted
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,  # spread last so caller-supplied options override the two defaults
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:  # alias-keyed dict built from two dumps
+        kwargs_with_defaults_exclude_unset: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,  # first dump: only fields that were explicitly set
+            **kwargs,
+        }
+        kwargs_with_defaults_exclude_none: typing.Any = {
+            "by_alias": True,
+            "exclude_none": True,  # second dump: every field whose value is not None
+            **kwargs,
+        }
+
+        return deep_union_pydantic_dicts(  # NOTE(review): helper presumably deep-merges both dumps; confirm
+            super().dict(**kwargs_with_defaults_exclude_unset),
+            super().dict(**kwargs_with_defaults_exclude_none),
+        )
+
+    class Config:
+        frozen = True  # instances cannot be mutated after construction
+        smart_union = True
+        allow_population_by_field_name = True  # accept snake_case field names as well as aliases
+        populate_by_name = True
+        extra = pydantic_v1.Extra.allow  # undeclared fields are kept instead of rejected
+        json_encoders = {dt.datetime: serialize_datetime}
diff --git a/langfuse/api/resources/commons/types/text_score_v_1.py b/langfuse/api/resources/commons/types/text_score_v_1.py
new file mode 100644
index 000000000..4315d906d
--- /dev/null
+++ b/langfuse/api/resources/commons/types/text_score_v_1.py
@@ -0,0 +1,48 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ....core.datetime_utils import serialize_datetime
+from ....core.pydantic_utilities import deep_union_pydantic_dicts, pydantic_v1
+from .base_score_v_1 import BaseScoreV1
+
+
+class TextScoreV1(BaseScoreV1):  # adds `string_value` on top of whatever BaseScoreV1 declares
+    string_value: str = pydantic_v1.Field(alias="stringValue")  # required; wire name is "stringValue"
+    """
+    The string representation of the score value. Must be between 1 and 500 characters.
+    """
+
+    def json(self, **kwargs: typing.Any) -> str:  # JSON dump keyed by alias, unset fields omitted
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,  # spread last so caller-supplied options override the two defaults
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:  # alias-keyed dict built from two dumps
+        kwargs_with_defaults_exclude_unset: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,  # first dump: only fields that were explicitly set
+            **kwargs,
+        }
+        kwargs_with_defaults_exclude_none: typing.Any = {
+            "by_alias": True,
+            "exclude_none": True,  # second dump: every field whose value is not None
+            **kwargs,
+        }
+
+        return deep_union_pydantic_dicts(  # NOTE(review): helper presumably deep-merges both dumps; confirm
+            super().dict(**kwargs_with_defaults_exclude_unset),
+            super().dict(**kwargs_with_defaults_exclude_none),
+        )
+
+    class Config:
+        frozen = True  # instances cannot be mutated after construction
+        smart_union = True
+        allow_population_by_field_name = True  # accept snake_case field names as well as aliases
+        populate_by_name = True
+        extra = pydantic_v1.Extra.allow  # undeclared fields are kept instead of rejected
+        json_encoders = {dt.datetime: serialize_datetime}
diff --git a/langfuse/api/resources/ingestion/types/score_body.py b/langfuse/api/resources/ingestion/types/score_body.py
index 8e72a5682..46932f2e5 100644
--- a/langfuse/api/resources/ingestion/types/score_body.py
+++ b/langfuse/api/resources/ingestion/types/score_body.py
@@ -46,7 +46,7 @@ class ScoreBody(pydantic_v1.BaseModel):
 
     value: CreateScoreValue = pydantic_v1.Field()
     """
-    The value of the score. Must be passed as string for categorical scores, and numeric for boolean and numeric scores. Boolean score values must equal either 1 or 0 (true or false)
+    The value of the score. Must be passed as string for categorical and text scores, and numeric for boolean and numeric scores. Boolean score values must equal either 1 or 0 (true or false). Text score values must be between 1 and 500 characters.
     """
 
     comment: typing.Optional[str] = None
diff --git a/langfuse/api/resources/score/types/create_score_request.py b/langfuse/api/resources/score/types/create_score_request.py
index 1f79f4a64..e2c252e05 100644
--- a/langfuse/api/resources/score/types/create_score_request.py
+++ b/langfuse/api/resources/score/types/create_score_request.py
@@ -36,7 +36,7 @@ class CreateScoreRequest(pydantic_v1.BaseModel):
     name: str
     value: CreateScoreValue = pydantic_v1.Field()
     """
-    The value of the score. Must be passed as string for categorical scores, and numeric for boolean and numeric scores. Boolean score values must equal either 1 or 0 (true or false)
+    The value of the score. Must be passed as string for categorical and text scores, and numeric for boolean and numeric scores. Boolean score values must equal either 1 or 0 (true or false). Text score values must be between 1 and 500 characters.
     """
 
     comment: typing.Optional[str] = None
diff --git a/langfuse/api/resources/score_v_2/__init__.py b/langfuse/api/resources/score_v_2/__init__.py
index 4e333a693..73aba551a 100644
--- a/langfuse/api/resources/score_v_2/__init__.py
+++ b/langfuse/api/resources/score_v_2/__init__.py
@@ -7,10 +7,12 @@
     GetScoresResponseDataCategorical,
     GetScoresResponseDataCorrection,
     GetScoresResponseDataNumeric,
+    GetScoresResponseDataText,
     GetScoresResponseData_Boolean,
     GetScoresResponseData_Categorical,
     GetScoresResponseData_Correction,
     GetScoresResponseData_Numeric,
+    GetScoresResponseData_Text,
     GetScoresResponseTraceData,
 )
 
@@ -21,9 +23,11 @@
     "GetScoresResponseDataCategorical",
     "GetScoresResponseDataCorrection",
     "GetScoresResponseDataNumeric",
+    "GetScoresResponseDataText",
     "GetScoresResponseData_Boolean",
     "GetScoresResponseData_Categorical",
     "GetScoresResponseData_Correction",
     "GetScoresResponseData_Numeric",
+    "GetScoresResponseData_Text",
     "GetScoresResponseTraceData",
 ]
diff --git a/langfuse/api/resources/score_v_2/types/__init__.py b/langfuse/api/resources/score_v_2/types/__init__.py
index d08e687ef..2e68e389e 100644
--- a/langfuse/api/resources/score_v_2/types/__init__.py
+++ b/langfuse/api/resources/score_v_2/types/__init__.py
@@ -7,11 +7,13 @@
     GetScoresResponseData_Categorical,
     GetScoresResponseData_Correction,
     GetScoresResponseData_Numeric,
+    GetScoresResponseData_Text,
 )
 from .get_scores_response_data_boolean import GetScoresResponseDataBoolean
 from .get_scores_response_data_categorical import GetScoresResponseDataCategorical
 from .get_scores_response_data_correction import GetScoresResponseDataCorrection
 from .get_scores_response_data_numeric import GetScoresResponseDataNumeric
+from .get_scores_response_data_text import GetScoresResponseDataText
 from .get_scores_response_trace_data import GetScoresResponseTraceData
 
 __all__ = [
@@ -21,9 +23,11 @@
     "GetScoresResponseDataCategorical",
     "GetScoresResponseDataCorrection",
     "GetScoresResponseDataNumeric",
+    "GetScoresResponseDataText",
     "GetScoresResponseData_Boolean",
     "GetScoresResponseData_Categorical",
     "GetScoresResponseData_Correction",
     "GetScoresResponseData_Numeric",
+    "GetScoresResponseData_Text",
     "GetScoresResponseTraceData",
 ]
diff --git a/langfuse/api/resources/score_v_2/types/get_scores_response_data.py b/langfuse/api/resources/score_v_2/types/get_scores_response_data.py
index 4f73fbcae..045e2085c 100644
--- a/langfuse/api/resources/score_v_2/types/get_scores_response_data.py
+++ b/langfuse/api/resources/score_v_2/types/get_scores_response_data.py
@@ -208,6 +208,71 @@ class Config:
         json_encoders = {dt.datetime: serialize_datetime}
 
 
+class GetScoresResponseData_Text(pydantic_v1.BaseModel):
+    trace: typing.Optional[GetScoresResponseTraceData] = None
+    string_value: str = pydantic_v1.Field(alias="stringValue")
+    id: str
+    trace_id: typing.Optional[str] = pydantic_v1.Field(alias="traceId", default=None)
+    session_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="sessionId", default=None
+    )
+    observation_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="observationId", default=None
+    )
+    dataset_run_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="datasetRunId", default=None
+    )
+    name: str
+    source: ScoreSource
+    timestamp: dt.datetime
+    created_at: dt.datetime = pydantic_v1.Field(alias="createdAt")
+    updated_at: dt.datetime = pydantic_v1.Field(alias="updatedAt")
+    author_user_id: typing.Optional[str] = pydantic_v1.Field(
+        alias="authorUserId", default=None
+    )
+    comment: typing.Optional[str] = None
+    metadata: typing.Any
+    config_id: typing.Optional[str] = pydantic_v1.Field(alias="configId", default=None)
+    queue_id: typing.Optional[str] = pydantic_v1.Field(alias="queueId", default=None)
+    environment: str
+    data_type: typing.Literal["TEXT"] = pydantic_v1.Field(
+        alias="dataType", default="TEXT"
+    )
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults_exclude_unset: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        kwargs_with_defaults_exclude_none: typing.Any = {
+            "by_alias": True,
+            "exclude_none": True,
+            **kwargs,
+        }
+
+        return deep_union_pydantic_dicts(
+            super().dict(**kwargs_with_defaults_exclude_unset),
+            super().dict(**kwargs_with_defaults_exclude_none),
+        )
+
+    class Config:
+        frozen = True
+        smart_union = True
+        allow_population_by_field_name = True
+        populate_by_name = True
+        extra = pydantic_v1.Extra.allow
+        json_encoders = {dt.datetime: serialize_datetime}
+
+
 class GetScoresResponseData_Correction(pydantic_v1.BaseModel):
     trace: typing.Optional[GetScoresResponseTraceData] = None
     value: float
@@ -278,5 +343,6 @@ class Config:
     GetScoresResponseData_Numeric,
     GetScoresResponseData_Categorical,
     GetScoresResponseData_Boolean,
+    GetScoresResponseData_Text,
     GetScoresResponseData_Correction,
 ]
diff --git a/langfuse/api/resources/score_v_2/types/get_scores_response_data_text.py b/langfuse/api/resources/score_v_2/types/get_scores_response_data_text.py
new file mode 100644
index 000000000..d51385788
--- /dev/null
+++ b/langfuse/api/resources/score_v_2/types/get_scores_response_data_text.py
@@ -0,0 +1,46 @@
+# This file was auto-generated by Fern from our API Definition.
+
+import datetime as dt
+import typing
+
+from ....core.datetime_utils import serialize_datetime
+from ....core.pydantic_utilities import deep_union_pydantic_dicts, pydantic_v1
+from ...commons.types.text_score import TextScore
+from .get_scores_response_trace_data import GetScoresResponseTraceData
+
+
+class GetScoresResponseDataText(TextScore):
+    trace: typing.Optional[GetScoresResponseTraceData] = None
+
+    def json(self, **kwargs: typing.Any) -> str:
+        kwargs_with_defaults: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        return super().json(**kwargs_with_defaults)
+
+    def dict(self, **kwargs: typing.Any) -> typing.Dict[str, typing.Any]:
+        kwargs_with_defaults_exclude_unset: typing.Any = {
+            "by_alias": True,
+            "exclude_unset": True,
+            **kwargs,
+        }
+        kwargs_with_defaults_exclude_none: typing.Any = {
+            "by_alias": True,
+            "exclude_none": True,
+            **kwargs,
+        }
+
+        return deep_union_pydantic_dicts(
+            super().dict(**kwargs_with_defaults_exclude_unset),
+            super().dict(**kwargs_with_defaults_exclude_none),
+        )
+
+    class Config:
+        frozen = True
+        smart_union = True
+        allow_population_by_field_name = True
+        populate_by_name = True
+        extra = pydantic_v1.Extra.allow
+        json_encoders = {dt.datetime: serialize_datetime}
diff --git a/langfuse/experiment.py b/langfuse/experiment.py
index 00c54fe74..db1d70dc3 100644
--- a/langfuse/experiment.py
+++ b/langfuse/experiment.py
@@ -19,7 +19,7 @@
     Union,
 )
 
-from langfuse.api import ScoreDataType
+from langfuse.types import ExperimentScoreType
 
 if TYPE_CHECKING:
     from langfuse._client.datasets import DatasetItemClient
@@ -188,7 +188,7 @@ def __init__(
         value: Union[int, float, str, bool],
         comment: Optional[str] = None,
         metadata: Optional[Dict[str, Any]] = None,
-        data_type: Optional[ScoreDataType] = None,
+        data_type: Optional[ExperimentScoreType] = None,
         config_id: Optional[str] = None,
     ):
         """Initialize an Evaluation with the provided data.
diff --git a/langfuse/types.py b/langfuse/types.py
index 32ebb32d4..a1fe75273 100644
--- a/langfuse/types.py
+++ b/langfuse/types.py
@@ -41,7 +41,9 @@ def my_evaluator(*, output: str, **kwargs) -> Evaluation:
 
 SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]
 
-ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
+ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN", "TEXT"]
+
+ExperimentScoreType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
 
 
 class TraceMetadata(TypedDict):
@@ -106,6 +108,7 @@ class TraceContext(TypedDict):
 __all__ = [
     "SpanLevel",
     "ScoreDataType",
+    "ExperimentScoreType",
     "TraceMetadata",
     "ObservationParams",
     "MaskFunction",
diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py
index 2888c0554..63bdaeecc 100644
--- a/tests/test_core_sdk.py
+++ b/tests/test_core_sdk.py
@@ -9,6 +9,11 @@
 from langfuse import Langfuse
 from langfuse._client.resource_manager import LangfuseResourceManager
 from langfuse._utils import _get_timestamp
+from langfuse.api import (
+    CreateScoreConfigRequest,
+    ScoreConfigDataType,
+    UpdateScoreConfigRequest,
+)
 from tests.api_wrapper import LangfuseAPI
 from tests.utils import (
     create_uuid,
@@ -257,6 +262,94 @@ def test_create_categorical_score():
     assert trace["scores"][0]["stringValue"] == "high score"
 
 
+def test_create_text_score():
+    langfuse = Langfuse()
+    api_wrapper = LangfuseAPI()
+
+    # Create a span and set trace properties
+    with langfuse.start_as_current_span(name="test-span") as span:
+        span.update_trace(
+            name="this-is-so-great-new",
+            user_id="test",
+            metadata="test",
+        )
+        # Get trace ID for later use
+        trace_id = span.trace_id
+
+    # Ensure data is sent
+    langfuse.flush()
+    sleep(2)
+
+    # Create a text score
+    score_id = create_uuid()
+    langfuse.create_score(
+        score_id=score_id,
+        trace_id=trace_id,
+        name="this-is-a-score",
+        value="this is a text score",
+        data_type="TEXT",
+    )
+
+    # Create a generation in the same trace
+    generation = langfuse.start_generation(
+        name="yet another child", metadata="test", trace_context={"trace_id": trace_id}
+    )
+    generation.end()
+
+    # Ensure data is sent
+    langfuse.flush()
+    sleep(2)
+
+    # Retrieve and verify
+    trace = api_wrapper.get_trace(trace_id)
+
+    assert trace["scores"][0]["id"] == score_id
+    assert trace["scores"][0]["dataType"] == "TEXT"
+    assert trace["scores"][0]["value"] is None
+    assert trace["scores"][0]["stringValue"] == "this is a text score"
+
+
+def test_create_and_list_text_score_config():
+    api = get_api()
+    score_config_name = f"text-score-config-{create_uuid()}"
+
+    score_config = api.score_configs.create(
+        request=CreateScoreConfigRequest(
+            name=score_config_name,
+            data_type=ScoreConfigDataType.TEXT,
+        )
+    )
+
+    try:
+        matching_score_config = None
+        score_configs_response = api.score_configs.get(page=1, limit=100)
+
+        for page in range(1, score_configs_response.meta.total_pages + 1):
+            if page > 1:
+                score_configs_response = api.score_configs.get(page=page, limit=100)
+
+            matching_score_config = next(
+                (
+                    config
+                    for config in score_configs_response.data
+                    if config.id == score_config.id
+                ),
+                None,
+            )
+
+            if matching_score_config is not None:
+                break
+
+        assert matching_score_config is not None
+        assert matching_score_config.name == score_config_name
+        assert matching_score_config.data_type == ScoreConfigDataType.TEXT
+    finally:
+        api.score_configs.update(
+            score_config.id,
+            request=UpdateScoreConfigRequest(is_archived=True),
+        )
+
+
 def test_create_score_with_custom_timestamp():
     langfuse = Langfuse()
     api_wrapper = LangfuseAPI()

From c1b1b38b938d9743bba5168138febbae62709192 Mon Sep 17 00:00:00 2001
From: Tobias Wochinger <tobias.wochinger@clickhouse.com>
Date: Wed, 20 May 2026 10:42:07 +0200
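Subject: [PATCH 2/5] ci: update pnpm action setup

Pin pnpm/action-setup to a full commit SHA (annotated as v6.0.7) instead
of the floating v3 tag, and bump the installed pnpm version from 9.5.0
to 11.1.3.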
---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 74013b0b6..00054d71e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -75,9 +75,9 @@ jobs:
     name: Test on Python version ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v3
-      - uses: pnpm/action-setup@v3
+      - uses: pnpm/action-setup@739bfe42ca9233c5e6aca07c1a25a9d34aca49b0 # v6.0.7
         with:
-          version: 9.5.0
+          version: 11.1.3
 
       - name: Clone langfuse server
         run: |

From f322849a8592e6b1e878bd3475409916591ac86c Mon Sep 17 00:00:00 2001
From: Tobias Wochinger <tobias.wochinger@clickhouse.com>
Date: Wed, 20 May 2026 10:57:21 +0200
Subject: [PATCH 3/5] test: fix v3 stable CI failures

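Backport the dataset archived-item assertions. Look up created scores by
a unique per-test name instead of assuming they are the first score on
the trace, shorten the generated score config name, and switch the audio
test to the gpt-4o-mini-audio-preview model.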
---
 tests/test_core_sdk.py | 50 ++++++++++++++++++++++++------------------
 tests/test_datasets.py | 20 +++++++++++++++--
 tests/test_openai.py   |  5 +++--
 3 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py
index 63bdaeecc..ecf0a0967 100644
--- a/tests/test_core_sdk.py
+++ b/tests/test_core_sdk.py
@@ -143,10 +143,11 @@ def test_create_numeric_score():
 
     # Create a numeric score
     score_id = create_uuid()
+    score_name = f"score-{create_uuid()[:8]}"
     langfuse.create_score(
         score_id=score_id,
         trace_id=trace_id,
-        name="this-is-a-score",
+        name=score_name,
         value=1,
     )
 
@@ -163,10 +164,11 @@ def test_create_numeric_score():
     # Retrieve and verify
     trace = api_wrapper.get_trace(trace_id)
 
-    assert trace["scores"][0]["id"] == score_id
-    assert trace["scores"][0]["value"] == 1
-    assert trace["scores"][0]["dataType"] == "NUMERIC"
-    assert trace["scores"][0]["stringValue"] is None
+    created_score = next((s for s in trace["scores"] if s["name"] == score_name), None)
+    assert created_score is not None, "Score not found in trace"
+    assert created_score["value"] == 1
+    assert created_score["dataType"] == "NUMERIC"
+    assert created_score["stringValue"] is None
 
 
 def test_create_boolean_score():
@@ -189,10 +191,11 @@ def test_create_boolean_score():
 
     # Create a boolean score
     score_id = create_uuid()
+    score_name = f"score-{create_uuid()[:8]}"
     langfuse.create_score(
         score_id=score_id,
         trace_id=trace_id,
-        name="this-is-a-score",
+        name=score_name,
         value=1,
         data_type="BOOLEAN",
     )
@@ -210,10 +213,11 @@ def test_create_boolean_score():
     # Retrieve and verify
     trace = api_wrapper.get_trace(trace_id)
 
-    assert trace["scores"][0]["id"] == score_id
-    assert trace["scores"][0]["dataType"] == "BOOLEAN"
-    assert trace["scores"][0]["value"] == 1
-    assert trace["scores"][0]["stringValue"] == "True"
+    created_score = next((s for s in trace["scores"] if s["name"] == score_name), None)
+    assert created_score is not None, "Score not found in trace"
+    assert created_score["dataType"] == "BOOLEAN"
+    assert created_score["value"] == 1
+    assert created_score["stringValue"] == "True"
 
 
 def test_create_categorical_score():
@@ -236,10 +240,11 @@ def test_create_categorical_score():
 
     # Create a categorical score
     score_id = create_uuid()
+    score_name = f"score-{create_uuid()[:8]}"
     langfuse.create_score(
         score_id=score_id,
         trace_id=trace_id,
-        name="this-is-a-score",
+        name=score_name,
         value="high score",
     )
 
@@ -256,10 +261,11 @@ def test_create_categorical_score():
     # Retrieve and verify
     trace = api_wrapper.get_trace(trace_id)
 
-    assert trace["scores"][0]["id"] == score_id
-    assert trace["scores"][0]["dataType"] == "CATEGORICAL"
-    assert trace["scores"][0]["value"] == 0
-    assert trace["scores"][0]["stringValue"] == "high score"
+    created_score = next((s for s in trace["scores"] if s["name"] == score_name), None)
+    assert created_score is not None, "Score not found in trace"
+    assert created_score["dataType"] == "CATEGORICAL"
+    assert created_score["value"] == 0
+    assert created_score["stringValue"] == "high score"
 
 
 def test_create_text_score():
@@ -282,10 +288,11 @@ def test_create_text_score():
 
     # Create a text score
     score_id = create_uuid()
+    score_name = f"score-{create_uuid()[:8]}"
     langfuse.create_score(
         score_id=score_id,
         trace_id=trace_id,
-        name="this-is-a-score",
+        name=score_name,
         value="this is a text score",
         data_type="TEXT",
     )
@@ -303,15 +310,16 @@ def test_create_text_score():
     # Retrieve and verify
     trace = api_wrapper.get_trace(trace_id)
 
-    assert trace["scores"][0]["id"] == score_id
-    assert trace["scores"][0]["dataType"] == "TEXT"
-    assert trace["scores"][0]["value"] is None
-    assert trace["scores"][0]["stringValue"] == "this is a text score"
+    created_score = next((s for s in trace["scores"] if s["name"] == score_name), None)
+    assert created_score is not None, "Score not found in trace"
+    assert created_score["dataType"] == "TEXT"
+    assert created_score["value"] is None
+    assert created_score["stringValue"] == "this is a text score"
 
 
 def test_create_and_list_text_score_config():
     api = get_api()
-    score_config_name = f"text-score-config-{create_uuid()}"
+    score_config_name = f"text-score-config-{create_uuid()[:8]}"
 
     score_config = api.score_configs.create(
         request=CreateScoreConfigRequest(
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index f20c76f24..a28233c1a 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -124,7 +124,6 @@ def test_upsert_and_get_dataset_item():
         input=new_input,
         id=item.id,
         expected_output=new_input,
-        status=DatasetStatus.ARCHIVED,
     )
 
     # Refresh dataset and find updated item
@@ -139,7 +138,24 @@ def test_upsert_and_get_dataset_item():
     assert get_new_item.input == new_input
     assert get_new_item.id == item.id
     assert get_new_item.expected_output == new_input
-    assert get_new_item.status == DatasetStatus.ARCHIVED
+    assert get_new_item.status == DatasetStatus.ACTIVE
+
+    langfuse.create_dataset_item(
+        dataset_name=name,
+        input=new_input,
+        id=item.id,
+        expected_output=new_input,
+        status=DatasetStatus.ARCHIVED,
+    )
+
+    dataset = langfuse.get_dataset(name)
+    assert all(dataset_item.id != item.id for dataset_item in dataset.items)
+
+    archived_item = langfuse.api.dataset_items.get(item.id)
+    assert archived_item.input == new_input
+    assert archived_item.id == item.id
+    assert archived_item.expected_output == new_input
+    assert archived_item.status == DatasetStatus.ARCHIVED
 
 
 def test_dataset_run_with_metadata_and_description():
diff --git a/tests/test_openai.py b/tests/test_openai.py
index f24bf93cf..ab7332073 100644
--- a/tests/test_openai.py
+++ b/tests/test_openai.py
@@ -1220,13 +1220,14 @@ def test_audio_input_and_output(openai):
     client = openai.OpenAI()
     openai.langfuse_debug = True
     generation_name = "test_audio_input_and_output" + create_uuid()[:8]
+    model = "gpt-4o-mini-audio-preview"
 
     content_path = "static/joke_prompt.wav"
     base64_string = encode_file_to_base64(content_path)
 
     client.chat.completions.create(
         name=generation_name,
-        model="gpt-4o-audio-preview",
+        model=model,
         modalities=["text", "audio"],
         audio={"voice": "alloy", "format": "wav"},
         messages=[
@@ -1260,7 +1261,7 @@ def test_audio_input_and_output(openai):
         in generation.data[0].input[0]["content"][1]["input_audio"]["data"]
     )
     assert generation.data[0].type == "GENERATION"
-    assert "gpt-4o-audio-preview" in generation.data[0].model
+    assert model in generation.data[0].model
     assert generation.data[0].start_time is not None
     assert generation.data[0].end_time is not None
     assert generation.data[0].start_time < generation.data[0].end_time

From 3d1b0ae212142cc3bc1b8065da9d8c846863181c Mon Sep 17 00:00:00 2001
From: Tobias Wochinger <tobias.wochinger@clickhouse.com>
Date: Wed, 20 May 2026 11:07:56 +0200
Subject: [PATCH 4/5] test: address v3 review feedback

Sort TextPrompt ahead of TextScore and TextScoreV1 in the generated
exports and switch the audio test model to gpt-audio-mini-2025-12-15.
---
 langfuse/api/__init__.py           | 4 ++--
 langfuse/api/resources/__init__.py | 2 +-
 tests/test_openai.py               | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/langfuse/api/__init__.py b/langfuse/api/__init__.py
index d366774d5..d3eaa2218 100644
--- a/langfuse/api/__init__.py
+++ b/langfuse/api/__init__.py
@@ -215,9 +215,9 @@
     Session,
     SessionWithTraces,
     Sort,
+    TextPrompt,
     TextScore,
     TextScoreV1,
-    TextPrompt,
     Trace,
     TraceBody,
     TraceEvent,
@@ -482,9 +482,9 @@
     "Session",
     "SessionWithTraces",
     "Sort",
+    "TextPrompt",
     "TextScore",
     "TextScoreV1",
-    "TextPrompt",
     "Trace",
     "TraceBody",
     "TraceEvent",
diff --git a/langfuse/api/resources/__init__.py b/langfuse/api/resources/__init__.py
index b54d98098..1bc2d0222 100644
--- a/langfuse/api/resources/__init__.py
+++ b/langfuse/api/resources/__init__.py
@@ -502,9 +502,9 @@
     "Session",
     "SessionWithTraces",
     "Sort",
+    "TextPrompt",
     "TextScore",
     "TextScoreV1",
-    "TextPrompt",
     "Trace",
     "TraceBody",
     "TraceEvent",
diff --git a/tests/test_openai.py b/tests/test_openai.py
index ab7332073..d8068c5ee 100644
--- a/tests/test_openai.py
+++ b/tests/test_openai.py
@@ -1220,7 +1220,7 @@ def test_audio_input_and_output(openai):
     client = openai.OpenAI()
     openai.langfuse_debug = True
     generation_name = "test_audio_input_and_output" + create_uuid()[:8]
-    model = "gpt-4o-mini-audio-preview"
+    model = "gpt-audio-mini-2025-12-15"
 
     content_path = "static/joke_prompt.wav"
     base64_string = encode_file_to_base64(content_path)

From f9ba977dfb1bc622c40b5e1299780d72e3467afc Mon Sep 17 00:00:00 2001
From: Tobias Wochinger <tobias.wochinger@clickhouse.com>
Date: Wed, 20 May 2026 11:27:24 +0200
Subject: [PATCH 5/5] test: tolerate omitted text score value

Allow the TEXT score response to omit null numeric values while asserting stringValue.
---
 tests/test_core_sdk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_core_sdk.py b/tests/test_core_sdk.py
index ecf0a0967..30b052928 100644
--- a/tests/test_core_sdk.py
+++ b/tests/test_core_sdk.py
@@ -313,7 +313,7 @@ def test_create_text_score():
     created_score = next((s for s in trace["scores"] if s["name"] == score_name), None)
     assert created_score is not None, "Score not found in trace"
     assert created_score["dataType"] == "TEXT"
-    assert created_score["value"] is None
+    assert created_score.get("value") is None
     assert created_score["stringValue"] == "this is a text score"