gooddata · romrak · Jun 8, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -162,7 +162,44 @@ A dataset is a folder of `.json` files, one per question:
 ```
 
 Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
-`search_tool`, `general_question`, `guardrail`.
+`search_tool`, `general_question`, `guardrail`, `dashboard_summary`.
+
+### `dashboard_summary` items
+
+Summary items call the dedicated summary endpoint
+(`POST /api/v1/ai/workspaces/{ws}/summary`) instead of the chat endpoint, so
+they carry an extra `summary_input` block, and the `expected_output` is a
+**rubric** rather than an exact answer (summaries are free text):
+
+```json
+{
+  "id": "summary-001",
+  "dataset_name": "summary_pilot",
+  "test_kind": "dashboard_summary",
+  "question": "Summarize the Sales Overview dashboard.",
+  "summary_input": {
+    "dashboard_id": "sales_overview"
+  },
+  "expected_output": {
+    "must_include":     ["States the overall revenue trend", "Identifies the top segment"],
+    "must_not_include": ["Numbers or segments not present in the visualizations"],
+    "rubric":           ["Reads as a coherent business summary"]
+  }
+}
+```
+
+`summary_input` requires only `dashboard_id` (the endpoint summarizes the whole
+dashboard). Optional fields narrow the scope: `visualizations` (list of ids),
+`filter_context` (AFM filters), `tab_id`, and `format_hint`.
+
+The `expected_output` rubric:
+
+- `must_include` — facts a good summary must contain; **all** must pass for the item to pass.
+- `must_not_include` — hallucination/accuracy guards; **any** violation fails the item.
+- `rubric` — soft quality dimensions; they affect `quality_score` but do not gate pass/fail.
+
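+Each criterion is scored independently by the LLM judge, and `quality_score`
+is the fraction of satisfied criteria across all three lists (a
+`must_not_include` criterion counts as satisfied when it is not violated).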
 
 ## Supported test kinds
 
@@ -174,6 +211,7 @@ Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
 | `search_tool` | `search_objects` tool call (correct function called = pass; correct arguments = quality score) | — |
 | `general_question` | Text answer judged by LLM | `[llm-judge]` |
 | `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` |
+| `dashboard_summary` | Dashboard summary (via `/summary` endpoint) scored against a rubric by LLM | `[llm-judge]` |
 
 ## Optional extras
 

@@ -16,14 +16,38 @@
 from gooddata_eval.core.connection import ConnectionError_, resolve_connection
 from gooddata_eval.core.dataset.local import load_local_dataset
 from gooddata_eval.core.langfuse.sink import LangfuseSink
-from gooddata_eval.core.models import DatasetItem
+from gooddata_eval.core.models import ChatResult, DatasetItem
 from gooddata_eval.core.reporting.console import render_comparison, render_console
 from gooddata_eval.core.reporting.json_report import write_multi_model_report
 from gooddata_eval.core.runner import ItemReport, run_items
+from gooddata_eval.core.summary.http_client import SummaryClient
 from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController
 
 _EXIT_OK = 0
 _EXIT_OPERATIONAL_ERROR = 2
+_SUMMARY_TEST_KIND = "dashboard_summary"
+
+
+class _RoutingBackend:
+    """Dispatch each item to the right backend by test_kind.
+
+    `dashboard_summary` items go to the dedicated summary endpoint; everything
+    else uses the conversational chat endpoint.
+    """
+
+    def __init__(self, chat: ChatClient, summary: SummaryClient):
+        self._chat = chat
+        self._summary = summary
+
+    def ask(self, item: DatasetItem) -> ChatResult:
+        if item.test_kind == _SUMMARY_TEST_KIND:
+            return self._summary.ask(item)
+        return self._chat.ask(item)
+
+    def close(self) -> None:
+        for backend in (self._chat, self._summary):
+            if hasattr(backend, "close"):
+                backend.close()
 
 
 def _build_parser() -> argparse.ArgumentParser:
@@ -256,7 +280,10 @@ def on_langfuse_item_done(
                 ) -> None:
                     _sink.log_item(report, dataset_item_id=report.id)
 
-            backend = ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id)
+            backend = _RoutingBackend(
+                ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+                SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+            )
             try:
                 report = run_items(
                     items,

@@ -19,7 +19,7 @@
 
 import httpx
 
-from gooddata_eval.core.models import ChatResult
+from gooddata_eval.core.models import ChatResult, DatasetItem
 
 SSE_DATA_PREFIX = "data: "
 
@@ -169,11 +169,11 @@ def _send_message(self, conversation_id: str, question: str) -> ChatResult:
             resp.raise_for_status()
             return parse_sse_lines(resp.iter_lines())
 
-    def ask(self, question: str) -> ChatResult:
+    def ask(self, item: DatasetItem) -> ChatResult:
         """Run one single-turn conversation: create, send, parse, clean up."""
         conversation_id = self._create_conversation()
         try:
-            return self._send_message(conversation_id, question)
+            return self._send_message(conversation_id, item.question)
         finally:
             self._delete_conversation(conversation_id)
 

@@ -17,7 +17,7 @@
 
 import httpx
 
-from gooddata_eval.core.models import DatasetItem
+from gooddata_eval.core.models import DatasetItem, SummaryInput
 
 _DEFAULT_HOST = "https://cloud.langfuse.com"
 _PAGE_SIZE = 100
@@ -47,6 +47,24 @@ def _question_from_input(raw_input: Any) -> str:
     raise ValueError(f"Unsupported Langfuse item input shape: {raw_input!r}")
 
 
+def _summary_input_from_raw(raw: dict, expected_output: Any) -> SummaryInput | None:
+    """Locate a dashboard_summary item's `summary_input`.
+
+    Langfuse items have no dedicated field for it, so accept it (in priority
+    order) from the item input object, the item metadata, or the expectedOutput.
+    """
+    candidate: Any = None
+    raw_input = raw.get("input")
+    metadata = raw.get("metadata")
+    if isinstance(raw_input, dict) and isinstance(raw_input.get("summary_input"), dict):
+        candidate = raw_input["summary_input"]
+    elif isinstance(metadata, dict) and isinstance(metadata.get("summary_input"), dict):
+        candidate = metadata["summary_input"]
+    elif isinstance(expected_output, dict) and isinstance(expected_output.get("summary_input"), dict):
+        candidate = expected_output["summary_input"]
+    return SummaryInput.model_validate(candidate) if candidate is not None else None
+
+
 def _item_from_raw(raw: dict, *, dataset_name: str, test_kind: str) -> DatasetItem:
     """Map a Langfuse REST API dataset-item dict to a DatasetItem."""
     # REST API returns camelCase: expectedOutput, not expected_output
@@ -60,6 +78,7 @@ def _item_from_raw(raw: dict, *, dataset_name: str, test_kind: str) -> DatasetIt
         test_kind=resolved_kind,
         question=_question_from_input(raw.get("input")),
         expected_output=expected_output,
+        summary_input=_summary_input_from_raw(raw, expected_output),
     )
 
 

@@ -20,15 +20,18 @@
     )
 }
 
-# LLM-judge evaluators (general_question, guardrail) require the [llm-judge] extra.
-# Their modules are imported lazily on first use so the CLI starts without openai.
+# LLM-judge evaluators (general_question, guardrail, dashboard_summary) require the
+# [llm-judge] extra. Their modules are imported lazily on first use so the CLI
+# starts without openai.
 _LAZY_EVALUATOR_MODULES: dict[str, str] = {
     "general_question": "gooddata_eval.core.evaluators.general_question",
     "guardrail": "gooddata_eval.core.evaluators.guardrail",
+    "dashboard_summary": "gooddata_eval.core.evaluators.summary",
 }
 _LAZY_EVALUATOR_CLASSES: dict[str, str] = {
     "general_question": "GeneralQuestionEvaluator",
     "guardrail": "GuardrailEvaluator",
+    "dashboard_summary": "DashboardSummaryEvaluator",
 }
 
 

@@ -0,0 +1,96 @@
+# (C) 2026 GoodData Corporation
+"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.
+
+Summaries are free text, so we do not match strings. Instead, `expected_output`
+is a rubric of checkable criteria:
+
+    {
+      "must_include":     ["...facts a good summary must contain..."],
+      "must_not_include": ["...things a good summary must avoid (hallucinations)..."],
+      "rubric":           ["...soft quality dimensions..."]
+    }
+
+Each criterion is scored independently by the judge (True/False), so the
+runner's `quality_score` becomes the fraction of satisfied criteria. The item
+*passes* only when every `must_include` is satisfied and no `must_not_include`
+is violated; `rubric` items contribute to quality but do not gate pass/fail.
+
+As a fallback, an `expected_output` that is not a dict (or a dict with none of
+the keys above populated) is treated as a single gating `must_include`
+criterion (same pass/fail semantics as `general_question`).
+"""
+
+from typing import Any
+
+from gooddata_eval.core.evaluators._llm_judge import LLMJudge
+from gooddata_eval.core.evaluators._text_utils import extract_text
+from gooddata_eval.core.evaluators.base import ItemEvaluation
+from gooddata_eval.core.models import ChatResult, DatasetItem
+
+_POSITIVE_STEPS = [
+    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
+    "Read the ACTUAL OUTPUT (the generated summary).",
+    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
+    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
+]
+
+# For must_not_include we ask the judge a plain presence question and invert the
+# result in code. Scoring "does the summary AVOID X?" via a field labelled
+# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
+# desired and flips the verdict. Detecting presence (no negation, no
+# contradictory label) is far more robust.
+_VIOLATION_STEPS = [
+    "Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
+    "Read the ACTUAL OUTPUT (the generated summary).",
+    "Score 1 if the actual output clearly exhibits the described characteristic.",
+    "Score 0 if it does not exhibit it.",
+]
+
+
+class DashboardSummaryEvaluator:
+    test_kind = "dashboard_summary"
+
+    def __init__(self):
+        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
+        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)
+
+    @staticmethod
+    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
+        if isinstance(expected_output, dict):
+            must_include = [str(c) for c in expected_output.get("must_include", [])]
+            must_not_include = [str(c) for c in expected_output.get("must_not_include", [])]
+            rubric = [str(c) for c in expected_output.get("rubric", [])]
+            if must_include or must_not_include or rubric:
+                return must_include, must_not_include, rubric
+        # Fallback: treat the whole expected_output as a single gating criterion
+        # (same pass/fail semantics as general_question).
+        return [str(expected_output)], [], []
+
+    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
+        actual = extract_text(chat_result)
+        must_include, must_not_include, rubric = self._criteria(item.expected_output)
+
+        detail: dict[str, Any] = {"actual_output": actual}
+        passed = True
+
+        for i, criterion in enumerate(must_include):
+            ok, reason = self._positive_judge.score(item.question, criterion, actual)
+            detail[f"include_{i}"] = ok
+            detail[f"include_{i}_reason"] = reason
+            passed = passed and ok
+
+        for i, criterion in enumerate(must_not_include):
+            violated, reason = self._violation_judge.score(item.question, criterion, actual)
+            ok = not violated  # True == characteristic absent == correctly avoided
+            detail[f"exclude_{i}"] = ok
+            detail[f"exclude_{i}_reason"] = reason
+            passed = passed and ok
+
+        for i, criterion in enumerate(rubric):
+            ok, reason = self._positive_judge.score(item.question, criterion, actual)
+            detail[f"rubric_{i}"] = ok
+            detail[f"rubric_{i}_reason"] = reason
+
+        bool_checks = [v for v in detail.values() if isinstance(v, bool)]
+        quality = sum(1 for v in bool_checks if v) / len(bool_checks) if bool_checks else 0.0
+
+        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
@@ -89,6 +89,10 @@ def _event(event_type: str, body: dict[str, Any]) -> dict[str, Any]:
                     "id": trace_id,
                     "timestamp": now,
                     "name": f"gd-eval: {report.question[:80]}",
+                    # Expose the model on a first-class trace field so Langfuse
+                    # dashboards can filter / break down by it ("Version"); trace
+                    # metadata is not available as a breakdown dimension.
+                    "version": self._model_id or None,
                     "input": {"question": report.question},
                     "output": report.best_detail,
                     "metadata": {

@@ -85,6 +85,23 @@ class ChatResult(BaseModel):
     tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")
 
 
+class SummaryInput(BaseModel):
+    """Structured input for the `dashboard_summary` test kind.
+
+    Maps onto the dedicated summary endpoint's request body
+    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
+    dataset; the SummaryClient maps it to the endpoint's camelCase fields.
+
+    Only `dashboard_id` is required; the remaining fields are optional and
+    default to None.
+    """
+
+    # Keys not declared below are ignored rather than rejected.
+    model_config = ConfigDict(extra="ignore")
+
+    # Dashboard to summarize (the only required field).
+    dashboard_id: str
+    # Optional list of visualization ids to narrow the scope to.
+    visualizations: list[str] | None = None
+    # Optional AFM filters.
+    filter_context: list[dict] | None = None
+    # Optional tab id to narrow the scope to.
+    tab_id: str | None = None
+    # Optional format hint.
+    format_hint: str | None = None
+
+
 class DatasetItem(BaseModel):
     """Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""
 
@@ -95,3 +112,5 @@ class DatasetItem(BaseModel):
     test_kind: str
     question: str
     expected_output: Any
+    # Only used by the `dashboard_summary` test kind; ignored by all others.
+    summary_input: SummaryInput | None = None
@@ -32,13 +32,11 @@ def render_console(report: EvalReport, *, console: Console | None = None) -> str
         elif item.pass_at_k:
             result, notes = "PASS", ""
         else:
-            d = item.best_detail
-            failing = [
-                k
-                for k in ("metrics_correct", "dimensions_correct", "filters_correct", "viz_type_hard")
-                if d.get(k) is False
-            ]
-            notes = "failed: " + ", ".join(failing) if failing else "no visualization created"
+            # Evaluator-agnostic: report whichever boolean checks came back False
+            # (visualization uses metrics_correct/…; dashboard_summary uses
+            # include_*/exclude_*/rubric_*). Falls back to a generic message.
+            failing = [k for k, v in item.best_detail.items() if v is False]
+            notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
             result = "FAIL"
         latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
         avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"

@@ -12,7 +12,9 @@
 
 
 class ChatBackend(Protocol):
-    def ask(self, question: str) -> ChatResult: ...
+    # Receives the whole item so backends can use per-item context beyond the
+    # question text (e.g. dashboard_summary needs item.summary_input).
+    def ask(self, item: DatasetItem) -> ChatResult: ...
 
 
 @dataclass
@@ -106,7 +108,7 @@ def _run_one_item(
     try:
         for run_index in range(1, runs + 1):
             t0 = time.perf_counter()
-            chat_result = backend.ask(item.question)
+            chat_result = backend.ask(item)
             evaluation = evaluator.evaluate(item, chat_result)
             latency = time.perf_counter() - t0
             report.runs += 1

@@ -0,0 +1 @@
+# (C) 2026 GoodData Corporation