Skip to content
40 changes: 39 additions & 1 deletion packages/gooddata-eval/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,44 @@ A dataset is a folder of `.json` files, one per question:
```

Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
`search_tool`, `general_question`, `guardrail`.
`search_tool`, `general_question`, `guardrail`, `dashboard_summary`.

### `dashboard_summary` items

Summary items call the dedicated summary endpoint
(`POST /api/v1/ai/workspaces/{ws}/summary`) instead of the chat endpoint, so
each item carries an extra `summary_input` block. Because summaries are free
text, `expected_output` is a **rubric** rather than an exact answer:

```json
{
"id": "summary-001",
"dataset_name": "summary_pilot",
"test_kind": "dashboard_summary",
"question": "Summarize the Sales Overview dashboard.",
"summary_input": {
"dashboard_id": "sales_overview"
},
"expected_output": {
"must_include": ["States the overall revenue trend", "Identifies the top segment"],
"must_not_include": ["Numbers or segments not present in the visualizations"],
"rubric": ["Reads as a coherent business summary"]
}
}
```

Only `dashboard_id` is required in `summary_input`; with nothing else set, the
endpoint summarizes the whole dashboard. Optional fields narrow the scope:
`visualizations` (list of ids), `filter_context` (AFM filters), `tab_id`, and
`format_hint`.

The `expected_output` rubric:

- `must_include` — facts a good summary must contain; **all** must pass for the item to pass.
- `must_not_include` — hallucination/accuracy guards; **any** violation fails the item.
- `rubric` — soft quality dimensions; they affect `quality_score` but do not gate pass/fail.

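Each criterion is scored independently by the LLM judge, and `quality_score`
is the fraction of satisfied criteria across all three lists (a
`must_not_include` criterion counts as satisfied when the summary avoids it).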

## Supported test kinds

Expand All @@ -174,6 +211,7 @@ Supported `test_kind` values: `visualization`, `metric_skill`, `alert_skill`,
| `search_tool` | `search_objects` tool call (correct function called = pass; correct arguments = quality score) | — |
| `general_question` | Text answer judged by LLM | `[llm-judge]` |
| `guardrail` | Refusal/redirect (visualization response auto-fails) | `[llm-judge]` |
| `dashboard_summary` | Dashboard summary (via `/summary` endpoint) scored against a rubric by LLM | `[llm-judge]` |

## Optional extras

Expand Down
31 changes: 29 additions & 2 deletions packages/gooddata-eval/src/gooddata_eval/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,38 @@
from gooddata_eval.core.connection import ConnectionError_, resolve_connection
from gooddata_eval.core.dataset.local import load_local_dataset
from gooddata_eval.core.langfuse.sink import LangfuseSink
from gooddata_eval.core.models import DatasetItem
from gooddata_eval.core.models import ChatResult, DatasetItem
from gooddata_eval.core.reporting.console import render_comparison, render_console
from gooddata_eval.core.reporting.json_report import write_multi_model_report
from gooddata_eval.core.runner import ItemReport, run_items
from gooddata_eval.core.summary.http_client import SummaryClient
from gooddata_eval.core.workspace import ModelResolutionError, WorkspaceModelController

_EXIT_OK = 0
_EXIT_OPERATIONAL_ERROR = 2
_SUMMARY_TEST_KIND = "dashboard_summary"


class _RoutingBackend:
    """Dispatch each item to the right backend by test_kind.

    `dashboard_summary` items go to the dedicated summary endpoint; everything
    else uses the conversational chat endpoint.

    Args:
        chat: Backend used for every test kind other than `dashboard_summary`.
        summary: Backend used for `dashboard_summary` items.
    """

    def __init__(self, chat: ChatClient, summary: SummaryClient):
        self._chat = chat
        self._summary = summary

    def ask(self, item: DatasetItem) -> ChatResult:
        """Forward `item` to the backend selected by `item.test_kind`."""
        if item.test_kind == _SUMMARY_TEST_KIND:
            return self._summary.ask(item)
        return self._chat.ask(item)

    def close(self) -> None:
        """Close both wrapped backends, even when closing one of them fails.

        Backends that expose no `close` attribute are skipped. An exception
        raised by one backend's `close()` no longer prevents the other backend
        from being closed; it is re-raised once both have been attempted.
        """
        from contextlib import ExitStack

        with ExitStack() as stack:
            # ExitStack unwinds callbacks in LIFO order, so register the
            # summary backend first to keep the chat backend closing first.
            for backend in (self._summary, self._chat):
                if hasattr(backend, "close"):
                    stack.callback(backend.close)


def _build_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -256,7 +280,10 @@ def on_langfuse_item_done(
) -> None:
_sink.log_item(report, dataset_item_id=report.id)

backend = ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id)
backend = _RoutingBackend(
ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
)
try:
report = run_items(
items,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import httpx

from gooddata_eval.core.models import ChatResult
from gooddata_eval.core.models import ChatResult, DatasetItem

SSE_DATA_PREFIX = "data: "

Expand Down Expand Up @@ -169,11 +169,11 @@ def _send_message(self, conversation_id: str, question: str) -> ChatResult:
resp.raise_for_status()
return parse_sse_lines(resp.iter_lines())

def ask(self, question: str) -> ChatResult:
def ask(self, item: DatasetItem) -> ChatResult:
"""Run one single-turn conversation: create, send, parse, clean up."""
conversation_id = self._create_conversation()
try:
return self._send_message(conversation_id, question)
return self._send_message(conversation_id, item.question)
finally:
self._delete_conversation(conversation_id)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import httpx

from gooddata_eval.core.models import DatasetItem
from gooddata_eval.core.models import DatasetItem, SummaryInput

_DEFAULT_HOST = "https://cloud.langfuse.com"
_PAGE_SIZE = 100
Expand Down Expand Up @@ -47,6 +47,24 @@ def _question_from_input(raw_input: Any) -> str:
raise ValueError(f"Unsupported Langfuse item input shape: {raw_input!r}")


def _summary_input_from_raw(raw: dict, expected_output: Any) -> SummaryInput | None:
    """Find and validate the `summary_input` of a dashboard_summary item.

    Langfuse dataset items have no dedicated slot for it, so three places are
    probed in priority order: the item `input` object, the item `metadata`,
    and finally the expected output. The first one that is a dict holding a
    dict under `summary_input` wins.

    Returns:
        The validated `SummaryInput`, or None when no location provides one.
    """
    # Tuple order encodes the lookup priority.
    for container in (raw.get("input"), raw.get("metadata"), expected_output):
        if not isinstance(container, dict):
            continue
        found = container.get("summary_input")
        if isinstance(found, dict):
            return SummaryInput.model_validate(found)
    return None


def _item_from_raw(raw: dict, *, dataset_name: str, test_kind: str) -> DatasetItem:
"""Map a Langfuse REST API dataset-item dict to a DatasetItem."""
# REST API returns camelCase: expectedOutput, not expected_output
Expand All @@ -60,6 +78,7 @@ def _item_from_raw(raw: dict, *, dataset_name: str, test_kind: str) -> DatasetIt
test_kind=resolved_kind,
question=_question_from_input(raw.get("input")),
expected_output=expected_output,
summary_input=_summary_input_from_raw(raw, expected_output),
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,18 @@
)
}

# LLM-judge evaluators (general_question, guardrail) require the [llm-judge] extra.
# Their modules are imported lazily on first use so the CLI starts without openai.
# LLM-judge evaluators (general_question, guardrail, dashboard_summary) require the
# [llm-judge] extra. Their modules are imported lazily on first use so the CLI
# starts without openai.
_LAZY_EVALUATOR_MODULES: dict[str, str] = {
"general_question": "gooddata_eval.core.evaluators.general_question",
"guardrail": "gooddata_eval.core.evaluators.guardrail",
"dashboard_summary": "gooddata_eval.core.evaluators.summary",
}
_LAZY_EVALUATOR_CLASSES: dict[str, str] = {
"general_question": "GeneralQuestionEvaluator",
"guardrail": "GuardrailEvaluator",
"dashboard_summary": "DashboardSummaryEvaluator",
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# (C) 2026 GoodData Corporation
"""Evaluator for dashboard_summary: rubric-based LLM-as-judge scoring.

Summaries are free text, so we do not match strings. Instead, `expected_output`
is a rubric of checkable criteria:

{
"must_include": ["...facts a good summary must contain..."],
"must_not_include": ["...things a good summary must avoid (hallucinations)..."],
"rubric": ["...soft quality dimensions..."]
}

Each criterion is scored independently by the judge (True/False), so the
runner's `quality_score` becomes the fraction of satisfied criteria. The item
*passes* only when every `must_include` is satisfied and no `must_not_include`
is violated; `rubric` items contribute to quality but do not gate pass/fail.

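As a fallback, an `expected_output` that is not a dict (or a dict with no
criteria in any of the three lists) is stringified and treated as a single
gating `must_include` criterion (same pass/fail semantics as `general_question`).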
"""

from typing import Any

from gooddata_eval.core.evaluators._llm_judge import LLMJudge
from gooddata_eval.core.evaluators._text_utils import extract_text
from gooddata_eval.core.evaluators.base import ItemEvaluation
from gooddata_eval.core.models import ChatResult, DatasetItem

# Judge instructions for criteria the summary is expected to satisfy
# (used for both `must_include` and `rubric` entries). The bar is strict on
# purpose: a criterion that is only partially addressed scores 0.
_POSITIVE_STEPS = [
    "Read the INPUT (the user's request) and the EXPECTED OUTPUT (one criterion the summary must satisfy).",
    "Read the ACTUAL OUTPUT (the generated summary).",
    "Score 1 if the actual output clearly satisfies the criterion (allow paraphrasing and reasonable numeric tolerance).",
    "Score 0 if the criterion is missing, contradicted, or only partially addressed.",
]

# For must_not_include we ask the judge a plain presence question and invert the
# result in code. Scoring "does the summary AVOID X?" via a field labelled
# EXPECTED OUTPUT is unreliable: the model reads the forbidden behaviour as
# desired and flips the verdict. Detecting presence (no negation, no
# contradictory label) is far more robust.
_VIOLATION_STEPS = [
"Read the CHARACTERISTIC described in EXPECTED OUTPUT.",
"Read the ACTUAL OUTPUT (the generated summary).",
"Score 1 if the actual output clearly exhibits the described characteristic.",
"Score 0 if it does not exhibit it.",
]


class DashboardSummaryEvaluator:
    """Rubric-based LLM-judge evaluator for `dashboard_summary` items.

    Every criterion in the item's `expected_output` is judged on its own.
    `must_include` and `must_not_include` gate pass/fail; `rubric` entries only
    influence the quality fraction reported in the rank key.
    """

    test_kind = "dashboard_summary"

    def __init__(self):
        # Separate judges: one checks that a criterion is satisfied, the other
        # detects the presence of a forbidden characteristic (inverted later).
        self._positive_judge = LLMJudge(evaluation_steps=_POSITIVE_STEPS)
        self._violation_judge = LLMJudge(evaluation_steps=_VIOLATION_STEPS)

    @staticmethod
    def _as_criteria(value: Any) -> list[str]:
        """Normalise one rubric field into a list of criterion strings.

        A missing or null field yields no criteria. A bare string is one
        criterion (iterating it would produce one criterion per character,
        i.e. one judge call per character). Blank strings yield no criteria.
        Any other value is iterated and each element stringified.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        return [str(c) for c in value]

    @staticmethod
    def _criteria(expected_output: Any) -> tuple[list[str], list[str], list[str]]:
        """Split `expected_output` into (must_include, must_not_include, rubric).

        When `expected_output` is not a dict, or is a dict that yields no
        criteria at all, the whole value is stringified and used as a single
        gating `must_include` criterion.
        """
        if isinstance(expected_output, dict):
            normalize = DashboardSummaryEvaluator._as_criteria
            must_include = normalize(expected_output.get("must_include"))
            must_not_include = normalize(expected_output.get("must_not_include"))
            rubric = normalize(expected_output.get("rubric"))
            if must_include or must_not_include or rubric:
                return must_include, must_not_include, rubric
        # Fallback: treat the whole expected_output as a single gating criterion
        # (same pass/fail semantics as general_question).
        return [str(expected_output)], [], []

    def _score_group(
        self,
        judge: LLMJudge,
        prefix: str,
        criteria: list[str],
        question: str,
        actual: str,
        detail: dict[str, Any],
        *,
        invert: bool = False,
    ) -> bool:
        """Judge every criterion of one group and record the outcome in `detail`.

        Writes `{prefix}_{i}` (the check result) and `{prefix}_{i}_reason` (the
        judge's explanation) for each criterion. With `invert=True` the judge's
        verdict means "characteristic present", so the recorded result is its
        negation. Returns whether every check in the group came out truthy.
        """
        all_ok = True
        for i, criterion in enumerate(criteria):
            verdict, reason = judge.score(question, criterion, actual)
            ok = (not verdict) if invert else verdict
            detail[f"{prefix}_{i}"] = ok
            detail[f"{prefix}_{i}_reason"] = reason
            all_ok = all_ok and ok
        return all_ok

    def evaluate(self, item: DatasetItem, chat_result: ChatResult) -> ItemEvaluation:
        """Score one generated summary against the item's rubric.

        Returns:
            An `ItemEvaluation` whose `passed` is true only when every
            `must_include` check holds and no `must_not_include` characteristic
            was detected, whose `rank_key` is `(int(passed), quality)` with
            `quality` the fraction of boolean checks that are True, and whose
            `detail` holds the actual output plus per-criterion results.
        """
        actual = extract_text(chat_result)
        must_include, must_not_include, rubric = self._criteria(item.expected_output)

        detail: dict[str, Any] = {"actual_output": actual}

        included = self._score_group(
            self._positive_judge, "include", must_include, item.question, actual, detail
        )
        avoided = self._score_group(
            self._violation_judge, "exclude", must_not_include, item.question, actual, detail, invert=True
        )
        # Rubric checks feed the quality fraction only; their verdict never gates.
        self._score_group(self._positive_judge, "rubric", rubric, item.question, actual, detail)
        passed = included and avoided

        bool_checks = [v for v in detail.values() if isinstance(v, bool)]
        quality = sum(1 for v in bool_checks if v) / len(bool_checks) if bool_checks else 0.0

        return ItemEvaluation(passed=passed, rank_key=(int(passed), quality), detail=detail)
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ def _event(event_type: str, body: dict[str, Any]) -> dict[str, Any]:
"id": trace_id,
"timestamp": now,
"name": f"gd-eval: {report.question[:80]}",
# Expose the model on a first-class trace field so Langfuse
# dashboards can filter / break down by it ("Version"); trace
# metadata is not available as a breakdown dimension.
"version": self._model_id or None,
"input": {"question": report.question},
"output": report.best_detail,
"metadata": {
Expand Down
19 changes: 19 additions & 0 deletions packages/gooddata-eval/src/gooddata_eval/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,23 @@ class ChatResult(BaseModel):
tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")


class SummaryInput(BaseModel):
    """Structured input for the `dashboard_summary` test kind.

    Maps onto the dedicated summary endpoint's request body
    (`POST /api/v1/ai/workspaces/{ws}/summary`). Authored in snake_case in the
    dataset; the SummaryClient maps it to the endpoint's camelCase fields.

    Only `dashboard_id` is required; every other field defaults to None.
    """

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Required: id of the dashboard to summarize.
    dashboard_id: str
    # Optional list of visualization ids that narrows the summary scope.
    visualizations: list[str] | None = None
    # Optional filters; the README describes these as AFM filters. The dict
    # shape is not validated here.
    filter_context: list[dict] | None = None
    # NOTE(review): presumably selects a single dashboard tab; confirm against
    # the endpoint contract.
    tab_id: str | None = None
    # NOTE(review): presumably a hint about the desired output format; accepted
    # values are not visible here.
    format_hint: str | None = None


class DatasetItem(BaseModel):
"""Common dataset envelope. `expected_output` stays raw; each evaluator parses its own shape."""

Expand All @@ -95,3 +112,5 @@ class DatasetItem(BaseModel):
test_kind: str
question: str
expected_output: Any
# Only used by the `dashboard_summary` test kind; ignored by all others.
summary_input: SummaryInput | None = None
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,11 @@ def render_console(report: EvalReport, *, console: Console | None = None) -> str
elif item.pass_at_k:
result, notes = "PASS", ""
else:
d = item.best_detail
failing = [
k
for k in ("metrics_correct", "dimensions_correct", "filters_correct", "viz_type_hard")
if d.get(k) is False
]
notes = "failed: " + ", ".join(failing) if failing else "no visualization created"
# Evaluator-agnostic: report whichever boolean checks came back False
# (visualization uses metrics_correct/…; dashboard_summary uses
# include_*/exclude_*/rubric_*). Falls back to a generic message.
failing = [k for k, v in item.best_detail.items() if v is False]
notes = "failed: " + ", ".join(failing) if failing else "did not pass strict checks"
result = "FAIL"
latency = "-" if item.runs == 0 else f"{item.latency_s:.2f}s"
avg = "-" if item.runs == 0 else f"{item.avg_latency_s:.2f}s"
Expand Down
6 changes: 4 additions & 2 deletions packages/gooddata-eval/src/gooddata_eval/core/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@


class ChatBackend(Protocol):
def ask(self, question: str) -> ChatResult: ...
# Receives the whole item so backends can use per-item context beyond the
# question text (e.g. dashboard_summary needs item.summary_input).
def ask(self, item: DatasetItem) -> ChatResult: ...


@dataclass
Expand Down Expand Up @@ -106,7 +108,7 @@ def _run_one_item(
try:
for run_index in range(1, runs + 1):
t0 = time.perf_counter()
chat_result = backend.ask(item.question)
chat_result = backend.ask(item)
evaluation = evaluator.evaluate(item, chat_result)
latency = time.perf_counter() - t0
report.runs += 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# (C) 2026 GoodData Corporation
Loading
Loading