Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
# query compilation caches
.cache

# MISRA help generator docling cache
scripts/generate_rules/misra_help/cache/

# qltest projects and artifacts
**/test/**/*.testproj
**/test/**/*.actual
Expand Down
64 changes: 64 additions & 0 deletions scripts/generate_rules/misra_help/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# MISRA help-file populator
Comment thread
data-douser marked this conversation as resolved.

Generates per-query Markdown help files in
`codeql-coding-standards-help/{c,cpp}/misra/src/rules/<RULE-ID>/<Name>.md`
from the licensed MISRA PDFs.

## Prerequisites

1. **Python venv with docling** (~3 GB, not in `scripts/requirements.txt`):

```bash
python3 -m venv .venv && .venv/bin/pip install docling
```

2. **MISRA PDFs** — licensed material, excluded from version control.
Place them in your `codeql-coding-standards-help` checkout:

```bash
cp ~/Downloads/MISRA-C-2023-*.pdf ../codeql-coding-standards-help/
cp ~/Downloads/MISRA-CPP-2023-*.pdf ../codeql-coding-standards-help/
```

The tool resolves PDFs via: `--pdf` flag > `$MISRA_C_PDF` /
`$MISRA_CPP_PDF` env vars > glob in `--help-repo`.

## Usage

```bash
# Deterministic render (Stage 1 only):
.venv/bin/python populate_help.py --standard MISRA-C++-2023
.venv/bin/python populate_help.py --standard MISRA-C-2012

# Single rule:
.venv/bin/python populate_help.py --standard MISRA-C++-2023 --rule RULE-8-1

# Fill in missing help only (don't overwrite existing):
.venv/bin/python populate_help.py --standard MISRA-C++-2023 --no-overwrite

# Preview without writing:
.venv/bin/python populate_help.py --standard MISRA-C++-2023 --dry-run
```

### Two-pass mode (deterministic + LLM lint)

```bash
# 1. Build the JSON sidecar:
.venv/bin/python dump_rules_json.py --standard MISRA-C-2012

# 2. Re-render + LLM proofread:
.venv/bin/python refresh_help.py --standard MISRA-C-2012
```

## Files

| File | Purpose |
| --------------------- | ------------------------------------------------------- |
| `extract_rules.py` | docling PDF → `Rule` dataclasses (deterministic core) |
| `populate_help.py` | Walk `.ql` queries, render and write `.md` help files |
| `dump_rules_json.py` | Emit JSON sidecar for the LLM rewrite pass |
| `rewrite_help.py` | Headless Copilot driver for LLM lint/proofread |
| `refresh_help.py` | Combined Stage 1 + cache patch + Stage 2 runner |
| `harness.py` | Determinism harness (per-section hashing across N runs) |
| `cache.py` | Shared helpers for cache path resolution and I/O |

4 changes: 4 additions & 0 deletions scripts/generate_rules/misra_help/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""MISRA help-file populator.
See `populate_help.py` for the entry point.
"""
34 changes: 34 additions & 0 deletions scripts/generate_rules/misra_help/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Shared helpers for locating and reading the MISRA rule cache."""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any

# Directory containing this module; anchors all default path resolution.
SCRIPT_DIR = Path(__file__).resolve().parent
# Default on-disk cache directory for docling extraction results.
DEFAULT_CACHE_DIR = SCRIPT_DIR / "cache"
# Help repo assumed to be checked out as a sibling of this repository
# (SCRIPT_DIR is scripts/generate_rules/misra_help, so parents[2] is the
# repo root and .parent is the directory holding both checkouts).
DEFAULT_HELP_REPO = SCRIPT_DIR.parents[2].parent / "codeql-coding-standards-help"


def cache_path_for(help_repo: Path, standard: str) -> Path:
    """Return the path of the per-standard JSON cache file inside *help_repo*."""
    filename = f"{standard}.json"
    return help_repo.joinpath(".misra-rule-cache", filename)


def load_cache(help_repo: Path, standard: str) -> dict[str, Any]:
    """Load and return the JSON cache for a standard.

    Raises FileNotFoundError (with a hint to run dump_rules_json.py)
    when the sidecar has not been generated yet.
    """
    path = help_repo / ".misra-rule-cache" / f"{standard}.json"
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        raise FileNotFoundError(
            f"Cache not found: {path}. Run dump_rules_json.py first."
        ) from None
    return json.loads(text)


def save_cache(help_repo: Path, standard: str, data: dict[str, Any]) -> Path:
    """Write the JSON cache for a standard and return the path written.

    Creates the `.misra-rule-cache` directory on first use.
    """
    target = help_repo / ".misra-rule-cache" / f"{standard}.json"
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2, ensure_ascii=False)
    target.write_text(payload, encoding="utf-8")
    return target
202 changes: 202 additions & 0 deletions scripts/generate_rules/misra_help/dump_rules_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
"""Emit a per-standard JSON sidecar containing every extracted MISRA
rule plus, for each `.ql` query that targets the rule, the query's
`@name` title, target `.md` path, and the existing `.md` content (if
any). This file is the input to the agent extension's LLM-driven
"rewrite help docs" pass: docling extracts the structured rule data
deterministically, then the LLM uses both the structured data AND the
.ql title to produce a polished, idiomatic help file.

Output layout:

<help-repo>/.misra-rule-cache/<standard>.json

Schema (top-level):

{
"standard": "MISRA-C-2012",
"lang": "c",
"lang_src": "c/misra/src/rules",
"generated_at": "2026-04-20T10:11:12Z",
"rules": {
"RULE-9-2": {
"rule_id": "RULE-9-2",
"raw_id": "Rule 9.2",
"standard": "MISRA-C-2012",
"title": "...",
"category": "Required",
"analysis": "Decidable, Single Translation Unit",
"applies_to": "C90, C99, C11",
"amplification": "...",
"rationale": "...",
"exceptions": ["...", "..."],
"example_layout": [
{"kind": "code", "text": "..."},
{"kind": "text", "text": "..."}
],
"see_also": [...]
},
...
},
"queries": {
"RULE-9-2": [
{
"ql_path": "c/misra/src/rules/RULE-9-2/Init...braces.ql",
"ql_name_title": "The initializer for an aggregate ...",
"md_path": "c/misra/src/rules/RULE-9-2/Init...braces.md",
"existing_md": "..." // null if the .md does not exist
},
...
],
...
}
}

The `existing_md` content is included so the LLM pass can preserve
human-authored details (alert message wording, special examples) that
docling did not capture.
"""
from __future__ import annotations
import argparse
import datetime as _dt
import json
import sys
from dataclasses import asdict
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))
from extract_rules import extract_rules, Rule # noqa: E402
from cache import cache_path_for, save_cache # noqa: E402
from populate_help import ( # noqa: E402
STANDARD_INFO,
SUPPORTED_STANDARDS,
DEFAULT_HELP_REPO,
DEFAULT_QUERY_REPO,
collect_queries,
resolve_pdf,
_read_ql_name,
)


def _load_impl_scope_lookup(
    query_repo: Path, standard: str,
) -> dict[tuple[str, str], dict]:
    """Build a (rule_id, short_name) -> implementation_scope lookup
    from the rule_packages JSON files.

    Unreadable or malformed package files are skipped silently; an
    absent package directory yields an empty lookup.
    """
    lang, _ = STANDARD_INFO[standard]
    pkg_dir = query_repo / "rule_packages" / lang
    lookup: dict[tuple[str, str], dict] = {}
    if not pkg_dir.is_dir():
        return lookup
    for pkg_path in sorted(pkg_dir.glob("*.json")):
        try:
            payload = json.loads(pkg_path.read_text(encoding="utf-8"))
        except (OSError, json.JSONDecodeError):
            continue
        # Each package file maps a standard name (e.g. "MISRA-C-2012")
        # to that standard's rules; only the rule dicts matter here.
        for rules in payload.values():
            if not isinstance(rules, dict):
                continue
            for rule_id, rule_data in rules.items():
                if not isinstance(rule_data, dict):
                    continue
                for query in rule_data.get("queries", []):
                    short_name = query.get("short_name")
                    scope = query.get("implementation_scope")
                    if short_name and scope:
                        lookup[(rule_id, short_name)] = scope
    return lookup


def _rule_to_jsonable(rule: Rule) -> dict:
    """Serialize a Rule to a JSON-ready dict, including the example layout.

    The optional `_example_layout` attribute (list of (kind, text) pairs)
    is expanded into a list of {"kind", "text"} objects; it becomes an
    empty list when absent.
    """
    serialized = asdict(rule)
    sections = getattr(rule, "_example_layout", None) or []
    serialized["example_layout"] = [
        {"kind": kind, "text": text} for (kind, text) in sections
    ]
    return serialized


def _query_entries(rule_id: str, ql_paths: list[Path],
                   query_repo: Path, help_repo: Path,
                   lang_src: Path,
                   impl_lookup: dict[tuple[str, str], dict] | None = None,
                   ) -> list[dict]:
    """Build the JSON query records for one rule.

    For each .ql file: its repo-relative path, its `@name` title, the
    matching .md path in the help repo, the current .md content (None
    if absent), and — when available — the implementation_scope from
    the rule_packages metadata.
    """
    src_root = query_repo / lang_src
    entries: list[dict] = []
    for ql_file in sorted(ql_paths):
        md_file = (help_repo / lang_src /
                   ql_file.parent.relative_to(src_root) /
                   f"{ql_file.stem}.md")
        try:
            existing_md = md_file.read_text(encoding="utf-8")
        except FileNotFoundError:
            existing_md = None
        record: dict = {
            "ql_path": str(ql_file.relative_to(query_repo)),
            "ql_name_title": _read_ql_name(ql_file) or "",
            "md_path": str(md_file.relative_to(help_repo)),
            "existing_md": existing_md,
        }
        scope = (impl_lookup or {}).get((rule_id, ql_file.stem))
        if scope:
            record["implementation_scope"] = scope
        entries.append(record)
    return entries


def main() -> int:
    """CLI entry point: extract rules from the MISRA PDF and write the
    per-standard JSON sidecar consumed by the LLM rewrite pass.

    Returns the process exit code (0 on success; helper failures
    surface as exceptions).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--standard", required=True, choices=SUPPORTED_STANDARDS)
    ap.add_argument("--query-repo", type=Path, default=DEFAULT_QUERY_REPO)
    ap.add_argument("--help-repo", type=Path, default=DEFAULT_HELP_REPO)
    ap.add_argument("--pdf", type=Path, default=None)
    ap.add_argument("--cache-dir", type=Path,
                    default=Path(__file__).resolve().parent / "cache",
                    help="docling JSON cache dir")
    ap.add_argument("--output", type=Path, default=None,
                    help="output path (default: "
                         "<help-repo>/.misra-rule-cache/<standard>.json)")
    args = ap.parse_args()

    # Stage 1: deterministic docling extraction (results cached on disk).
    pdf = resolve_pdf(args.standard, args.pdf, args.help_repo)
    args.cache_dir.mkdir(parents=True, exist_ok=True)
    rules = extract_rules(pdf, args.standard, args.cache_dir)

    lang, lang_src = STANDARD_INFO[args.standard]
    queries = collect_queries(args.query_repo, args.standard)
    impl_lookup = _load_impl_scope_lookup(args.query_repo, args.standard)

    rules_json = {r.rule_id: _rule_to_jsonable(r) for r in rules}
    queries_json = {
        rule_id: _query_entries(rule_id, ql_paths, args.query_repo,
                                args.help_repo, lang_src, impl_lookup)
        for rule_id, ql_paths in queries.items()
    }

    payload = {
        "standard": args.standard,
        "lang": lang,
        "lang_src": str(lang_src),
        "generated_at": _dt.datetime.now(_dt.timezone.utc)
        .strftime("%Y-%m-%dT%H:%M:%SZ"),
        "rules": rules_json,
        "queries": queries_json,
    }

    if args.output is None:
        # Route the default case through the shared cache helper so path
        # layout and JSON formatting stay consistent with the rest of the
        # toolchain (save_cache was previously imported but unused, and
        # its mkdir/json.dumps logic was duplicated here).
        out_path = save_cache(args.help_repo, args.standard, payload)
    else:
        out_path = args.output
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                            encoding="utf-8")
    print(f"wrote {out_path} ({len(rules_json)} rules, "
          f"{sum(len(v) for v in queries_json.values())} queries)")
    return 0


if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit status.
    raise SystemExit(main())
Loading
Loading