|
| 1 | +"""Re-generate query help files in two stages without needing docling. |
| 2 | +
|
| 3 | +This script reuses the existing .misra-rule-cache/<standard>.json |
| 4 | +(produced by a prior dump_rules_json.py run) to: |
| 5 | +
|
| 6 | + Stage 1: Deterministically re-render every .md from the cached rule |
| 7 | + data via render_help(). |
| 8 | + Patch: Update the cache JSON with current existing_md content and |
| 9 | + implementation_scope from rule_packages/*.json. |
| 10 | + Stage 2: Run rewrite_help.py (LLM lint/proofread) over the patched |
| 11 | + cache. |
| 12 | +
|
| 13 | +Usage: |
| 14 | + python refresh_help.py --standard MISRA-C-2012 |
| 15 | + python refresh_help.py --standard MISRA-C++-2023 |
| 16 | + python refresh_help.py --standard MISRA-C-2012 --stage1-only |
| 17 | +""" |
| 18 | +from __future__ import annotations |
| 19 | + |
| 20 | +import argparse |
| 21 | +import json |
| 22 | +import subprocess |
| 23 | +import sys |
| 24 | +from dataclasses import dataclass |
| 25 | +from pathlib import Path |
| 26 | +from typing import Any |
| 27 | + |
| 28 | +sys.path.insert(0, str(Path(__file__).parent)) |
| 29 | +from extract_rules import Rule, render_help, _format_code_lines # noqa: E402 |
| 30 | + |
# Directory containing this script, with symlinks resolved.
SCRIPT_DIR = Path(__file__).resolve().parent
# Third ancestor of SCRIPT_DIR; used as the default for --query-repo.
QUERY_REPO = SCRIPT_DIR.parents[2]
# Default for --help-repo: a directory named "codeql-coding-standards-help"
# located next to QUERY_REPO.
DEFAULT_HELP_REPO = QUERY_REPO.parent / "codeql-coding-standards-help"

# Accepted --standard values. Each maps to a (language, path) pair; the
# language selects the rule_packages/<language> directory. The second element
# is not read anywhere in this file -- presumably the rules directory inside
# the query repo (TODO confirm).
STANDARD_INFO = {
    "MISRA-C-2012": ("c", "c/misra/src/rules"),
    "MISRA-C-2023": ("c", "c/misra/src/rules"),
    "MISRA-C++-2023": ("cpp", "cpp/misra/src/rules"),
}
| 40 | + |
| 41 | + |
def _rule_from_json(d: dict[str, Any]) -> Rule:
    """Rebuild a ``Rule`` from one rule entry of the cache JSON.

    ``rule_id``, ``raw_id``, ``standard`` and ``title`` must be present
    (``KeyError`` otherwise). Every other field falls back to an empty
    string or an empty list when the key is absent.
    """
    fields = {key: d[key] for key in ("rule_id", "raw_id", "standard", "title")}
    for key in ("category", "analysis", "applies_to", "amplification", "rationale"):
        fields[key] = d.get(key, "")
    fields["exceptions"] = d.get("exceptions", [])
    fields["example"] = d.get("example", "")
    fields["see_also"] = d.get("see_also", [])
    rule = Rule(**fields)

    # The example layout is not a constructor argument; when the cache
    # carries one, re-attach it as (kind, text) pairs.
    saved_layout = d.get("example_layout", [])
    if saved_layout:
        rule._example_layout = [
            (entry["kind"], entry["text"]) for entry in saved_layout
        ]
    return rule
| 63 | + |
| 64 | + |
def _load_impl_scope_lookup(
    query_repo: Path, standard: str,
) -> dict[tuple[str, str], dict]:
    """Build (rule_id, short_name) -> implementation_scope from rule_packages.

    Every ``*.json`` file in ``<query_repo>/rule_packages/<lang>`` is read,
    where ``<lang>`` is the language registered for *standard* in
    ``STANDARD_INFO``. Entries under all top-level keys of a package file
    are merged; they are not filtered by *standard*.

    Args:
        query_repo: Root directory of the query repository.
        standard: A key of ``STANDARD_INFO``.

    Returns:
        A mapping from ``(rule_id, short_name)`` to the query's
        ``implementation_scope`` value. Queries without a truthy
        ``short_name`` or ``implementation_scope`` are omitted. An empty
        dict is returned when the package directory does not exist.

    Raises:
        KeyError: If *standard* is not a key of ``STANDARD_INFO``.
    """
    lang, _ = STANDARD_INFO[standard]
    pkg_dir = query_repo / "rule_packages" / lang
    if not pkg_dir.is_dir():
        return {}
    lookup: dict[tuple[str, str], dict] = {}
    for pkg_file in sorted(pkg_dir.glob("*.json")):
        try:
            data = json.loads(pkg_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError. Still best-effort, but tell the user which
            # package was ignored instead of dropping it silently.
            print(f"warning: skipping {pkg_file}: {exc}", file=sys.stderr)
            continue
        if not isinstance(data, dict):
            # e.g. a top-level JSON array: nothing to index.
            continue
        for rules in data.values():
            if not isinstance(rules, dict):
                continue
            for rule_id, rule_data in rules.items():
                if not isinstance(rule_data, dict):
                    continue
                queries = rule_data.get("queries")
                if not isinstance(queries, list):
                    # Missing, null or otherwise malformed "queries".
                    continue
                for q in queries:
                    if not isinstance(q, dict):
                        continue
                    sn = q.get("short_name")
                    impl = q.get("implementation_scope")
                    if sn and impl:
                        lookup[(rule_id, sn)] = impl
    return lookup
| 91 | + |
| 92 | + |
def stage1_render(cache: dict, help_repo: Path) -> tuple[int, int]:
    """Rewrite every query's .md file from the cached rule data.

    Rules are processed in sorted rule-id order. The help text is rendered
    once per rule and written to the ``md_path`` of each of its queries
    (relative to *help_repo*), creating parent directories as needed.

    Returns:
        ``(wrote, skipped)``: the number of files written and the number of
        queries skipped because their rule has no (non-empty) cache entry.
    """
    language = cache["lang"]
    rules_by_id = cache["rules"]
    queries_by_rule = cache["queries"]

    written = 0
    skipped = 0
    for rule_id in sorted(queries_by_rule):
        queries = queries_by_rule[rule_id]
        cached_rule = rules_by_id.get(rule_id)
        if cached_rule:
            markdown = render_help(_rule_from_json(cached_rule), language)
            for query in queries:
                target = help_repo / query["md_path"]
                target.parent.mkdir(parents=True, exist_ok=True)
                target.write_text(markdown, encoding="utf-8")
                written += 1
        else:
            # No usable rule data: none of this rule's queries are rendered.
            skipped += len(queries)

    return written, skipped
| 114 | + |
| 115 | + |
def patch_cache(
    cache: dict, help_repo: Path, query_repo: Path, standard: str,
) -> dict:
    """Refresh ``existing_md`` and ``implementation_scope`` on every query.

    The query dicts inside *cache* are modified in place and the same
    *cache* object is returned.

    * ``existing_md`` becomes the current text of the query's ``md_path``
      file under *help_repo*, or ``None`` when that file does not exist.
    * ``implementation_scope`` is set from the rule_packages lookup, keyed
      by the rule id and the stem of the query's ``ql_path``; it is removed
      when the lookup has no (truthy) entry.
    """
    scopes = _load_impl_scope_lookup(query_repo, standard)

    for rule_id, queries in cache["queries"].items():
        for query in queries:
            md_file = help_repo / query["md_path"]
            try:
                current_text = md_file.read_text(encoding="utf-8")
            except FileNotFoundError:
                current_text = None
            query["existing_md"] = current_text

            # rule_packages identify a query by its short name, which is
            # matched here against the .ql file name without its extension.
            scope = scopes.get((rule_id, Path(query["ql_path"]).stem))
            if scope:
                query["implementation_scope"] = scope
            else:
                query.pop("implementation_scope", None)

    return cache
| 140 | + |
| 141 | + |
def main() -> int:
    """Run stage 1, patch the cache and, unless disabled, launch stage 2.

    The cache is read from
    ``<help-repo>/.misra-rule-cache/<standard>.json`` and rewritten in
    place after patching.

    Returns:
        The process exit status: ``2`` when the cache is missing,
        unreadable or not shaped as expected; ``0`` after a
        ``--stage1-only`` run; otherwise the exit status of the
        ``rewrite_help.py`` subprocess.
    """
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--standard", required=True, choices=sorted(STANDARD_INFO))
    p.add_argument("--help-repo", type=Path, default=DEFAULT_HELP_REPO)
    p.add_argument("--query-repo", type=Path, default=QUERY_REPO)
    p.add_argument("--stage1-only", action="store_true",
                   help="Only run deterministic stage 1 (no LLM).")
    p.add_argument("--model", default=None,
                   help="Copilot model id for stage 2.")
    args = p.parse_args()

    help_repo = args.help_repo.resolve()
    cache_path = help_repo / ".misra-rule-cache" / f"{args.standard}.json"
    # Read first and handle failure, rather than exists()-then-read: this
    # avoids the race and reports unreadable or corrupt caches cleanly.
    try:
        cache = json.loads(cache_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        print(f"Cache not found: {cache_path}", file=sys.stderr)
        return 2
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Cannot read cache {cache_path}: {exc}", file=sys.stderr)
        return 2

    # Both stages index these keys unconditionally; fail with a message
    # instead of a KeyError/AttributeError traceback further down.
    well_formed = (
        isinstance(cache, dict)
        and "lang" in cache
        and isinstance(cache.get("rules"), dict)
        and isinstance(cache.get("queries"), dict)
    )
    if not well_formed:
        print(
            f"Malformed cache (expected 'lang', 'rules' and 'queries'): "
            f"{cache_path}",
            file=sys.stderr,
        )
        return 2

    total_queries = sum(len(v) for v in cache["queries"].values())
    print(f"Loaded cache: {len(cache['rules'])} rules, {total_queries} queries")

    # Stage 1: deterministic render.
    print("\n=== Stage 1: deterministic render ===")
    wrote, skipped = stage1_render(cache, help_repo)
    print(f"Stage 1 done: wrote={wrote} skipped={skipped}")

    # Patch cache with fresh existing_md + implementation_scope.
    print("\n=== Patching cache ===")
    cache = patch_cache(cache, help_repo, args.query_repo, args.standard)
    cache_path.write_text(
        json.dumps(cache, indent=2, ensure_ascii=False), encoding="utf-8")
    impl_count = sum(
        1 for qs in cache["queries"].values()
        for q in qs if q.get("implementation_scope")
    )
    print(f"Cache updated: implementation_scope on {impl_count} queries")

    if args.stage1_only:
        print("\n--stage1-only: skipping LLM pass.")
        return 0

    # Stage 2: LLM lint/proofread via rewrite_help.py.
    print("\n=== Stage 2: LLM lint/proofread ===")
    cmd = [
        sys.executable,
        str(SCRIPT_DIR / "rewrite_help.py"),
        "--standard", args.standard,
        "--help-repo", str(help_repo),
    ]
    if args.model:
        cmd += ["--model", args.model]
    print(f"Running: {' '.join(cmd)}")
    # When stdout is a pipe or file it is block-buffered; flush so that the
    # output above is emitted before anything the child process writes.
    sys.stdout.flush()
    return subprocess.call(cmd)
| 196 | + |
| 197 | + |
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments