Skip to content

Commit ad32b03

Browse files
Copilot and data-douser authored
feat: add sarif_diff_by_commits tool for SARIF-to-git-diff correlation
Implements a new MCP tool that accepts a SARIF file path and git ref range, partitions SARIF results into "new" vs "pre-existing" based on file-level or line-level overlap with the git diff, and returns structured output for triage workflows. - Add diffSarifByCommits() pure utility in sarif-utils.ts with types - Register sarif_diff_by_commits tool in sarif-tools.ts - Add parseGitDiffOutput() helper for unified diff parsing - Add 14 unit tests for diffSarifByCommits() utility - Add 5 unit tests for sarif_diff_by_commits tool handler - Update server-tools.md documentation - Update tool registration count from 7 to 8 Closes #209 Agent-Logs-Url: https://github.com/advanced-security/codeql-development-mcp-server/sessions/8abb21bb-8877-4628-90da-36ffc8eeb742 Co-authored-by: data-douser <70299490+data-douser@users.noreply.github.com>
1 parent 7ece339 commit ad32b03

File tree

7 files changed

+791
-20
lines changed

7 files changed

+791
-20
lines changed

server/dist/codeql-development-mcp-server.js

Lines changed: 136 additions & 4 deletions
Large diffs are not rendered by default.

server/dist/codeql-development-mcp-server.js.map

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/src/lib/sarif-utils.ts

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,52 @@ export interface SarifDiffResult {
8787
unchangedRules: SarifRuleSummary[];
8888
}
8989

90+
/**
 * One file reported as changed by a git diff.
 *
 * `hunks` lists the changed line ranges on the new side of the diff. When the
 * list is empty, only file-level information is available for this entry.
 */
export interface DiffFileEntry {
  /**
   * Changed line ranges, each described by its first line and the number of
   * lines it covers. An empty array means "file-level only".
   */
  hunks: Array<{
    startLine: number;
    lineCount: number;
  }>;
  /** Location of the file, expressed relative to the repository root. */
  path: string;
}
97+
98+
/**
 * How precisely SARIF results are matched against a git diff:
 * `'file'` compares changed files only, `'line'` also considers changed
 * line ranges.
 */
export type DiffGranularity =
  | 'file'
  | 'line';
100+
101+
/**
 * Compact description of a single SARIF result after it has been classified
 * against a git diff.
 */
export interface ClassifiedResult {
  /** Path of the file named by the result's primary location. */
  file: string;

  /** Line of the primary location; absent when the result carries no line. */
  line?: number;

  /** Position of the result within the SARIF run it was read from. */
  resultIndex: number;

  /** Identifier of the rule that reported the result. */
  ruleId: string;
}
112+
113+
/**
 * Outcome of splitting the results of a SARIF run according to whether they
 * overlap a git diff.
 */
export interface SarifDiffByCommitsResult {
  /** The matching granularity that was applied. */
  granularity: DiffGranularity;

  /**
   * Results whose primary location lies in a file touched by the diff (and,
   * for line granularity, inside a changed line range).
   */
  newResults: ClassifiedResult[];

  /** Results whose primary location does not overlap the diff. */
  preExistingResults: ClassifiedResult[];

  /** Aggregate counts describing the classification. */
  summary: {
    /** How many files the diff contains. */
    diffFileCount: number;
    /** The git ref range the diff was computed for. */
    refRange: string;
    /** How many SARIF results were examined in total. */
    totalResults: number;
    /** How many results ended up in `newResults`. */
    totalNew: number;
    /** How many results ended up in `preExistingResults`. */
    totalPreExisting: number;
  };
}
135+
90136
// ---------------------------------------------------------------------------
91137
// SARIF rule helpers
92138
// ---------------------------------------------------------------------------
@@ -812,3 +858,117 @@ export function findOverlappingAlerts(
812858

813859
return overlaps;
814860
}
861+
862+
// ---------------------------------------------------------------------------
863+
// SARIF-to-git-diff correlation
864+
// ---------------------------------------------------------------------------
865+
866+
/**
 * Decide whether a path taken from a git diff and a URI taken from a SARIF
 * location refer to the same file.
 *
 * Git reports paths relative to the repository root (e.g. `src/db.js`),
 * whereas a SARIF URI may be absolute (`file:///…`) or relative. The actual
 * comparison is delegated to `urisMatch`; this wrapper only fixes the
 * argument order (diff path first, SARIF URI second).
 */
function diffPathMatchesSarifUri(diffPath: string, sarifUri: string): boolean {
  const refersToSameFile = urisMatch(diffPath, sarifUri);
  return refersToSameFile;
}
877+
878+
/**
 * Check whether a line number falls within any of a file's changed hunks.
 *
 * A hunk covers the inclusive range
 * `[startLine, startLine + lineCount - 1]`. Hunks with a `lineCount` of zero
 * (or less) describe a pure deletion: they contribute no lines on the new
 * side of the diff and therefore never match.
 *
 * @param line  Line number to look up.
 * @param hunks Changed line ranges of a single file.
 * @returns `true` when at least one non-empty hunk contains `line`.
 */
function lineInHunks(line: number, hunks: Array<{ startLine: number; lineCount: number }>): boolean {
  return hunks.some(
    ({ startLine, lineCount }) =>
      // Guard against empty hunks first: without it a zero-length hunk would
      // be treated as if it covered its anchor line.
      lineCount > 0 && line >= startLine && line < startLine + lineCount,
  );
}
890+
891+
/**
 * Split the results of a SARIF document into those that overlap a git diff
 * ("new") and those that do not ("pre-existing").
 *
 * Only the results of the first run (`sarif.runs[0]`) are examined, and only
 * the first location of each result is considered. No git commands are run
 * here; the caller supplies the already-parsed diff.
 *
 * Classification rules:
 * - A result without a location URI is pre-existing and reported with the
 *   file name `<unknown>`.
 * - A result whose URI matches no diff file is pre-existing.
 * - With `'file'` granularity, any result in a matching diff file is new.
 * - With `'line'` granularity, a result in a matching diff file is new when
 *   that file carries no hunk information, or when the result has a start
 *   line that lies inside one of the file's hunks. A result without a start
 *   line in a file that does have hunks is pre-existing.
 *
 * @param sarif       The SARIF document to classify.
 * @param diffFiles   Files changed in the git diff (with optional hunk info).
 * @param refRange    Git ref range, copied verbatim into the summary.
 * @param granularity Matching precision; defaults to `'file'`.
 * @returns Both partitions plus summary counts.
 */
export function diffSarifByCommits(
  sarif: SarifDocument,
  diffFiles: DiffFileEntry[],
  refRange: string,
  granularity: DiffGranularity = 'file',
): SarifDiffByCommitsResult {
  const runResults = sarif.runs[0]?.results ?? [];
  const touched: ClassifiedResult[] = [];
  const untouched: ClassifiedResult[] = [];

  runResults.forEach((finding, index) => {
    const physical = finding.locations?.[0]?.physicalLocation;
    const locationUri = physical?.artifactLocation?.uri;

    // Without a URI there is nothing to compare against the diff, so the
    // result is conservatively kept out of the "new" partition.
    if (!locationUri) {
      untouched.push({
        file: '<unknown>',
        resultIndex: index,
        ruleId: finding.ruleId,
      });
      return;
    }

    const reportedLine = physical?.region?.startLine;
    const changedFile = diffFiles.find(candidate =>
      diffPathMatchesSarifUri(candidate.path, locationUri),
    );

    // File granularity and hunk-less diff entries both reduce to "the file
    // was changed"; otherwise the start line must sit inside a hunk.
    const overlapsDiff =
      changedFile !== undefined &&
      (granularity === 'file' ||
        changedFile.hunks.length === 0 ||
        (reportedLine !== undefined && lineInHunks(reportedLine, changedFile.hunks)));

    const entry: ClassifiedResult = {
      file: normalizeUri(locationUri),
      line: reportedLine,
      resultIndex: index,
      ruleId: finding.ruleId,
    };

    (overlapsDiff ? touched : untouched).push(entry);
  });

  const summary = {
    diffFileCount: diffFiles.length,
    refRange,
    totalNew: touched.length,
    totalPreExisting: untouched.length,
    totalResults: runResults.length,
  };

  return {
    granularity,
    newResults: touched,
    preExistingResults: untouched,
    summary,
  };
}

server/src/resources/server-tools.md

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,14 @@ This resource provides a complete reference of the default tools exposed by the
6060

6161
## SARIF Analysis Tools
6262

63-
| Tool | Description |
64-
| ------------------------ | ---------------------------------------------------------------------------------------------------- |
65-
| `sarif_extract_rule` | Extract all data for a specific rule from multi-rule SARIF. Returns a valid SARIF JSON subset |
66-
| `sarif_list_rules` | List all rules in a SARIF file with result counts, severity, precision, and tags |
67-
| `sarif_rule_to_markdown` | Convert per-rule SARIF data to markdown with Mermaid dataflow diagrams |
68-
| `sarif_compare_alerts` | Compare code locations of two SARIF alerts for overlap (sink, source, any-location, full-path modes) |
69-
| `sarif_diff_runs` | Diff two SARIF files to find added, removed, and changed rules/results across analysis runs |
63+
| Tool | Description |
64+
| ------------------------- | ---------------------------------------------------------------------------------------------------- |
65+
| `sarif_compare_alerts` | Compare code locations of two SARIF alerts for overlap (sink, source, any-location, full-path modes) |
66+
| `sarif_diff_by_commits` | Correlate SARIF results with a git diff to classify findings as "new" or "pre-existing" |
67+
| `sarif_diff_runs` | Diff two SARIF files to find added, removed, and changed rules/results across analysis runs |
68+
| `sarif_extract_rule` | Extract all data for a specific rule from multi-rule SARIF. Returns a valid SARIF JSON subset |
69+
| `sarif_list_rules` | List all rules in a SARIF file with result counts, severity, precision, and tags |
70+
| `sarif_rule_to_markdown` | Convert per-rule SARIF data to markdown with Mermaid dataflow diagrams |
7071

7172
### `sarif_list_rules` Response Format
7273

@@ -157,7 +158,8 @@ Each rule object:
157158
5. `sarif_rule_to_markdown` — generate markdown report with Mermaid dataflow diagrams
158159
6. `sarif_compare_alerts` — compare two alerts for location overlap
159160
7. `sarif_diff_runs` — diff two SARIF files to detect behavioral changes across runs
160-
8. `query_results_cache_compare` with `ruleId` — compare results across databases
161+
8. `sarif_diff_by_commits` — correlate SARIF results with git diff to triage new vs pre-existing
162+
9. `query_results_cache_compare` with `ruleId` — compare results across databases
161163

162164
## Tool Input Conventions
163165

server/src/tools/sarif-tools.ts

Lines changed: 98 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@ import { z } from 'zod';
1111
import {
1212
computeFingerprintOverlap,
1313
computeLocationOverlap,
14+
diffSarifByCommits,
1415
diffSarifRules,
1516
extractRuleFromSarif,
1617
findOverlappingAlerts,
1718
listSarifRules,
1819
sarifRuleToMarkdown,
1920
} from '../lib/sarif-utils';
21+
import type { DiffFileEntry, DiffGranularity } from '../lib/sarif-utils';
2022
import { sessionDataManager } from '../lib/session-data-manager';
2123
import type { SarifResult, SarifRule } from '../types/sarif';
2224
import type { SarifDocument } from '../types/sarif';
@@ -26,12 +28,13 @@ import { logger } from '../utils/logger';
2628
* Register all SARIF analysis tools with the MCP server.
2729
*/
2830
export function registerSarifTools(server: McpServer): void {
29-
registerSarifExtractRuleTool(server);
30-
registerSarifListRulesTool(server);
31-
registerSarifRuleToMarkdownTool(server);
3231
registerSarifCompareAlertsTool(server);
3332
registerSarifDeduplicateRulesTool(server);
33+
registerSarifDiffByCommitsTool(server);
3434
registerSarifDiffRunsTool(server);
35+
registerSarifExtractRuleTool(server);
36+
registerSarifListRulesTool(server);
37+
registerSarifRuleToMarkdownTool(server);
3538
registerSarifStoreTool(server);
3639

3740
logger.info('Registered SARIF analysis tools');
@@ -316,6 +319,98 @@ function registerSarifCompareAlertsTool(server: McpServer): void {
316319
);
317320
}
318321

322+
// ---------------------------------------------------------------------------
323+
// sarif_diff_by_commits
324+
// ---------------------------------------------------------------------------
325+
326+
/**
 * Parse the output of `git diff --unified=0 --diff-filter=ACMR --no-color`
 * into structured DiffFileEntry objects with hunk information.
 *
 * The unified diff format marks file headers with `--- a/path` / `+++ b/path`
 * and hunk headers with `@@ -oldStart,oldCount +newStart,newCount @@`
 * (a missing count means 1). The new-side ("+") start/count of every hunk is
 * recorded for the file introduced by the preceding `+++ b/` header.
 *
 * Parsing details:
 * - The old/new line counts of each hunk header are used to skip exactly the
 *   lines that form the hunk body. This keeps file content that merely looks
 *   like a header (for example an added line whose text starts with `++ b/`,
 *   which appears as `+++ b/…` in the diff) from being parsed as one.
 * - Git appends a TAB to `---`/`+++` header paths that contain spaces; that
 *   trailing TAB is removed from the recorded path.
 * - Pure-deletion hunks (`+N,0`) are recorded with `lineCount: 0` instead of
 *   being dropped. Dropping them would leave a deletion-only file with an
 *   empty `hunks` array, which is indistinguishable from "no hunk
 *   information available".
 *
 * NOTE(review): paths that git emits in quoted form (`+++ "b/…"`, used for
 * paths containing special characters) are not recognized — confirm whether
 * they need to be supported.
 *
 * @param diffOutput Raw text produced by `git diff`.
 * @returns One entry per `+++ b/` file header, in order of appearance.
 */
function parseGitDiffOutput(diffOutput: string): DiffFileEntry[] {
  const hunkHeader = /^@@ -\d+(?:,(\d+))? \+(\d+)(?:,(\d+))? @@/;
  const files: DiffFileEntry[] = [];
  let currentFile: DiffFileEntry | null = null;

  // Number of body lines the current hunk still owes on each side.
  let oldRemaining = 0;
  let newRemaining = 0;

  for (const line of diffOutput.split('\n')) {
    if (oldRemaining > 0 || newRemaining > 0) {
      const marker = line.charAt(0);
      if (marker === '+' && newRemaining > 0) {
        newRemaining--;
        continue;
      }
      if (marker === '-' && oldRemaining > 0) {
        oldRemaining--;
        continue;
      }
      if (marker === ' ' || marker === '') {
        // Context line: counts toward both sides.
        oldRemaining = Math.max(oldRemaining - 1, 0);
        newRemaining = Math.max(newRemaining - 1, 0);
        continue;
      }
      if (marker === '\\') {
        // "\ No newline at end of file" — belongs to neither side.
        continue;
      }
      // Anything else means the body ended early; resynchronize and let the
      // line be examined as a potential header below.
      oldRemaining = 0;
      newRemaining = 0;
    }

    // New file header: +++ b/path/to/file
    if (line.startsWith('+++ b/')) {
      if (currentFile) files.push(currentFile);
      const rawPath = line.substring(6);
      const path = rawPath.endsWith('\t') ? rawPath.slice(0, -1) : rawPath;
      currentFile = { hunks: [], path };
      continue;
    }

    // Hunk header: @@ -oldStart[,oldCount] +newStart[,newCount] @@
    if (currentFile && line.startsWith('@@')) {
      const match = hunkHeader.exec(line);
      if (match) {
        const startLine = parseInt(match[2], 10);
        const lineCount = match[3] !== undefined ? parseInt(match[3], 10) : 1;
        oldRemaining = match[1] !== undefined ? parseInt(match[1], 10) : 1;
        newRemaining = lineCount;
        currentFile.hunks.push({ startLine, lineCount });
      }
    }
  }
  if (currentFile) files.push(currentFile);

  return files;
}
362+
363+
/**
 * Register the `sarif_diff_by_commits` tool.
 *
 * The handler loads a SARIF document (from `sarifPath` or `cacheKey`), runs
 * `git diff` for `refRange` in `repoPath`, parses the diff into changed files
 * and hunks, and returns the classification produced by
 * `diffSarifByCommits()` as pretty-printed JSON text. Load failures, an
 * unusable `refRange`, and git failures are reported as plain text content
 * rather than thrown.
 */
function registerSarifDiffByCommitsTool(server: McpServer): void {
  server.tool(
    'sarif_diff_by_commits',
    'Correlate SARIF results with a git diff to classify findings as "new" (introduced in the diff) or "pre-existing". Accepts a SARIF file and a git ref range (e.g. "main..HEAD"). Supports file-level or line-level granularity.',
    {
      cacheKey: z.string().optional().describe('Cache key to read SARIF from (alternative to sarifPath).'),
      granularity: z.enum(['file', 'line']).optional().default('file')
        .describe('Matching granularity: "file" classifies any result in a changed file as new; "line" additionally checks that the result line falls within a changed hunk. Default: "file".'),
      refRange: z.string().describe('Git ref range for the diff (e.g. "main..HEAD", "abc123..def456"). Passed directly to `git diff`.'),
      repoPath: z.string().optional().describe('Path to the git repository. Defaults to the current working directory.'),
      sarifPath: z.string().optional().describe('Path to the SARIF file.'),
    },
    async ({ sarifPath, cacheKey, refRange, repoPath, granularity }) => {
      // Load SARIF
      const loaded = loadSarif({ sarifPath, cacheKey });
      if (loaded.error || !loaded.sarif) {
        return {
          content: [{
            type: 'text' as const,
            text: loaded.error || 'Failed to load SARIF document.',
          }],
        };
      }

      // refRange comes from the tool caller and ends up on the git command
      // line. A value starting with "-" would be read by git as an option
      // (e.g. `--output=<file>`), so it is rejected up front.
      if (refRange.trim() === '' || refRange.startsWith('-')) {
        return {
          content: [{
            type: 'text' as const,
            text: `Invalid refRange "${refRange}": expected a non-empty git revision or range that does not start with "-".`,
          }],
        };
      }

      // Run git diff to get changed files with hunk info
      const { executeCLICommand } = await import('../lib/cli-executor');
      const gitArgs = [
        'diff',
        '--unified=0',
        '--diff-filter=ACMR',
        '--no-color',
        // The parser expects git's own patch format with "a/" and "b/"
        // prefixes; pin both so user configuration (external diff drivers,
        // diff.noprefix, diff.mnemonicPrefix) cannot change the output shape.
        '--no-ext-diff',
        '--src-prefix=a/',
        '--dst-prefix=b/',
        refRange,
        // Ensure refRange is only ever interpreted as a revision, never as a path.
        '--',
      ];
      const gitResult = await executeCLICommand({
        args: gitArgs,
        command: 'git',
        cwd: repoPath,
      });

      if (!gitResult.success) {
        return {
          content: [{
            type: 'text' as const,
            text: `git diff failed: ${gitResult.error ?? gitResult.stderr}`,
          }],
        };
      }

      const diffFiles = parseGitDiffOutput(gitResult.stdout);
      const effectiveGranularity: DiffGranularity = granularity;
      const result = diffSarifByCommits(loaded.sarif, diffFiles, refRange, effectiveGranularity);

      return {
        content: [{
          type: 'text' as const,
          text: JSON.stringify(result, null, 2),
        }],
      };
    },
  );
}
413+
319414
// ---------------------------------------------------------------------------
320415
// sarif_diff_runs
321416
// ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)