Skip to content

Commit 8247978

Browse files
Implement duplicated code detection prompts, supported by tools. (#109)
1 parent 46be962 commit 8247978

17 files changed

+1149
-26
lines changed

server/dist/codeql-development-mcp-server.js

Lines changed: 197 additions & 5 deletions
Large diffs are not rendered by default.

server/dist/codeql-development-mcp-server.js.map

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

server/src/lib/cli-tool-registry.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,14 @@ export function registerCLITool(server: McpServer, definition: CLIToolDefinition
389389
positionalArgs = [...positionalArgs, query as string];
390390
}
391391
break;
392+
393+
case 'codeql_resolve_library-path':
394+
// --query is a named flag for resolve library-path, not positional.
395+
// It was extracted from options so we need to restore it.
396+
if (query) {
397+
options.query = query;
398+
}
399+
break;
392400

393401
case 'codeql_resolve_queries':
394402
// Handle directory parameter as positional argument

server/src/lib/language-server.ts

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,62 @@ export interface CompletionItem {
102102
sortText?: string;
103103
}
104104

105+
/**
106+
* Symbol kinds as defined by the LSP spec.
107+
*/
108+
/* eslint-disable no-unused-vars */
109+
export enum SymbolKind {
110+
File = 1,
111+
Module = 2,
112+
Namespace = 3,
113+
Package = 4,
114+
Class = 5,
115+
Method = 6,
116+
Property = 7,
117+
Field = 8,
118+
Constructor = 9,
119+
Enum = 10,
120+
Interface = 11,
121+
Function = 12,
122+
Variable = 13,
123+
Constant = 14,
124+
String = 15,
125+
Number = 16,
126+
Boolean = 17,
127+
Array = 18,
128+
Object = 19,
129+
Key = 20,
130+
Null = 21,
131+
EnumMember = 22,
132+
Struct = 23,
133+
Event = 24,
134+
Operator = 25,
135+
TypeParameter = 26,
136+
}
137+
/* eslint-enable no-unused-vars */
138+
139+
/**
140+
* Hierarchical document symbol (returned when the server supports hierarchical symbols).
141+
*/
142+
export interface DocumentSymbol {
143+
children?: DocumentSymbol[];
144+
detail?: string;
145+
kind: SymbolKind;
146+
name: string;
147+
range: LSPRange;
148+
selectionRange: LSPRange;
149+
}
150+
151+
/**
152+
* Flat symbol information (returned when the server does not support hierarchical symbols).
153+
*/
154+
export interface SymbolInformation {
155+
containerName?: string;
156+
kind: SymbolKind;
157+
location: LSPLocation;
158+
name: string;
159+
}
160+
105161
export class CodeQLLanguageServer extends EventEmitter {
106162
private server: ChildProcess | null = null;
107163
private messageId = 1;
@@ -471,6 +527,31 @@ export class CodeQLLanguageServer extends EventEmitter {
471527
return this.normalizeLocations(result);
472528
}
473529

530+
/**
531+
* Get document symbols (i.e. top-level declarations) for a file.
532+
* Returns a hierarchical DocumentSymbol[] when the server supports it, or a
533+
* flat SymbolInformation[] otherwise. Top-level symbols are the root nodes
534+
* of the returned array.
535+
*/
536+
async getDocumentSymbols(params: { textDocument: TextDocumentIdentifier }): Promise<DocumentSymbol[] | SymbolInformation[]> {
537+
if (!this.isInitialized) {
538+
throw new Error('Language server is not initialized');
539+
}
540+
if (!this.isRunning()) {
541+
throw new Error('Language server process is not running');
542+
}
543+
const result = await this.sendRequest('textDocument/documentSymbol', params);
544+
if (!result || !Array.isArray(result) || result.length === 0) {
545+
return [];
546+
}
547+
// Hierarchical DocumentSymbol items have a `selectionRange` field.
548+
if ('selectionRange' in (result[0] as object)) {
549+
return result as DocumentSymbol[];
550+
}
551+
// Flat SymbolInformation items have a `location` field.
552+
return result as SymbolInformation[];
553+
}
554+
474555
/**
475556
* Open a text document in the language server.
476557
* The document must be opened before requesting completions, definitions, etc.
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
---
2+
agent: agent
3+
---
4+
5+
# Check for Duplicated Code
6+
7+
Use the MCP server tools to identify classes, modules, and predicates defined in a
8+
`.ql` or `.qll` file and check for possible "duplicated code," where duplicated code
9+
is defined to be:
10+
11+
- Reimplementing functionality that already exists in the standard library, or shared project `.qll` files, and
12+
- The local definition is identical, or semantically equivalent, or superior to the library definition, or
13+
- The local definition could be simplified by reusing the existing definition (e.g. a base class already exists that captures some of the shared logic)
14+
15+
Here are some examples:
16+
17+
```ql
18+
import cpp
19+
20+
// Duplicated: `StandardNamespace` already exists in the standard library and is identical
21+
class NamespaceStd extends Namespace {
22+
NamespaceStd() { this.getName() = "std" }
23+
}
24+
25+
// Duplicated: class should extend `Operator`, not `Function`
26+
class ThrowingOperator extends Function {
27+
ThrowingOperator() {
28+
// Duplicated: this check is implied by using base class `Operator`
29+
this.getName().matches("%operator%") and
30+
and exists(ThrowExpr te |
31+
// Duplicated: this is equivalent to `te.getEnclosingFunction() = this`
32+
te.getParent*() = this.getAChild()
33+
)
34+
}
35+
36+
// Duplicated: member predicate `getDeclaringType()` already does this.
37+
Class getDefiningClass() { ... }
38+
}
39+
40+
// Duplicated: `ControlFlowNode.getASuccessor()` already exists in `cpp` and is superior
41+
predicate getASuccessor(Stmt a, Stmt b) {
42+
exists(Block b, int i | a = b.getChild(i) and b = b.getChild(i + 1))
43+
}
44+
45+
// Duplicated: prefer to import `semmle.code.cpp.controlflow.Dominance`, defined in dependency pack `cpp-all`
46+
predicate dominates(Block a, Block b) { ... }
47+
```
48+
49+
Duplicate code removal isn't done arbitrarily, but for several key reasons:
50+
51+
- **Maintainability**: Duplicated code must be maintained separately, may diverge and have different bugs
52+
- **Simplicity**: Relying on existing definitions reduces the amount of code to read and understand
53+
- **Readability**: Existing definitions map wrap complex ideas into readable names
54+
- **Consistency**: A single source of truth makes for a more consistent user experience across queries
55+
- **Completeness/Correctness**: Recreating an already-existing definition can miss edge cases, resulting in false positives or false negatives
56+
57+
## Use This Prompt When
58+
59+
- A query file defines a class or predicate whose name sounds generic (e.g.
60+
`StandardNamespace`, `Callable`, `SecurityFeature`)
61+
- Refactoring a query that was written before a relevant library predicate existed
62+
- Reviewing a shared `.qll` file to check whether its helpers have been upstreamed
63+
into the standard library in a newer CodeQL version
64+
- Performing a code-quality audit across a suite of custom queries
65+
66+
## Prerequisites
67+
68+
1. The file path of the `.ql` or `.qll` file to audit for code duplication
69+
2. Understand which packs are imported by `qlpack.yml`
70+
3. Understand where the relevant language library packs are located (e.g. `~/.codeql`)
71+
4. Understand where project-specific library packs are located (e.g. `$LANGUAGE/lib` or `$LANGUAGE/common`)
72+
5. Understand where pack-specific shared `.qll` files are located (e.g. `$PACKROOT/common/*.qll`)
73+
74+
## Overview of the Approach
75+
76+
The core idea is to enumerate every top-level name defined in the file under review.
77+
Then, find candidate `.qll` files, based on file name and path, that are available to
78+
that `.ql` file in review. Then, enumerate the top-level names in each candidate
79+
`.qll` file, to find potential duplicates, dive further if necessary, and then report
80+
the findings as code improvement recommendations to the user.
81+
82+
1. **Read the file** to see its imports and top-level structure
83+
2. **Enumerate top-level definitions** with `codeql_lsp_document_symbols`
84+
3. **Find available .qll files** in the `.ql` file's pack, and its dependencies, including the standard library
85+
4. **Identify promising .qll file candidates** based on their file name and path
86+
5. **Enumerate top-level definitions in candidate `.qll` files** with `codeql_lsp_document_symbols`
87+
6. **Detect overlap, comparing definitions if unclear** (e.g. by using `find_predicate_position` and `find_class_position` tools)
88+
7. **Report findings** as a set of recommendations to the user about which definitions could be improved, by reusing which existing definitions
89+
90+
## Step 1: Read the File and Note Its Imports
91+
92+
```text
93+
Tool: read_file
94+
Parameters:
95+
file_path: /path/to/query.ql
96+
start_line: 1
97+
end_line: 60 # enough to see all imports
98+
```
99+
100+
Record every `import` statement. These are the namespaces the standard library
101+
exposes; duplication is only meaningful for libraries that are already imported (or
102+
easily importable).
103+
104+
## Step 2: Enumerate Top-Level Definitions
105+
106+
Use `codeql_lsp_document_symbols` to retrieve every class, predicate, and module
107+
defined at the top level of the file in a single call:
108+
109+
```text
110+
Tool: codeql_lsp_document_symbols
111+
Parameters:
112+
file_path: /path/to/query.ql
113+
names_only: true # provides significantly smaller response payload
114+
workspace_uri: /path/to/pack-root # directory containing codeql-pack.yml
115+
```
116+
117+
The response contains a `symbols` array. Each entry has:
118+
119+
- `name` — the identifier as written in source
120+
- `kind` — numeric SymbolKind (5 = Class, 12 = Function/predicate, 2 = Module, etc.)
121+
- `range` — the full definition range (0-based lines)
122+
- `selectionRange` — the range of just the name token
123+
- `children` — nested members (for classes and modules)
124+
125+
Top-level symbols are the root nodes of the array; `children` hold member
126+
predicates and fields.
127+
128+
## Step 3. Read the filesystem to find candidate library `.qll` files
129+
130+
Run the tool `codeql_resolve_library-path` with the given ql query file to find where
131+
the available library sources live.
132+
133+
For each source root, run `find $ROOT -name "*.qll"` to find all `.qll` files
134+
available in that pack. Do not preemptively filter this list of qll files. The names
135+
of the files may be broad or nondescriptive. Read all file names for each project to
136+
understand its structure and responsibilities before proceeding to step 3d.
137+
138+
Choose promising candidate `.qll` files you found in the previous step. Pick
139+
candidates that may potentially define behavior relevant to the current query and its
140+
enumerated definitions, based on the candidate filename, path, and priority.
141+
142+
Prioritize as follows:
143+
144+
- `.qll` files in the same directory as the query file have the absolute highest priority
145+
- `.qll` files in the same pack have the next extremely high priority
146+
- `.qll` files in project-specific library packs have the very high priority
147+
- `.qll` files in downloaded direct dependencies have standard priority
148+
- `.qll` files in transitive dependencies have the least priority.
149+
150+
## Step 4: Identify candidate terms in the candidate library `.qll` files
151+
152+
Enumerate the top-level definitions for each candidate `.qll` file using the tool
153+
`codeql_lsp_document_symbols` again. Some top levels may clearly match the name or
154+
purpose of a definition in the query file, while others may only appear as possibly
155+
related.
156+
157+
```text
158+
Tool: codeql_lsp_document_symbols
159+
Parameters:
160+
file_path: /path/to/library/file.qll
161+
workspace_uri: /path/to/pack-root
162+
```
163+
164+
## Step 5: Perform final overlap analysis
165+
166+
For each promising candidate, identify the predicate or class definitions that may overlap. One definition will be in the query file (`.ql`) and the other will be in the library file (`.qll`).
167+
168+
Using the tools `find_predicate_position` and `find_class_position`, you can retrieve the full definition of each predicate or class, and compare them to determine whether they are identical, equivalent, overlapping, or if one is a superior implementation that could be reused by the query file.
169+
170+
Dutifully analyze whether the shared library file definition would reduce code duplication in the categories identified before: maintenance, simplicity, readability, consistency, and completeness/correctness. Consider contextual factors such as comments explaining why the local definition differs from the library one, or whether the local definition is a thin wrapper around the library definition that adds value (e.g. by improving naming or adding extra checks).
171+
172+
Do not go on a wild goose chase trying to find every possible overlap. Consider the likelihood of overlap based on the broadness of functionality, and the value that would be brought be reuse. Do not waste significant time on unimportant or unlikely overlaps.
173+
174+
## Step 6: Report Findings
175+
176+
For each duplicate found, report:
177+
178+
| Local name | Local file | import path | Notes |
179+
| ------------------- | ------------- | -------------------------------- | ---------- |
180+
| `StandardNamespace` | `query.ql:42` | already imported in `import cpp` | Identical |
181+
| `myHelper` | `query.ql:80` | `import myproject.Helpers` | Equivalent |
182+
| `myHelper` | `query.ql:80` | `import myproject.Helpers` | Equivalent |
183+
184+
Recommend one of:
185+
186+
- **Replace**: remove the local definition and use the standard definition directly instead
187+
- **Integrate**: refactor and simplify the local definition by making use of the standard definition
188+
- **Annotate**: add comments to the local definition to explain how it differs from the standard definition and why the duplication is necessary
189+
190+
Additionally, report if any issues came up in using the tools, or finding the qll files.
191+
192+
For each concept for which no duplicate was found, provide at most a **brief** description of what the concept is. Do not provide a long detailed explanation of a non-finding.
193+
194+
# Conclusion
195+
196+
Do **not** perform any updates to any code during this analysis.
197+
198+
As you work through completing this task, ask yourself:
199+
200+
- Have I changed any code, even though that was not my task, or am I about to? Stop, do not change any code!
201+
- Did I sufficiently analyze the definitions such that I likely found most overlapping definitions?
202+
- Will my suggestions improve the maintainability, simplicity, readability, consistency, or completeness/correctness of the codebase?
203+
- Did I report my findings clearly?
204+
- Did I use the suggested LLM tools to their fullest extent?
205+
- Did I follow the steps in the recommended order, and not skip any steps?
206+
- Did I report any issues I had in finding the relevant `.qll` files, or using the tools to analyze definitions?

0 commit comments

Comments
 (0)