Address latest PR review comments

data-douser · data-douser · commit 1f4b543b2d35 · 2026-02-23T05:00:57.000-07:00
- Register run-query-and-summarize-false-positives prompt in PROMPT_TEMPLATES
  so it is embedded in the bundle instead of returning the fallback message
- Replace backtick tool references with #tool_name form in the prompt
  template for consistent Copilot Chat rendering
- Fix CRLF line splitting in applyLineRange (split on /\r?\n/)
- Add default listing cap (1000 entries) to prevent oversized MCP responses
  from large src.zip archives
- Enforce max uncompressed size (10 MB) before decompressing zip entries
- Use dirname(fullPath) instead of join(fullPath, '..') in test helper for clarity
diff --git a/server/dist/codeql-development-mcp-server.js b/server/dist/codeql-development-mcp-server.js
diff --git a/server/dist/codeql-development-mcp-server.js.map b/server/dist/codeql-development-mcp-server.js.map
diff --git a/server/src/prompts/prompt-loader.ts b/server/src/prompts/prompt-loader.ts
@@ -13,6 +13,7 @@ import explainCodeqlQuery from './explain-codeql-query.prompt.md';
 import qlLspIterativeDevelopment from './ql-lsp-iterative-development.prompt.md';
 import qlTddAdvanced from './ql-tdd-advanced.prompt.md';
 import qlTddBasic from './ql-tdd-basic.prompt.md';
+import runQueryAndSummarizeFalsePositives from './run-query-and-summarize-false-positives.prompt.md';
 import sarifRankFalsePositives from './sarif-rank-false-positives.prompt.md';
 import sarifRankTruePositives from './sarif-rank-true-positives.prompt.md';
 import toolsQueryWorkflow from './tools-query-workflow.prompt.md';
@@ -31,6 +32,7 @@ const PROMPT_TEMPLATES: Record<string, string> = {
   'ql-lsp-iterative-development.prompt.md': qlLspIterativeDevelopment,
   'ql-tdd-advanced.prompt.md': qlTddAdvanced,
   'ql-tdd-basic.prompt.md': qlTddBasic,
+  'run-query-and-summarize-false-positives.prompt.md': runQueryAndSummarizeFalsePositives,
   'sarif-rank-false-positives.prompt.md': sarifRankFalsePositives,
   'sarif-rank-true-positives.prompt.md': sarifRankTruePositives,
   'tools-query-workflow.prompt.md': toolsQueryWorkflow,
diff --git a/server/src/prompts/run-query-and-summarize-false-positives.prompt.md b/server/src/prompts/run-query-and-summarize-false-positives.prompt.md
@@ -12,13 +12,13 @@ Help a developer discover what kinds of false positives are produced by their cu
 
 1. Read the provided CodeQL query to understand what patterns it is designed to detect.
 2. Discover the results of this query on a real database, by:
-   - Running the tool `list_query_run_results` to find existing runs for this query
-   - If no existing runs are found, run the query on a relevant database using `codeql_query_run` tool
+   - Running the tool #list_query_run_results to find existing runs for this query
+   - If no existing runs are found, run the query on a relevant database using the #codeql_query_run tool
 3. Analyze and group the results into what appear to be similar types of results. This may mean:
    - Grouping results in the same file
    - Grouping results that reference the same elements
    - Grouping results with similar messages
-4. For each group, explore the actual code for a sample of alerts in that group, using the `read_database_source` tool to triage the results and determine which groups appear to be false positives
+4. For each group, explore the actual code for a sample of alerts in that group, using the #read_database_source tool to triage the results and determine which groups appear to be false positives
 5. For each false positive case discovered in this exploration, group them into categories of similar root causes. For example, a query might not properly account for unreachable code, or there may be a commonly used library that violates the query's assumptions but is actually safe.
 6. Explain these results to the user in order of most common to least common, so they can understand where their query may need improvement to reduce false positives.
 
@@ -32,7 +32,7 @@ You will be provided with:
 
 ### Exploring code paths
 
-The tool `read_database_source` can be used to read the code of a particular finding. A good strategy to explore the code paths of a finding is:
+The tool #read_database_source can be used to read the code of a particular finding. A good strategy to explore the code paths of a finding is:
 
 1. Read in the immediate context of the violation.
    - Some queries may depend on later context (e.g., an "unused variable" may only be used after its declaration)
diff --git a/server/src/tools/codeql/read-database-source.ts b/server/src/tools/codeql/read-database-source.ts
@@ -25,6 +25,25 @@ import { fileURLToPath } from 'url';
 import { z } from 'zod';
 import { logger } from '../../utils/logger';
 
+// ---------------------------------------------------------------------------
+// Constants
+// ---------------------------------------------------------------------------
+
+/**
+ * Default maximum number of entries returned in listing mode when the caller
+ * does not provide an explicit `maxEntries` value.  Real databases can contain
+ * very large numbers of files; capping the response avoids oversized MCP
+ * responses and high memory usage.
+ */
+const DEFAULT_MAX_LISTING_ENTRIES = 1000;
+
+/**
+ * Maximum uncompressed size (in bytes) for a single zip entry that the tool
+ * will decompress into memory.  Prevents runaway memory usage when a src.zip
+ * contains unexpectedly large files.  The limit is 10 MiB (10 * 1024 * 1024 bytes).
+ */
+const MAX_UNCOMPRESSED_BYTES = 10 * 1024 * 1024;
+
 // ---------------------------------------------------------------------------
 // Core implementation
 // ---------------------------------------------------------------------------
@@ -161,7 +180,7 @@ function applyLineRange(
   startLine?: number,
   endLine?: number,
 ): { content: string; effectiveEnd: number; effectiveStart: number; totalLines: number } {
-  const lines = content.split('\n');
+  const lines = content.split(/\r?\n/);
   const totalLines = lines.length;
   const effectiveStart = Math.max(1, startLine ?? 1);
   const effectiveEnd = Math.min(totalLines, endLine ?? totalLines);
@@ -226,8 +245,9 @@ export async function readDatabaseSource(
     }
 
     const totalEntries = allEntries.length;
-    const truncated = maxEntries !== undefined && maxEntries < totalEntries;
-    const entries = truncated ? allEntries.slice(0, maxEntries) : allEntries;
+    const effectiveMax = maxEntries ?? DEFAULT_MAX_LISTING_ENTRIES;
+    const truncated = effectiveMax < totalEntries;
+    const entries = truncated ? allEntries.slice(0, effectiveMax) : allEntries;
 
     return {
       entries,
@@ -262,6 +282,14 @@ export async function readDatabaseSource(
       throw new Error(`Failed to read entry from src.zip: ${matchedEntry}`);
     }
 
+    const rawSize = entry.header.size;
+    if (rawSize > MAX_UNCOMPRESSED_BYTES) {
+      throw new Error(
+        `Entry "${matchedEntry}" is too large to read (${rawSize} bytes, limit ${MAX_UNCOMPRESSED_BYTES}). ` +
+          `Use startLine/endLine on a smaller file, or increase the limit.`,
+      );
+    }
+
     const rawContent = entry.getData().toString('utf-8');
     const { content, effectiveEnd, effectiveStart, totalLines } = applyLineRange(
       rawContent,
@@ -347,7 +375,8 @@ export function registerReadDatabaseSourceTool(server: McpServer): void {
         .optional()
         .describe(
           'Maximum number of entries to return in listing mode (when filePath is omitted). ' +
-            'When the total exceeds this limit the response includes truncated: true.',
+            'Defaults to 1000. When the total exceeds this limit the response includes truncated: true. ' +
+            'Use prefix to narrow results for large databases.',
         ),
       prefix: z
         .string()
diff --git a/server/test/src/tools/codeql/read-database-source.test.ts b/server/test/src/tools/codeql/read-database-source.test.ts
@@ -4,7 +4,7 @@
 
 import AdmZip from 'adm-zip';
 import { promises as fs } from 'fs';
-import { join } from 'path';
+import { dirname, join } from 'path';
 import { afterEach, beforeEach, describe, expect, it } from 'vitest';
 import {
   readDatabaseSource,
@@ -48,7 +48,7 @@ async function createDirDatabase(dir: string): Promise<void> {
   await fs.writeFile(join(dir, 'codeql-database.yml'), 'primaryLanguage: java\n');
   for (const [entryPath, content] of Object.entries(SAMPLE_FILES)) {
     const fullPath = join(dir, 'src', entryPath);
-    await fs.mkdir(join(fullPath, '..'), { recursive: true });
+    await fs.mkdir(dirname(fullPath), { recursive: true });
     await fs.writeFile(fullPath, content, 'utf-8');
   }
 }