diff --git a/CHANGELOG.md b/CHANGELOG.md index fbf4c5d4e..b0b3a2780 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### New Features +- Markdown and shell scripts are now indexed instead of being treated as unknown files. Markdown files contribute searchable heading sections and local documentation links, while `.sh` and `.bash` files contribute shell functions and command-call relationships, so project docs and build/demo scripts show up in search, explore, and impact analysis after re-indexing. - **Subagents and non-MCP agents can now reach CodeGraph.** Two new CLI commands — `codegraph explore ""` and `codegraph node ` — print exactly what the matching MCP tools return (relevant symbols' source + call paths; one symbol's source + callers; file reads with line numbers), so any agent with a shell can use the graph. And `codegraph install` now writes a small marker-fenced CodeGraph section into each agent's instructions file (`CLAUDE.md` / `AGENTS.md` / `GEMINI.md`) pointing at both surfaces — that file is what Task-tool subagents actually see, where the MCP server's own guidance only reaches the main agent. Measured on a delegated code-exploration task: subagents went from almost never using CodeGraph (~1 in 9 runs) to using it in every run, including runs with zero grep/file-reading fallback. The section is small, survives your own content, upgrades cleanly from the old long block, and `codegraph uninstall` removes it. Thanks @liuyao37511. (#704) - **The MCP tool list is now a focused default of four** — `codegraph_explore`, `codegraph_node`, `codegraph_search`, and `codegraph_callers`. 
The other four (`codegraph_callees`, `codegraph_impact`, `codegraph_files`, `codegraph_status`) remain fully functional — the CLI and library API are unchanged, and `CODEGRAPH_MCP_TOOLS` re-enables any of them — but they're no longer listed to agents by default: measured agent behavior shows they're never or rarely picked, and the information they carry already arrives inline on the tools agents do use (explore's blast-radius section, node's dependents note, a symbol's own body as its callee list). A leaner list saves context tokens every session and steers agents to the right tool by presence alone. - **CodeGraph now goes quiet instead of failing loudly in unindexed projects.** When an AI agent's session starts in a workspace that has no CodeGraph index, the MCP server now announces itself as inactive with a short note and lists no tools at all — instead of presenting the full toolset and erroring on every call, which taught agents to distrust CodeGraph even where it works. Querying another project that isn't indexed likewise returns clear guidance (use your regular tools for that codebase; the user can run `codegraph init` there to enable CodeGraph) instead of an error, and genuine internal errors now tell the agent to retry once rather than give up on CodeGraph entirely. Indexing stays your decision — agents are told not to run it themselves. 
(#769) diff --git a/__tests__/extraction.test.ts b/__tests__/extraction.test.ts index 7f2d13f5f..d7cc72a86 100644 --- a/__tests__/extraction.test.ts +++ b/__tests__/extraction.test.ts @@ -101,6 +101,16 @@ describe('Language Detection', () => { expect(detectLanguage('stdio.h', '#ifndef STDIO_H\nvoid printf();\n#endif\n')).toBe('c'); }); + it('should detect Markdown files', () => { + expect(detectLanguage('README.md')).toBe('markdown'); + expect(detectLanguage('docs/guide.markdown')).toBe('markdown'); + }); + + it('should detect shell scripts by extension', () => { + expect(detectLanguage('scripts/build.sh')).toBe('shell'); + expect(detectLanguage('scripts/bootstrap.bash')).toBe('shell'); + }); + it('should return unknown for unsupported extensions', () => { expect(detectLanguage('styles.css')).toBe('unknown'); expect(detectLanguage('data.json')).toBe('unknown'); @@ -129,6 +139,8 @@ describe('Language Support', () => { expect(languages).toContain('swift'); expect(languages).toContain('kotlin'); expect(languages).toContain('dart'); + expect(languages).toContain('markdown'); + expect(languages).toContain('shell'); }); }); @@ -5127,6 +5139,57 @@ export function multiply(a: number, b: number): number { cg.close(); }); + + it('should index Markdown headings and local links', async () => { + const source = `# Project Overview + +See [Setup](docs/setup.md) and [API](#api). + +## API + +Details for callers. 
+`; + + const result = extractFromSource('README.md', source); + + expect(result.errors).toEqual([]); + expect(result.nodes.find((n) => n.kind === 'file')).toMatchObject({ + name: 'README.md', + language: 'markdown', + }); + + const headings = result.nodes.filter((n) => n.kind === 'module').map((n) => n.name); + expect(headings).toEqual(['Project Overview', 'API']); + expect(result.unresolvedReferences.map((r) => r.referenceName)).toEqual(['docs/setup.md', '#api']); + }); + + it('should index shell functions and command calls', () => { + const source = `#!/usr/bin/env bash + +main() { + echo "starting" + helper +} + +helper() { + curl -fsSL https://example.invalid | jq . +} + +main "$@" +`; + + const result = extractFromSource('scripts/build.sh', source); + + expect(result.errors).toEqual([]); + const functions = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name).sort(); + expect(functions).toEqual(['helper', 'main']); + + const calls = result.unresolvedReferences.map((r) => r.referenceName); + expect(calls).toContain('helper'); + expect(calls).toContain('main'); + expect(calls).toContain('curl'); + expect(calls).toContain('jq'); + }); }); describe('Path Normalization', () => { diff --git a/__tests__/security.test.ts b/__tests__/security.test.ts index f46798283..1c8d921c5 100644 --- a/__tests__/security.test.ts +++ b/__tests__/security.test.ts @@ -463,11 +463,13 @@ describe('Source file detection (isSourceFile)', () => { expect(isSourceFile('src/component.tsx')).toBe(true); expect(isSourceFile('lib/util.js')).toBe(true); expect(isSourceFile('src/main.py')).toBe(true); + expect(isSourceFile('README.md')).toBe(true); + expect(isSourceFile('scripts/build.sh')).toBe(true); + expect(isSourceFile('scripts/bootstrap.bash')).toBe(true); }); it('rejects unsupported extensions and extensionless files', () => { expect(isSourceFile('src/component.css')).toBe(false); - expect(isSourceFile('README.md')).toBe(false); expect(isSourceFile('Makefile')).toBe(false); 
expect(isSourceFile('.gitignore')).toBe(false); }); diff --git a/src/extraction/grammars.ts b/src/extraction/grammars.ts index eabdb598e..1c951f0b8 100644 --- a/src/extraction/grammars.ts +++ b/src/extraction/grammars.ts @@ -10,7 +10,7 @@ import * as path from 'path'; import { Parser, Language as WasmLanguage } from 'web-tree-sitter'; import { Language } from '../types'; -export type GrammarLanguage = Exclude; +export type GrammarLanguage = Exclude; /** * WASM filename map — maps each language to its .wasm grammar file @@ -113,6 +113,10 @@ export const EXTENSION_MAP: Record = { // shape as the `.yml` variants — the YAML/properties extractor emits one node // per leaf key, and the Spring resolver links `@Value("${k}")` references. '.properties': 'properties', + '.md': 'markdown', + '.markdown': 'markdown', + '.sh': 'shell', + '.bash': 'shell', }; /** @@ -316,6 +320,8 @@ export function isLanguageSupported(language: Language): boolean { if (language === 'twig') return true; // file-level tracking only if (language === 'xml') return true; // MyBatis mapper extractor if (language === 'properties') return true; // Spring config keys + if (language === 'markdown') return true; // lightweight documentation extractor + if (language === 'shell') return true; // lightweight shell script extractor if (language === 'unknown') return false; return language in WASM_GRAMMAR_FILES; } @@ -326,7 +332,7 @@ export function isLanguageSupported(language: Language): boolean { export function isGrammarLoaded(language: Language): boolean { if (language === 'svelte' || language === 'vue' || language === 'astro' || language === 'liquid' || language === 'razor') return true; if (language === 'yaml' || language === 'twig') return true; // no WASM grammar needed - if (language === 'xml' || language === 'properties') return true; // no WASM grammar needed + if (language === 'xml' || language === 'properties' || language === 'markdown' || language === 'shell') return true; // no WASM grammar 
needed return languageCache.has(language); } @@ -347,7 +353,7 @@ export function isFileLevelOnlyLanguage(language: Language): boolean { * Get all supported languages (those with grammar definitions). */ export function getSupportedLanguages(): Language[] { - return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'astro', 'liquid']; + return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'astro', 'liquid', 'markdown', 'shell']; } /** @@ -424,6 +430,8 @@ export function getLanguageDisplayName(language: Language): string { twig: 'Twig', xml: 'XML', properties: 'Java properties', + markdown: 'Markdown', + shell: 'Shell', unknown: 'Unknown', }; return names[language] || language; diff --git a/src/extraction/markdown-extractor.ts b/src/extraction/markdown-extractor.ts new file mode 100644 index 000000000..28479e460 --- /dev/null +++ b/src/extraction/markdown-extractor.ts @@ -0,0 +1,157 @@ +import * as path from 'path'; +import { Edge, ExtractionError, ExtractionResult, Node, UnresolvedReference } from '../types'; +import { generateNodeId } from './tree-sitter-helpers'; + +/** + * Lightweight extractor for Markdown documentation. + * + * Markdown has useful project structure even when it is not "code": headings + * are navigable sections, and local links often point at related source/docs. + * This extractor keeps the full file searchable via a capped docstring while + * emitting heading nodes and local-link references. 
/**
 * Lightweight extractor for Markdown documentation.
 *
 * Emits one `file` node (whose docstring holds the first 12,000 characters of
 * the document so the text stays searchable), one `module` node per ATX
 * heading (`#` .. `######`), `contains` edges that mirror heading nesting, and
 * one unresolved `references` entry per local link or image target.
 *
 * Lines inside fenced code blocks (``` or ~~~) are ignored, so shell comments
 * and sample links in code examples do not become headings or references.
 * Setext headings (underlined with `===` / `---`) are not recognised.
 */
export class MarkdownExtractor {
  private filePath: string;
  private source: string;
  private nodes: Node[] = [];
  private edges: Edge[] = [];
  private unresolvedReferences: UnresolvedReference[] = [];
  private errors: ExtractionError[] = [];

  /**
   * @param filePath Path recorded on every emitted node and reference.
   * @param source Full text of the Markdown file.
   */
  constructor(filePath: string, source: string) {
    this.filePath = filePath;
    this.source = source;
  }

  /**
   * Runs the extraction.
   *
   * @returns Nodes, edges, unresolved references and errors collected for the
   * file. Any exception thrown while scanning is reported as a `parse_error`
   * entry instead of propagating.
   */
  extract(): ExtractionResult {
    const startTime = Date.now();

    try {
      const fileNode = this.createFileNode();
      this.extractHeadingsAndLinks(fileNode.id);
    } catch (error) {
      this.errors.push({
        message: `Markdown extraction error: ${error instanceof Error ? error.message : String(error)}`,
        filePath: this.filePath,
        severity: 'error',
        code: 'parse_error',
      });
    }

    return {
      nodes: this.nodes,
      edges: this.edges,
      unresolvedReferences: this.unresolvedReferences,
      errors: this.errors,
      durationMs: Date.now() - startTime,
    };
  }

  /** Creates, records and returns the `file` node that spans the whole document. */
  private createFileNode(): Node {
    const lines = this.source.split('\n');
    const node: Node = {
      id: `file:${this.filePath}`,
      kind: 'file',
      name: path.basename(this.filePath),
      qualifiedName: this.filePath,
      filePath: this.filePath,
      language: 'markdown',
      startLine: 1,
      endLine: lines.length || 1,
      startColumn: 0,
      endColumn: lines[lines.length - 1]?.length ?? 0,
      // Capped copy of the text so large documents do not bloat the index.
      docstring: this.source.slice(0, 12_000),
      updatedAt: Date.now(),
    };
    this.nodes.push(node);
    return node;
  }

  /**
   * Scans the document line by line, emitting heading nodes and local-link
   * references.
   *
   * @param fileNodeId Id of the file node; parent of top-level headings and
   * owner of links that appear before the first heading.
   */
  private extractHeadingsAndLinks(fileNodeId: string): void {
    const lines = this.source.split('\n');
    const headingStack: Array<{ level: number; nodeId: string; slug: string }> = [];
    let openFence: OpenFence | null = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i]!;
      const lineNumber = i + 1;

      // Fence delimiters and everything between them are code, not document structure.
      const fenceStep = advanceFence(openFence, line);
      openFence = fenceStep.open;
      if (fenceStep.skip) continue;

      const heading = parseAtxHeading(line);
      if (heading) {
        const { level, title } = heading;
        const slug = slugify(title);
        const parent = findHeadingParent(headingStack, level)?.nodeId ?? fileNodeId;
        const nodeId = generateNodeId(this.filePath, 'module', `${level}:${title}`, lineNumber);

        this.nodes.push({
          id: nodeId,
          kind: 'module',
          name: title,
          qualifiedName: `${this.filePath}#${slug}`,
          filePath: this.filePath,
          language: 'markdown',
          startLine: lineNumber,
          endLine: lineNumber,
          startColumn: 0,
          endColumn: line.length,
          signature: `${'#'.repeat(level)} ${title}`,
          updatedAt: Date.now(),
        });
        this.edges.push({ source: parent, target: nodeId, kind: 'contains' });

        // Drop siblings and deeper headings so the stack always ends at the current section.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1]!.level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, nodeId, slug });
      }

      // Links on a heading line belong to that heading; otherwise to the enclosing section.
      this.extractLocalLinks(line, lineNumber, headingStack[headingStack.length - 1]?.nodeId ?? fileNodeId);
    }
  }

  /**
   * Records every inline link or image on `line` whose target is local.
   *
   * @param line Text of the line being scanned.
   * @param lineNumber 1-based line number stored on each reference.
   * @param fromNodeId Node the references originate from.
   */
  private extractLocalLinks(line: string, lineNumber: number, fromNodeId: string): void {
    const linkPattern = /!?\[[^\]]*\]\(([^)\s]+)(?:\s+"[^"]*")?\)/g;
    let match: RegExpExecArray | null;

    while ((match = linkPattern.exec(line)) !== null) {
      const target = match[1]!;
      if (!isLocalLinkTarget(target)) continue;

      this.unresolvedReferences.push({
        fromNodeId,
        referenceName: target,
        referenceKind: 'references',
        line: lineNumber,
        column: match.index,
        filePath: this.filePath,
        language: 'markdown',
      });
    }
  }
}

/** Opening delimiter of a fenced code block that has not been closed yet. */
interface OpenFence {
  /** Fence character: a backtick or a tilde. */
  marker: string;
  /** Number of fence characters in the opening run. */
  length: number;
}

/**
 * Advances fenced-code-block tracking by one line.
 *
 * @param open The fence currently open, or `null` outside any code block.
 * @param line The line being scanned.
 * @returns `open`: the fence state after this line; `skip`: `true` when the
 * line is a fence delimiter or code-block content and must not be scanned for
 * headings or links. A fence that is never closed runs to the end of the file.
 */
function advanceFence(open: OpenFence | null, line: string): { open: OpenFence | null; skip: boolean } {
  // Leading whitespace is tolerated so fences nested in list items are recognised.
  const match = /^[ \t]*(`{3,}|~{3,})(.*?)\s*$/.exec(line);

  if (open) {
    // A closing fence uses the same character, is at least as long, and carries no info string.
    const closes =
      match !== null &&
      match[1]![0] === open.marker &&
      match[1]!.length >= open.length &&
      match[2] === '';
    return { open: closes ? null : open, skip: true };
  }

  if (match === null) return { open: null, skip: false };

  const run = match[1]!;
  const info = match[2]!;
  // A backtick run followed by more backticks on the same line is inline code, not a fence.
  if (run[0] === '`' && info.includes('`')) return { open: null, skip: false };

  return { open: { marker: run[0]!, length: run.length }, skip: true };
}

/**
 * Parses an ATX heading line.
 *
 * A closing run of `#` is removed only when whitespace precedes it, so
 * `# C#` keeps the title `C#` while `## API ##` yields `API`.
 *
 * @param line A single line, optionally ending in `\r`.
 * @returns The heading level (1-6) and trimmed title, or `null` when the line
 * is not a heading or its title is empty.
 */
function parseAtxHeading(line: string): { level: number; title: string } | null {
  const match = /^(#{1,6})[ \t]+(.+?)(?:[ \t]+#+)?\s*$/.exec(line);
  if (!match) return null;

  const title = match[2]!.trim();
  if (title === '') return null;

  return { level: match[1]!.length, title };
}

/**
 * Finds the nearest enclosing heading for a heading of the given level.
 *
 * @param stack Open headings, outermost first.
 * @param level Level of the heading being inserted.
 * @returns The closest stack entry with a strictly smaller level, or
 * `undefined` when the new heading is top-level.
 */
function findHeadingParent(
  stack: Array<{ level: number; nodeId: string; slug: string }>,
  level: number
): { level: number; nodeId: string; slug: string } | undefined {
  for (let i = stack.length - 1; i >= 0; i--) {
    if (stack[i]!.level < level) return stack[i];
  }
  return undefined;
}

/**
 * Builds an anchor-style slug from a heading title: lower-cased, emphasis
 * markers and punctuation removed, whitespace runs replaced by `-`.
 *
 * Letters, digits and combining marks of any script are kept, so non-ASCII
 * titles do not lose characters or collapse to an empty slug.
 */
function slugify(title: string): string {
  return title
    .toLowerCase()
    .replace(/[`*_~]/g, '')
    .replace(/[^\p{L}\p{N}\p{M}_\s-]/gu, '')
    .trim()
    .replace(/\s+/g, '-');
}

/**
 * Tells whether a link target points inside the project.
 *
 * @returns `true` for in-page anchors and relative/absolute paths; `false`
 * for targets with a URI scheme (`https:`, `mailto:`, ...) and for
 * protocol-relative URLs (`//host/...`).
 */
function isLocalLinkTarget(target: string): boolean {
  if (target.startsWith('#')) return true;
  if (/^[a-z][a-z0-9+.-]*:/i.test(target)) return false;
  if (target.startsWith('//')) return false;
  return true;
}
import * as path from 'path';
import { Edge, ExtractionError, ExtractionResult, Node, UnresolvedReference } from '../types';
import { generateNodeId } from './tree-sitter-helpers';

/**
 * Lightweight extractor for shell scripts.
 *
 * Full Bash parsing is expensive to load in the existing `loadAllGrammars()`
 * test path. For CodeGraph's practical shell use case, function declarations
 * and command references provide the useful graph surface without loading an
 * additional WASM grammar.
 *
 * The scan is line-based: it emits one `file` node, one `function` node per
 * declaration, `contains` edges from the file to each function, and one
 * unresolved `calls` reference per command name found. Function bodies are
 * delimited by counting braces, so heredocs and multi-line quoted strings are
 * not understood.
 */
export class ShellExtractor {
  private filePath: string;
  private source: string;
  private nodes: Node[] = [];
  private edges: Edge[] = [];
  private unresolvedReferences: UnresolvedReference[] = [];
  private errors: ExtractionError[] = [];

  /**
   * @param filePath Path recorded on every emitted node and reference.
   * @param source Full text of the script.
   */
  constructor(filePath: string, source: string) {
    this.filePath = filePath;
    this.source = source;
  }

  /**
   * Runs the extraction.
   *
   * @returns Nodes, edges, unresolved references and errors collected for the
   * file. Any exception thrown while scanning is reported as a `parse_error`
   * entry instead of propagating.
   */
  extract(): ExtractionResult {
    const startTime = Date.now();

    try {
      const fileNode = this.createFileNode();
      this.extractFunctionsAndCommands(fileNode.id);
    } catch (error) {
      this.errors.push({
        message: `Shell extraction error: ${error instanceof Error ? error.message : String(error)}`,
        filePath: this.filePath,
        severity: 'error',
        code: 'parse_error',
      });
    }

    return {
      nodes: this.nodes,
      edges: this.edges,
      unresolvedReferences: this.unresolvedReferences,
      errors: this.errors,
      durationMs: Date.now() - startTime,
    };
  }

  /** Creates, records and returns the `file` node that spans the whole script. */
  private createFileNode(): Node {
    const lines = this.source.split('\n');
    const node: Node = {
      id: `file:${this.filePath}`,
      kind: 'file',
      name: path.basename(this.filePath),
      qualifiedName: this.filePath,
      filePath: this.filePath,
      language: 'shell',
      startLine: 1,
      endLine: lines.length || 1,
      startColumn: 0,
      endColumn: lines[lines.length - 1]?.length ?? 0,
      // Capped copy of the text so large scripts do not bloat the index.
      docstring: this.source.slice(0, 12_000),
      updatedAt: Date.now(),
    };
    this.nodes.push(node);
    return node;
  }

  /**
   * Scans the script line by line, emitting function nodes and command
   * references.
   *
   * Commands are attributed to the function whose braces enclose them, or to
   * the file when outside any function. Text after the opening `{` on a
   * function's header line is treated as body, so one-line functions
   * (`name() { cmd; }`) both record their commands and close on that line.
   * When a function's braces balance, its node range is extended to the
   * closing line.
   *
   * @param fileNodeId Id of the file node; owner of top-level commands.
   */
  private extractFunctionsAndCommands(fileNodeId: string): void {
    const lines = this.source.split('\n');
    // Position in `this.nodes` of the function being scanned; -1 at file scope.
    let openFunctionIndex = -1;
    let braceDepth = 0;

    for (let i = 0; i < lines.length; i++) {
      const rawLine = lines[i]!;
      const lineNumber = i + 1;
      let line = stripTrailingComment(rawLine).trim();
      if (!line) continue;

      // Offset in rawLine from which a command name is searched to compute its column.
      let searchFrom = 0;

      const functionName = matchFunctionName(line);
      if (functionName) {
        const functionId = generateNodeId(this.filePath, 'function', functionName, lineNumber);
        this.nodes.push({
          id: functionId,
          kind: 'function',
          name: functionName,
          qualifiedName: `${this.filePath}::${functionName}`,
          filePath: this.filePath,
          language: 'shell',
          startLine: lineNumber,
          endLine: lineNumber,
          startColumn: rawLine.indexOf(functionName),
          endColumn: rawLine.length,
          signature: `${functionName}()`,
          updatedAt: Date.now(),
        });
        this.edges.push({ source: fileNodeId, target: functionId, kind: 'contains' });
        openFunctionIndex = this.nodes.length - 1;

        const braceAt = line.indexOf('{');
        if (braceAt === -1) {
          // Header only: the opening brace is expected on a following line.
          braceDepth = 0;
          continue;
        }

        // Count the opening brace here and scan whatever follows it as body text.
        braceDepth = 1;
        line = line.slice(braceAt + 1);
        searchFrom = rawLine.indexOf('{') + 1;
      }

      const fromNodeId = openFunctionIndex === -1 ? fileNodeId : this.nodes[openFunctionIndex]!.id;
      for (const command of extractCommands(line)) {
        this.unresolvedReferences.push({
          fromNodeId,
          referenceName: command,
          referenceKind: 'calls',
          line: lineNumber,
          column: rawLine.indexOf(command, searchFrom),
          filePath: this.filePath,
          language: 'shell',
        });
      }

      if (openFunctionIndex !== -1) {
        braceDepth += countBraceDelta(line);
        if (braceDepth <= 0) {
          // Braces balanced: the function ends on this line.
          const closed = this.nodes[openFunctionIndex]!;
          this.nodes[openFunctionIndex] = { ...closed, endLine: lineNumber, endColumn: rawLine.length };
          openFunctionIndex = -1;
          braceDepth = 0;
        }
      }
    }
  }
}
/** Characters that may continue a command name; a keyword followed by one of these is part of a longer name. */
const NAME_CONTINUATION = '[A-Za-z0-9_./:-]';

/** Reserved words (plus `!` and `{`) that are followed by a command within the same segment. */
const LEADING_KEYWORD = new RegExp(
  `^(?:(?:if|then|else|elif|do|while|until)(?!${NAME_CONTINUATION})|[!{](?=\\s|$))\\s*`
);

/** Reserved words after which the segment holds no command name. */
const TERMINAL_KEYWORD = new RegExp(`^(?:done|fi|esac|case|for|select)(?!${NAME_CONTINUATION})`);

/**
 * Yields `[character, index]` for each character of `line` that is outside
 * single/double quotes and not backslash-escaped. Quote characters and
 * backslashes themselves are not yielded. Quote state starts fresh for every
 * line, so strings spanning several lines are not tracked.
 */
function* unquotedChars(line: string): Generator<[string, number]> {
  let quote: string | null = null;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i]!;
    if (quote === "'") {
      // Nothing is special inside single quotes except the closing quote.
      if (ch === "'") quote = null;
      continue;
    }
    if (ch === '\\') {
      // Skip the escaped character, both unquoted and inside double quotes.
      i++;
      continue;
    }
    if (quote === '"') {
      if (ch === '"') quote = null;
      continue;
    }
    if (ch === '"' || ch === "'") {
      quote = ch;
      continue;
    }
    yield [ch, i];
  }
}

/**
 * Recognises a function declaration at the start of a trimmed line.
 *
 * Accepts both `function name [()] [{]` and `name() [{]`. Names may contain
 * `:` and `.` so namespaced functions such as `pkg::build` are recognised.
 *
 * @returns The function name, or `null` when the line is not a declaration.
 */
function matchFunctionName(line: string): string | null {
  const functionKeyword = /^function\s+([A-Za-z_][A-Za-z0-9_:.-]*)\s*(?:\(\s*\))?\s*\{?/.exec(line);
  if (functionKeyword) return functionKeyword[1]!;

  const posixFunction = /^([A-Za-z_][A-Za-z0-9_:.-]*)\s*\(\s*\)\s*\{?/.exec(line);
  return posixFunction?.[1] ?? null;
}

/**
 * Lists the command names invoked on a line.
 *
 * The line is split on `|`, `;` and `&` that appear outside quotes, so
 * separators inside string arguments (`grep -E 'a|b'`, URLs containing `&`)
 * do not produce bogus commands. Each segment contributes at most one name.
 *
 * @param line A comment-stripped line of shell source.
 * @returns Command names in order of appearance; possibly empty.
 */
function extractCommands(line: string): string[] {
  const commands: string[] = [];
  let segmentStart = 0;

  const pushSegment = (end: number): void => {
    const command = firstCommandToken(line.slice(segmentStart, end).trim());
    if (command) commands.push(command);
  };

  for (const [ch, index] of unquotedChars(line)) {
    if (ch === '|' || ch === ';' || ch === '&') {
      pushSegment(index);
      segmentStart = index + 1;
    }
  }
  pushSegment(line.length);

  return commands;
}

/**
 * Returns the command name that a pipeline segment invokes.
 *
 * Leading control keywords that are followed by a command (`if`, `while`,
 * `then`, `!`, ...) and leading `VAR=value` assignments are skipped. Keywords
 * are matched as whole words, so names such as `do-release` are still
 * reported. Path prefixes are removed (`./bin/run.sh` yields `run.sh`).
 *
 * @returns The command's base name, or `null` when the segment holds none
 * (bare keywords, braces, `for`/`case` headers, tests such as `[ ... ]`).
 */
function firstCommandToken(segment: string): string | null {
  let remaining = segment.trim();

  for (;;) {
    if (!remaining || remaining === '}') return null;
    if (TERMINAL_KEYWORD.test(remaining)) return null;
    const keyword = LEADING_KEYWORD.exec(remaining);
    if (!keyword) break;
    remaining = remaining.slice(keyword[0].length);
  }

  // Environment assignments placed before the command (`FOO=1 BAR="x y" cmd`).
  for (;;) {
    const assignment = /^[A-Za-z_][A-Za-z0-9_]*=(?:"[^"]*"|'[^']*'|\S*)\s*/.exec(remaining);
    if (!assignment) break;
    remaining = remaining.slice(assignment[0].length);
  }

  const match = /^([A-Za-z_./][A-Za-z0-9_./:-]*)/.exec(remaining);
  if (!match) return null;
  // basename is empty for a token made only of slashes; report that as "no command".
  return path.basename(match[1]!) || null;
}

/**
 * Removes a trailing `#` comment from a line.
 *
 * A `#` starts a comment only when it is unquoted, unescaped, and either the
 * first character or preceded by whitespace, so `$#`, `"a # b"` and `\#` are
 * left intact.
 *
 * @returns The text before the comment, or the whole line when there is none.
 */
function stripTrailingComment(line: string): string {
  for (const [ch, index] of unquotedChars(line)) {
    if (ch === '#' && (index === 0 || /\s/.test(line[index - 1]!))) {
      return line.slice(0, index);
    }
  }
  return line;
}

/**
 * Returns the number of `{` minus the number of `}` on a line.
 *
 * Every brace is counted, including those inside quotes and `${...}`
 * expansions. That keeps multi-line quoted programs (for example an awk
 * script opened on one line and closed on another) balanced across lines.
 */
function countBraceDelta(line: string): number {
  let delta = 0;
  for (const ch of line) {
    if (ch === '{') delta++;
    if (ch === '}') delta--;
  }
  return delta;
}