Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

### New Features

- **Markdown and shell scripts are now indexed** instead of being treated as unknown files. Markdown files contribute searchable heading sections and local documentation links, while `.sh` and `.bash` files contribute shell functions and command-call relationships, so project docs and build/demo scripts show up in search, explore, and impact analysis after re-indexing.
- **Subagents and non-MCP agents can now reach CodeGraph.** Two new CLI commands — `codegraph explore "<symbols or question>"` and `codegraph node <symbol-or-file>` — print exactly what the matching MCP tools return (relevant symbols' source + call paths; one symbol's source + callers; file reads with line numbers), so any agent with a shell can use the graph. And `codegraph install` now writes a small marker-fenced CodeGraph section into each agent's instructions file (`CLAUDE.md` / `AGENTS.md` / `GEMINI.md`) pointing at both surfaces — that file is what Task-tool subagents actually see, where the MCP server's own guidance only reaches the main agent. Measured on a delegated code-exploration task: subagents went from almost never using CodeGraph (~1 in 9 runs) to using it in every run, including runs with zero grep/file-reading fallback. The section is small, survives your own content, upgrades cleanly from the old long block, and `codegraph uninstall` removes it. Thanks @liuyao37511. (#704)
- **The MCP tool list is now a focused default of four** — `codegraph_explore`, `codegraph_node`, `codegraph_search`, and `codegraph_callers`. The other four (`codegraph_callees`, `codegraph_impact`, `codegraph_files`, `codegraph_status`) remain fully functional — the CLI and library API are unchanged, and `CODEGRAPH_MCP_TOOLS` re-enables any of them — but they're no longer listed to agents by default: measured agent behavior shows they're never or rarely picked, and the information they carry already arrives inline on the tools agents do use (explore's blast-radius section, node's dependents note, a symbol's own body as its callee list). A leaner list saves context tokens every session and steers agents to the right tool by presence alone.
- **CodeGraph now goes quiet instead of failing loudly in unindexed projects.** When an AI agent's session starts in a workspace that has no CodeGraph index, the MCP server now announces itself as inactive with a short note and lists no tools at all — instead of presenting the full toolset and erroring on every call, which taught agents to distrust CodeGraph even where it works. Querying another project that isn't indexed likewise returns clear guidance (use your regular tools for that codebase; the user can run `codegraph init` there to enable CodeGraph) instead of an error, and genuine internal errors now tell the agent to retry once rather than give up on CodeGraph entirely. Indexing stays your decision — agents are told not to run it themselves. (#769)
Expand Down
63 changes: 63 additions & 0 deletions __tests__/extraction.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,16 @@ describe('Language Detection', () => {
expect(detectLanguage('stdio.h', '#ifndef STDIO_H\nvoid printf();\n#endif\n')).toBe('c');
});

it('should detect Markdown files', () => {
expect(detectLanguage('README.md')).toBe('markdown');
expect(detectLanguage('docs/guide.markdown')).toBe('markdown');
});

it('should detect shell scripts by extension', () => {
expect(detectLanguage('scripts/build.sh')).toBe('shell');
expect(detectLanguage('scripts/bootstrap.bash')).toBe('shell');
});

it('should return unknown for unsupported extensions', () => {
expect(detectLanguage('styles.css')).toBe('unknown');
expect(detectLanguage('data.json')).toBe('unknown');
Expand Down Expand Up @@ -129,6 +139,8 @@ describe('Language Support', () => {
expect(languages).toContain('swift');
expect(languages).toContain('kotlin');
expect(languages).toContain('dart');
expect(languages).toContain('markdown');
expect(languages).toContain('shell');
});
});

Expand Down Expand Up @@ -5127,6 +5139,57 @@ export function multiply(a: number, b: number): number {

cg.close();
});

// Markdown extraction: headings are expected as `module` nodes and link
// targets as unresolved references, in document order.
it('should index Markdown headings and local links', async () => {
const source = `# Project Overview

See [Setup](docs/setup.md) and [API](#api).

## API

Details for callers.
`;

const result = extractFromSource('README.md', source);

expect(result.errors).toEqual([]);
// The file node carries the basename and the `markdown` language tag.
expect(result.nodes.find((n) => n.kind === 'file')).toMatchObject({
name: 'README.md',
language: 'markdown',
});

// Both the H1 and the H2 are reported, top to bottom.
const headings = result.nodes.filter((n) => n.kind === 'module').map((n) => n.name);
expect(headings).toEqual(['Project Overview', 'API']);
// A relative file link and an in-page anchor both count as local references.
expect(result.unresolvedReferences.map((r) => r.referenceName)).toEqual(['docs/setup.md', '#api']);
});

// Shell extraction: function definitions are expected as `function` nodes and
// invoked commands (script functions and external programs alike) as
// unresolved references.
it('should index shell functions and command calls', () => {
const source = `#!/usr/bin/env bash

main() {
echo "starting"
helper
}

helper() {
curl -fsSL https://example.invalid | jq .
}

main "$@"
`;

const result = extractFromSource('scripts/build.sh', source);

expect(result.errors).toEqual([]);
// Sorted so the assertion does not depend on emission order.
const functions = result.nodes.filter((n) => n.kind === 'function').map((n) => n.name).sort();
expect(functions).toEqual(['helper', 'main']);

// `helper` is called inside main(), `main` at the top level of the script,
// and `curl` / `jq` are the two sides of the pipeline inside helper().
const calls = result.unresolvedReferences.map((r) => r.referenceName);
expect(calls).toContain('helper');
expect(calls).toContain('main');
expect(calls).toContain('curl');
expect(calls).toContain('jq');
});
});

describe('Path Normalization', () => {
Expand Down
4 changes: 3 additions & 1 deletion __tests__/security.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -463,11 +463,13 @@ describe('Source file detection (isSourceFile)', () => {
expect(isSourceFile('src/component.tsx')).toBe(true);
expect(isSourceFile('lib/util.js')).toBe(true);
expect(isSourceFile('src/main.py')).toBe(true);
expect(isSourceFile('README.md')).toBe(true);
expect(isSourceFile('scripts/build.sh')).toBe(true);
expect(isSourceFile('scripts/bootstrap.bash')).toBe(true);
});

it('rejects unsupported extensions and extensionless files', () => {
expect(isSourceFile('src/component.css')).toBe(false);
expect(isSourceFile('README.md')).toBe(false);
expect(isSourceFile('Makefile')).toBe(false);
expect(isSourceFile('.gitignore')).toBe(false);
});
Expand Down
14 changes: 11 additions & 3 deletions src/extraction/grammars.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import * as path from 'path';
import { Parser, Language as WasmLanguage } from 'web-tree-sitter';
import { Language } from '../types';

export type GrammarLanguage = Exclude<Language, 'svelte' | 'vue' | 'astro' | 'liquid' | 'razor' | 'yaml' | 'twig' | 'xml' | 'properties' | 'unknown'>;
export type GrammarLanguage = Exclude<Language, 'svelte' | 'vue' | 'astro' | 'liquid' | 'razor' | 'yaml' | 'twig' | 'xml' | 'properties' | 'markdown' | 'shell' | 'unknown'>;

/**
* WASM filename map — maps each language to its .wasm grammar file
Expand Down Expand Up @@ -113,6 +113,10 @@ export const EXTENSION_MAP: Record<string, Language> = {
// shape as the `.yml` variants — the YAML/properties extractor emits one node
// per leaf key, and the Spring resolver links `@Value("${k}")` references.
'.properties': 'properties',
'.md': 'markdown',
'.markdown': 'markdown',
'.sh': 'shell',
'.bash': 'shell',
};

/**
Expand Down Expand Up @@ -316,6 +320,8 @@ export function isLanguageSupported(language: Language): boolean {
if (language === 'twig') return true; // file-level tracking only
if (language === 'xml') return true; // MyBatis mapper extractor
if (language === 'properties') return true; // Spring config keys
if (language === 'markdown') return true; // lightweight documentation extractor
if (language === 'shell') return true; // lightweight shell script extractor
if (language === 'unknown') return false;
return language in WASM_GRAMMAR_FILES;
}
Expand All @@ -326,7 +332,7 @@ export function isLanguageSupported(language: Language): boolean {
export function isGrammarLoaded(language: Language): boolean {
if (language === 'svelte' || language === 'vue' || language === 'astro' || language === 'liquid' || language === 'razor') return true;
if (language === 'yaml' || language === 'twig') return true; // no WASM grammar needed
if (language === 'xml' || language === 'properties') return true; // no WASM grammar needed
if (language === 'xml' || language === 'properties' || language === 'markdown' || language === 'shell') return true; // no WASM grammar needed
return languageCache.has(language);
}

Expand All @@ -347,7 +353,7 @@ export function isFileLevelOnlyLanguage(language: Language): boolean {
* Get all supported languages (those with grammar definitions).
*/
export function getSupportedLanguages(): Language[] {
return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'astro', 'liquid'];
return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'astro', 'liquid', 'markdown', 'shell'];
}

/**
Expand Down Expand Up @@ -424,6 +430,8 @@ export function getLanguageDisplayName(language: Language): string {
twig: 'Twig',
xml: 'XML',
properties: 'Java properties',
markdown: 'Markdown',
shell: 'Shell',
unknown: 'Unknown',
};
return names[language] || language;
Expand Down
157 changes: 157 additions & 0 deletions src/extraction/markdown-extractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import * as path from 'path';
import { Edge, ExtractionError, ExtractionResult, Node, UnresolvedReference } from '../types';
import { generateNodeId } from './tree-sitter-helpers';

/**
 * Lightweight extractor for Markdown documentation.
 *
 * Markdown has useful project structure even when it is not "code": headings
 * are navigable sections, and local links often point at related source/docs.
 * This extractor keeps the full file searchable via a capped docstring while
 * emitting heading nodes and local-link references.
 *
 * Lines inside fenced code blocks (``` or ~~~) are skipped, so a shell comment
 * such as `# install deps` in an example is not mistaken for a heading and
 * links that only appear in sample code are not reported as references.
 */
export class MarkdownExtractor {
  /** Maximum number of leading characters of the file kept on the file node. */
  private static readonly MAX_DOCSTRING_LENGTH = 12_000;

  /**
   * ATX heading: 1-6 `#`, whitespace, the title, then an optional closing run
   * of `#` that must be separated from the title by whitespace. Requiring the
   * separator keeps titles such as `C#` or `F#` intact.
   */
  private static readonly HEADING_PATTERN = /^(#{1,6})\s+(.+?)(?:\s+#+)?\s*$/;

  /**
   * Code fence line: up to three spaces of indentation, a run of at least
   * three backticks or tildes (group 1), then the rest of the line (group 2).
   * `[^\n]` is used instead of `.` so a trailing `\r` from CRLF files matches.
   */
  private static readonly FENCE_PATTERN = /^ {0,3}(`{3,}|~{3,})([^\n]*)$/;

  private filePath: string;
  private source: string;
  private nodes: Node[] = [];
  private edges: Edge[] = [];
  private unresolvedReferences: UnresolvedReference[] = [];
  private errors: ExtractionError[] = [];

  /**
   * @param filePath Path of the Markdown file; used for node ids and names.
   * @param source Full text of the file.
   */
  constructor(filePath: string, source: string) {
    this.filePath = filePath;
    this.source = source;
  }

  /**
   * Run the extraction.
   *
   * Never throws: any failure is recorded as a `parse_error` entry in
   * `errors`, and whatever was collected up to that point is still returned.
   *
   * @returns The file node, heading nodes, `contains` edges, local-link
   *          references, errors, and the elapsed time in milliseconds.
   */
  extract(): ExtractionResult {
    const startTime = Date.now();

    try {
      const fileNode = this.createFileNode();
      this.extractHeadingsAndLinks(fileNode.id);
    } catch (error) {
      this.errors.push({
        message: `Markdown extraction error: ${error instanceof Error ? error.message : String(error)}`,
        filePath: this.filePath,
        severity: 'error',
        code: 'parse_error',
      });
    }

    return {
      nodes: this.nodes,
      edges: this.edges,
      unresolvedReferences: this.unresolvedReferences,
      errors: this.errors,
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Create and register the node that represents the whole file. Its
   * docstring holds the leading part of the raw text so the document body
   * stays searchable without storing arbitrarily large files.
   */
  private createFileNode(): Node {
    const lines = this.source.split('\n');
    const node: Node = {
      id: `file:${this.filePath}`,
      kind: 'file',
      name: path.basename(this.filePath),
      qualifiedName: this.filePath,
      filePath: this.filePath,
      language: 'markdown',
      startLine: 1,
      endLine: lines.length || 1,
      startColumn: 0,
      endColumn: lines[lines.length - 1]?.length ?? 0,
      docstring: this.source.slice(0, MarkdownExtractor.MAX_DOCSTRING_LENGTH),
      updatedAt: Date.now(),
    };
    this.nodes.push(node);
    return node;
  }

  /**
   * Walk the file line by line, emitting one `module` node per ATX heading
   * (nested under the nearest shallower heading, or the file) and one
   * unresolved reference per local link. Fenced code blocks are skipped
   * entirely.
   *
   * @param fileNodeId Id of the file node; parent for top-level headings and
   *                   owner of links that appear before the first heading.
   */
  private extractHeadingsAndLinks(fileNodeId: string): void {
    const lines = this.source.split('\n');
    const headingStack: Array<{ level: number; nodeId: string; slug: string }> = [];
    // The code fence we are currently inside, if any.
    let openFence: { marker: string; length: number } | null = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i]!;
      const lineNumber = i + 1;

      const fenceMatch = MarkdownExtractor.FENCE_PATTERN.exec(line);
      if (openFence !== null) {
        // A fence closes only on the same marker character, a run at least as
        // long as the opener, and nothing but whitespace after it.
        const closesFence =
          fenceMatch !== null &&
          fenceMatch[1]!.startsWith(openFence.marker) &&
          fenceMatch[1]!.length >= openFence.length &&
          fenceMatch[2]!.trim() === '';
        if (closesFence) openFence = null;
        continue;
      }
      if (fenceMatch !== null) {
        const marker = fenceMatch[1]!.charAt(0);
        // The info string of a backtick fence cannot contain backticks; a line
        // like "```code``` text" is inline code, not an opening fence.
        if (marker === '~' || !fenceMatch[2]!.includes('`')) {
          openFence = { marker, length: fenceMatch[1]!.length };
          continue;
        }
      }

      const headingMatch = MarkdownExtractor.HEADING_PATTERN.exec(line);
      if (headingMatch !== null) {
        const level = headingMatch[1]!.length;
        const title = headingMatch[2]!.trim();
        const slug = slugify(title);
        const parent = findHeadingParent(headingStack, level)?.nodeId ?? fileNodeId;
        const nodeId = generateNodeId(this.filePath, 'module', `${level}:${title}`, lineNumber);

        this.nodes.push({
          id: nodeId,
          kind: 'module',
          name: title,
          qualifiedName: `${this.filePath}#${slug}`,
          filePath: this.filePath,
          language: 'markdown',
          startLine: lineNumber,
          endLine: lineNumber,
          startColumn: 0,
          endColumn: line.length,
          signature: `${'#'.repeat(level)} ${title}`,
          updatedAt: Date.now(),
        });
        this.edges.push({ source: parent, target: nodeId, kind: 'contains' });

        // Drop siblings and deeper headings so the stack top is always the
        // innermost section that is still open.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1]!.level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, nodeId, slug });
      }

      // Links are attributed to the innermost open heading; a link on a
      // heading line itself therefore belongs to that new heading.
      this.extractLocalLinks(line, lineNumber, headingStack[headingStack.length - 1]?.nodeId ?? fileNodeId);
    }
  }

  /**
   * Record every inline link or image on `line` whose target is local
   * (relative path or in-page anchor) as an unresolved `references` entry.
   *
   * @param line Text of the line being scanned.
   * @param lineNumber 1-based line number of `line`.
   * @param fromNodeId Node that the references originate from.
   */
  private extractLocalLinks(line: string, lineNumber: number, fromNodeId: string): void {
    // Created per call: a /g regex carries `lastIndex` state between exec()s.
    const linkPattern = /!?\[[^\]]*\]\(([^)\s]+)(?:\s+"[^"]*")?\)/g;
    let match: RegExpExecArray | null;

    while ((match = linkPattern.exec(line)) !== null) {
      const target = match[1]!;
      if (!isLocalLinkTarget(target)) continue;

      this.unresolvedReferences.push({
        fromNodeId,
        referenceName: target,
        referenceKind: 'references',
        line: lineNumber,
        column: match.index,
        filePath: this.filePath,
        language: 'markdown',
      });
    }
  }
}

/**
 * Locate the heading that should contain a new heading of the given level.
 *
 * The stack is searched from its most recent entry backwards, and the first
 * entry with a strictly smaller level is returned. The stack is not modified.
 *
 * @param stack Open headings, outermost first.
 * @param level Level (1-6) of the heading looking for a parent.
 * @returns The nearest shallower heading, or `undefined` when there is none.
 */
function findHeadingParent(
  stack: Array<{ level: number; nodeId: string; slug: string }>,
  level: number
): { level: number; nodeId: string; slug: string } | undefined {
  // Reverse a copy so the caller's stack keeps its order.
  const innermostFirst = [...stack].reverse();
  return innermostFirst.find((entry) => entry.level < level);
}

/**
 * Turn a heading title into an anchor-style slug: lower-cased, inline
 * emphasis/code markers removed, punctuation dropped, and runs of whitespace
 * collapsed to single hyphens.
 *
 * Letters, digits and combining marks from any script are kept. An ASCII-only
 * filter would reduce titles such as "Überblick" to "berblick" and CJK titles
 * to an empty string, making unrelated headings share the same slug.
 *
 * @param title Heading text without the leading `#` markers.
 * @returns The slug; empty when the title has no letters, digits or hyphens.
 */
function slugify(title: string): string {
  return title
    .toLowerCase()
    .replace(/[`*_~]/g, '')
    .replace(/[^\p{L}\p{N}\p{M}_\s-]/gu, '')
    .trim()
    .replace(/\s+/g, '-');
}

/**
 * Decide whether a Markdown link target points inside the project.
 *
 * In-page anchors (`#section`) and relative paths are local. Anything with a
 * URI scheme (`https:`, `mailto:`, ...) or a protocol-relative `//host` prefix
 * is external.
 *
 * @param target Raw link destination as written between the parentheses.
 * @returns `true` when the target is an anchor or a relative path.
 */
function isLocalLinkTarget(target: string): boolean {
  const isAnchor = target.startsWith('#');
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(target);
  const isProtocolRelative = target.startsWith('//');
  return isAnchor || !(hasScheme || isProtocolRelative);
}
Loading