Add core modules (parser, chunking, formatter, dependency, summarizer)

2026-02-01 23:44:50 +00:00
parent 0c1826892e
commit be6ed40df2

codechunk/core/formatter.py (new file, 197 additions)

@@ -0,0 +1,197 @@
import json
from typing import List

from codechunk.core.chunking import ParsedChunk


class OutputFormatter:
    """Format output for different LLM platforms."""

    def __init__(self, format_type: str = "markdown", max_tokens: int = 8192):
        self.format_type = format_type
        self.max_tokens = max_tokens
        # Ratios of max_tokens at which check_token_limit escalates severity.
        self.token_warning_thresholds = [0.7, 0.9, 1.0]

    def format(self, chunks: List[ParsedChunk]) -> str:
        """Format chunks for output."""
        if self.format_type == "ollama":
            return self._format_ollama(chunks)
        elif self.format_type == "lmstudio":
            return self._format_lmstudio(chunks)
        else:
            return self._format_markdown(chunks)

    def _format_ollama(self, chunks: List[ParsedChunk]) -> str:
        """Format for Ollama."""
        lines = []
        lines.append("### System")
        lines.append("You are a helpful programming assistant analyzing a codebase.")
        lines.append("")
        lines.append("### User")
        lines.append("Analyze the following code:")
        lines.append("")
        for chunk in chunks:
            lines.append(f"**File: {chunk.metadata.file_path}**")
            lines.append(f"**Type: {chunk.chunk_type}**")
            lines.append(f"**Name: {chunk.name}**")
            if chunk.metadata.docstring:
                lines.append(f"**Description: {chunk.metadata.docstring}**")
            if chunk.metadata.parameters:
                lines.append(f"**Parameters: {', '.join(chunk.metadata.parameters)}**")
            if chunk.metadata.return_type:
                lines.append(f"**Returns: {chunk.metadata.return_type}**")
            if chunk.priority >= 75:
                lines.append("**Priority: HIGH**")
            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")
        return "\n".join(lines)

    def _format_lmstudio(self, chunks: List[ParsedChunk]) -> str:
        """Format for LM Studio as an OpenAI-style chat messages JSON array."""
        messages = []
        system_content = """You are a helpful programming assistant analyzing a codebase.
Provide clear, accurate code analysis and assistance."""
        messages.append({
            "role": "system",
            "content": system_content,
        })
        content_parts = []
        content_parts.append("# Codebase Analysis")
        content_parts.append("")
        content_parts.append(f"Total files/chunks: {len(chunks)}")
        content_parts.append("")
        for chunk in chunks:
            content_parts.append(f"## {chunk.metadata.file_name}")
            content_parts.append(f"**Type:** {chunk.chunk_type}")
            content_parts.append(f"**Name:** {chunk.name}")
            content_parts.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")
            if chunk.metadata.docstring:
                content_parts.append(f"**Description:** {chunk.metadata.docstring}")
            content_parts.append("")
            content_parts.append("```" + self._get_language_lexer(chunk.metadata.language))
            content_parts.append(chunk.content)
            content_parts.append("```")
            content_parts.append("")
        messages.append({
            "role": "user",
            "content": "\n".join(content_parts),
        })
        return json.dumps(messages, indent=2)

    def _format_markdown(self, chunks: List[ParsedChunk]) -> str:
        """Format as markdown."""
        lines = []
        lines.append("# Code Context")
        lines.append("")
        lines.append(f"_Generated by CodeChunk CLI | {len(chunks)} chunks_")
        lines.append("")
        for chunk in chunks:
            lines.append(f"## {chunk.name}")
            lines.append("")
            lines.append(f"**File:** `{chunk.metadata.file_path}`")
            lines.append(f"**Type:** {chunk.chunk_type}")
            lines.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")
            if chunk.metadata.docstring:
                lines.append("")
                lines.append(f"> {chunk.metadata.docstring}")
            if chunk.metadata.parameters:
                lines.append(f"**Parameters:** `{'`, `'.join(chunk.metadata.parameters)}`")
            if chunk.metadata.return_type:
                lines.append(f"**Returns:** `{chunk.metadata.return_type}`")
            if chunk.priority >= 75:
                lines.append("")
                lines.append("⭐ **High Priority**")
            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")
        return "\n".join(lines)

    def _get_language_lexer(self, language: str) -> str:
        """Get the language identifier for a fenced code block."""
        # Identity map doubling as a whitelist: unknown languages fall
        # back to "" so the fence is rendered without an info string.
        language_map = {
            "python": "python",
            "javascript": "javascript",
            "typescript": "typescript",
            "go": "go",
            "rust": "rust",
            "java": "java",
            "cpp": "cpp",
            "c": "c",
            "csharp": "csharp",
            "ruby": "ruby",
            "php": "php",
            "swift": "swift",
            "kotlin": "kotlin",
            "scala": "scala",
            "lua": "lua",
            "perl": "perl",
            "haskell": "haskell",
            "elixir": "elixir",
            "erlang": "erlang",
        }
        return language_map.get(language, "")

    def estimate_tokens(self, text: str) -> int:
        """Estimate the token count for text.

        Uses the rough rule of thumb of ~4 characters per token, a common
        approximation for BPE-style tokenizers on English text and code.
        """
        avg_chars_per_token = 4
        return len(text) // avg_chars_per_token

    def check_token_limit(self, text: str) -> tuple[bool, float, str]:
        """Check text against the token limit.

        Returns (fits, ratio, severity). Text at or above 90% of the
        limit is reported as not fitting, leaving headroom for the
        model's response.
        """
        token_count = self.estimate_tokens(text)
        ratio = token_count / self.max_tokens
        info, warning, critical = self.token_warning_thresholds
        if ratio >= critical:
            return False, ratio, "CRITICAL"
        elif ratio >= warning:
            return False, ratio, "WARNING"
        elif ratio >= info:
            return True, ratio, "INFO"
        else:
            return True, ratio, "OK"

    def prune_for_limit(self, chunks: List[ParsedChunk], max_tokens: int) -> List[ParsedChunk]:
        """Greedily keep chunks, in order, until the token budget is spent.

        Chunks that would overflow the budget are skipped rather than
        truncated, so a smaller later chunk may still be included. Only
        chunk content is counted, not formatting overhead.
        """
        result = []
        current_tokens = 0
        for chunk in chunks:
            chunk_tokens = self.estimate_tokens(chunk.content)
            if current_tokens + chunk_tokens <= max_tokens:
                result.append(chunk)
                current_tokens += chunk_tokens
        return result
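
A minimal usage sketch of the new formatter (not part of this commit; `load_chunks` is a hypothetical stand-in for whatever codechunk.core.chunking actually exposes to produce ParsedChunk objects):

    from codechunk.core.chunking import load_chunks  # hypothetical helper
    from codechunk.core.formatter import OutputFormatter

    chunks = load_chunks("src/")
    formatter = OutputFormatter(format_type="markdown", max_tokens=8192)

    # Trim to the budget first, then render and sanity-check the estimate.
    chunks = formatter.prune_for_limit(chunks, formatter.max_tokens)
    output = formatter.format(chunks)
    fits, ratio, severity = formatter.check_token_limit(output)
    print(f"{severity}: ~{ratio:.0%} of the {formatter.max_tokens}-token budget")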