Add core modules (parser, chunking, formatter, dependency, summarizer)
codechunk/core/formatter.py (new file)
@@ -0,0 +1,197 @@
import json
from typing import List

from codechunk.core.chunking import ParsedChunk


class OutputFormatter:
    """Format output for different LLM platforms."""

    def __init__(self, format_type: str = "markdown", max_tokens: int = 8192):
        self.format_type = format_type
        self.max_tokens = max_tokens
        # Ratios at which check_token_limit escalates its severity label.
        self.token_warning_thresholds = [0.7, 0.9, 1.0]

    def format(self, chunks: List[ParsedChunk]) -> str:
        """Format chunks for output."""
        if self.format_type == "ollama":
            return self._format_ollama(chunks)
        elif self.format_type == "lmstudio":
            return self._format_lmstudio(chunks)
        else:
            return self._format_markdown(chunks)

    def _format_ollama(self, chunks: List[ParsedChunk]) -> str:
        """Format for Ollama."""
        lines = []
        lines.append("### System")
        lines.append("You are a helpful programming assistant analyzing a codebase.")
        lines.append("")
        lines.append("### User")
        lines.append("Analyze the following code:")
        lines.append("")

        for chunk in chunks:
            lines.append(f"**File: {chunk.metadata.file_path}**")
            lines.append(f"**Type: {chunk.chunk_type}**")
            lines.append(f"**Name: {chunk.name}**")

            if chunk.metadata.docstring:
                lines.append(f"**Description: {chunk.metadata.docstring}**")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters: {', '.join(chunk.metadata.parameters)}**")

            if chunk.metadata.return_type:
                lines.append(f"**Returns: {chunk.metadata.return_type}**")

            if chunk.priority >= 75:
                lines.append("**Priority: HIGH**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _format_lmstudio(self, chunks: List[ParsedChunk]) -> str:
        """Format for LM Studio (OpenAI-style chat messages serialized as JSON)."""
        messages = []

        system_content = """You are a helpful programming assistant analyzing a codebase.
Provide clear, accurate code analysis and assistance."""

        messages.append({
            "role": "system",
            "content": system_content
        })

        content_parts = []
        content_parts.append("# Codebase Analysis")
        content_parts.append("")
        content_parts.append(f"Total files/chunks: {len(chunks)}")
        content_parts.append("")

        for chunk in chunks:
            content_parts.append(f"## {chunk.metadata.file_name}")
            content_parts.append(f"**Type:** {chunk.chunk_type}")
            content_parts.append(f"**Name:** {chunk.name}")
            content_parts.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                content_parts.append(f"**Description:** {chunk.metadata.docstring}")

            content_parts.append("")
            content_parts.append("```" + self._get_language_lexer(chunk.metadata.language))
            content_parts.append(chunk.content)
            content_parts.append("```")
            content_parts.append("")

        messages.append({
            "role": "user",
            "content": "\n".join(content_parts)
        })

        return json.dumps(messages, indent=2)

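    # Shape of the LM Studio payload produced above (illustrative values):
    #   [
    #     {"role": "system", "content": "You are a helpful programming assistant..."},
    #     {"role": "user", "content": "# Codebase Analysis\n\nTotal files/chunks: 2\n..."}
    #   ]
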
    def _format_markdown(self, chunks: List[ParsedChunk]) -> str:
        """Format as markdown."""
        lines = []
        lines.append("# Code Context")
        lines.append("")
        lines.append(f"_Generated by CodeChunk CLI | {len(chunks)} chunks_")
        lines.append("")

        for chunk in chunks:
            lines.append(f"## {chunk.name}")
            lines.append("")
            lines.append(f"**File:** `{chunk.metadata.file_path}`")
            lines.append(f"**Type:** {chunk.chunk_type}")
            lines.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                lines.append("")
                lines.append(f"> {chunk.metadata.docstring}")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters:** `{'`, `'.join(chunk.metadata.parameters)}`")

            if chunk.metadata.return_type:
                lines.append(f"**Returns:** `{chunk.metadata.return_type}`")

            if chunk.priority >= 75:
                lines.append("")
                lines.append("⭐ **High Priority**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _get_language_lexer(self, language: str) -> str:
        """Get language lexer for code block."""
        # Keys currently mirror their values; unknown languages fall back
        # to an unlabeled code fence.
        language_map = {
            "python": "python",
            "javascript": "javascript",
            "typescript": "typescript",
            "go": "go",
            "rust": "rust",
            "java": "java",
            "cpp": "cpp",
            "c": "c",
            "csharp": "csharp",
            "ruby": "ruby",
            "php": "php",
            "swift": "swift",
            "kotlin": "kotlin",
            "scala": "scala",
            "lua": "lua",
            "perl": "perl",
            "haskell": "haskell",
            "elixir": "elixir",
            "erlang": "erlang",
        }
        return language_map.get(language, "")

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count for text."""
        # Rough heuristic: ~4 characters per token, a common average for
        # English prose and source code.
        chars = len(text)
        avg_chars_per_token = 4
        return int(chars / avg_chars_per_token)

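    # Worked example of the heuristic above: a 1,000-character chunk
    # estimates to 250 tokens, so the default 8,192-token budget holds
    # roughly 32 KB of formatted text.
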
    def check_token_limit(self, text: str) -> tuple[bool, float, str]:
        """Check if text exceeds token limit.

        Returns (ok, ratio, severity); ok is False once estimated usage
        reaches 90% of max_tokens.
        """
        token_count = self.estimate_tokens(text)
        ratio = token_count / self.max_tokens

        # Tiers correspond to self.token_warning_thresholds ([0.7, 0.9, 1.0]).
        if ratio >= 1.0:
            return False, ratio, "CRITICAL"
        elif ratio >= 0.9:
            return False, ratio, "WARNING"
        elif ratio >= 0.7:
            return True, ratio, "INFO"
        else:
            return True, ratio, "OK"

    def prune_for_limit(self, chunks: List[ParsedChunk], max_tokens: int) -> List[ParsedChunk]:
        """Prune chunks to fit within token limit."""
        result = []
        current_tokens = 0

        # Greedy, order-preserving pass: keep each chunk that still fits
        # the running budget; a smaller chunk later in the list may still
        # be admitted after a large one is skipped.
        for chunk in chunks:
            chunk_tokens = self.estimate_tokens(chunk.content)
            if current_tokens + chunk_tokens <= max_tokens:
                result.append(chunk)
                current_tokens += chunk_tokens

        return result
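
Usage sketch (not part of the commit; the chunk list comes from the chunking module, and the pipeline below is an assumption about intended use):

    from codechunk.core.chunking import ParsedChunk
    from codechunk.core.formatter import OutputFormatter

    chunks: list[ParsedChunk] = ...  # produced by codechunk.core.chunking

    formatter = OutputFormatter(format_type="markdown", max_tokens=8192)
    output = formatter.format(chunks)

    ok, ratio, severity = formatter.check_token_limit(output)
    if not ok:
        # Greedily drop chunks that exceed the budget, then re-render.
        chunks = formatter.prune_for_limit(chunks, formatter.max_tokens)
        output = formatter.format(chunks)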