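"""Output formatting for LLM platforms (Ollama, LM Studio, plain Markdown)."""
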
import json
from typing import List

from codechunk.core.chunking import ParsedChunk


class OutputFormatter:
    """Format output for different LLM platforms."""

    def __init__(self, format_type: str = "markdown", max_tokens: int = 8192):
        self.format_type = format_type
        self.max_tokens = max_tokens
        # Usage ratios at which check_token_limit() reports INFO, WARNING, CRITICAL.
        self.token_warning_thresholds = [0.7, 0.9, 1.0]

    def format(self, chunks: List[ParsedChunk]) -> str:
        """Format chunks for output."""
        if self.format_type == "ollama":
            return self._format_ollama(chunks)
        elif self.format_type == "lmstudio":
            return self._format_lmstudio(chunks)
        else:
            return self._format_markdown(chunks)

    def _format_ollama(self, chunks: List[ParsedChunk]) -> str:
        """Format for Ollama."""
        lines = []
        lines.append("### System")
        lines.append("You are a helpful programming assistant analyzing a codebase.")
        lines.append("")
        lines.append("### User")
        lines.append("Analyze the following code:")
        lines.append("")

        for chunk in chunks:
            lines.append(f"**File: {chunk.metadata.file_path}**")
            lines.append(f"**Type: {chunk.chunk_type}**")
            lines.append(f"**Name: {chunk.name}**")

            if chunk.metadata.docstring:
                lines.append(f"**Description: {chunk.metadata.docstring}**")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters: {', '.join(chunk.metadata.parameters)}**")

            if chunk.metadata.return_type:
                lines.append(f"**Returns: {chunk.metadata.return_type}**")

            if chunk.priority >= 75:
                lines.append("**Priority: HIGH**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _format_lmstudio(self, chunks: List[ParsedChunk]) -> str:
        """Format for LM Studio (OpenAI-style chat messages serialized as JSON)."""
        messages = []

        system_content = """You are a helpful programming assistant analyzing a codebase.
Provide clear, accurate code analysis and assistance."""

        messages.append({
            "role": "system",
            "content": system_content,
        })

        content_parts = []
        content_parts.append("# Codebase Analysis")
        content_parts.append("")
        content_parts.append(f"Total files/chunks: {len(chunks)}")
        content_parts.append("")

        for chunk in chunks:
            content_parts.append(f"## {chunk.metadata.file_name}")
            content_parts.append(f"**Type:** {chunk.chunk_type}")
            content_parts.append(f"**Name:** {chunk.name}")
            content_parts.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                content_parts.append(f"**Description:** {chunk.metadata.docstring}")

            content_parts.append("")
            content_parts.append("```" + self._get_language_lexer(chunk.metadata.language))
            content_parts.append(chunk.content)
            content_parts.append("```")
            content_parts.append("")

        messages.append({
            "role": "user",
            "content": "\n".join(content_parts),
        })

        return json.dumps(messages, indent=2)

    def _format_markdown(self, chunks: List[ParsedChunk]) -> str:
        """Format as markdown."""
        lines = []
        lines.append("# Code Context")
        lines.append("")
        lines.append(f"_Generated by CodeChunk CLI | {len(chunks)} chunks_")
        lines.append("")

        for chunk in chunks:
            lines.append(f"## {chunk.name}")
            lines.append("")
            lines.append(f"**File:** `{chunk.metadata.file_path}`")
            lines.append(f"**Type:** {chunk.chunk_type}")
            lines.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                lines.append("")
                lines.append(f"> {chunk.metadata.docstring}")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters:** `{'`, `'.join(chunk.metadata.parameters)}`")

            if chunk.metadata.return_type:
                lines.append(f"**Returns:** `{chunk.metadata.return_type}`")

            if chunk.priority >= 75:
                lines.append("")
                lines.append("⭐ **High Priority**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _get_language_lexer(self, language: str) -> str:
        """Get the language lexer tag for a code block, or "" if unrecognized."""
        # Every supported language currently maps to its own name, so a set
        # membership check suffices; switch back to a dict if aliases (e.g.
        # "c++" -> "cpp") are ever needed.
        known_languages = {
            "python", "javascript", "typescript", "go", "rust", "java",
            "cpp", "c", "csharp", "ruby", "php", "swift", "kotlin",
            "scala", "lua", "perl", "haskell", "elixir", "erlang",
        }
        return language if language in known_languages else ""

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count using a rough ~4 characters-per-token heuristic."""
        avg_chars_per_token = 4
        return len(text) // avg_chars_per_token

    def check_token_limit(self, text: str) -> tuple[bool, float, str]:
        """Check text against the token limit.

        Returns (is_safe, usage_ratio, severity), where severity is one of
        "OK", "INFO", "WARNING", or "CRITICAL"; WARNING and CRITICAL are
        treated as unsafe.
        """
        token_count = self.estimate_tokens(text)
        ratio = token_count / self.max_tokens
        info_threshold, warning_threshold, critical_threshold = self.token_warning_thresholds

        if ratio >= critical_threshold:
            return False, ratio, "CRITICAL"
        elif ratio >= warning_threshold:
            return False, ratio, "WARNING"
        elif ratio >= info_threshold:
            return True, ratio, "INFO"
        else:
            return True, ratio, "OK"

    def prune_for_limit(self, chunks: List[ParsedChunk], max_tokens: int) -> List[ParsedChunk]:
        """Prune chunks to fit within the token limit, preserving input order."""
        result = []
        current_tokens = 0

        for chunk in chunks:
            chunk_tokens = self.estimate_tokens(chunk.content)
            # Greedy pass: keep any chunk that still fits in the remaining budget.
            if current_tokens + chunk_tokens <= max_tokens:
                result.append(chunk)
                current_tokens += chunk_tokens

        return result
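

# A minimal usage sketch (not part of the library API). Real chunks come from
# codechunk's own chunker, whose constructor is not shown here, so only the
# string-based token helpers are exercised directly; the chunk-based flow is
# indicated in comments.
if __name__ == "__main__":
    formatter = OutputFormatter(format_type="markdown", max_tokens=8192)

    # ~3200 characters -> roughly 800 tokens under the 4-chars-per-token heuristic.
    sample_text = "def add(a, b):\n    return a + b\n" * 100
    tokens = formatter.estimate_tokens(sample_text)
    is_safe, ratio, severity = formatter.check_token_limit(sample_text)
    print(f"~{tokens} tokens, {ratio:.0%} of budget, severity={severity}")

    # With real ParsedChunk objects (e.g. from codechunk.core.chunking), the
    # flow would be:
    #   chunks = formatter.prune_for_limit(chunks, formatter.max_tokens)
    #   print(formatter.format(chunks))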