"""Output formatting helpers.

Renders parsed code chunks into prompts for different LLM front ends
(Ollama, LM Studio) or plain markdown, with rough token accounting.
"""

import json
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Needed only for type checking: ParsedChunk appears exclusively in
    # annotations, so importing it lazily avoids a runtime dependency
    # (and any potential import cycle) on the chunking module.
    from codechunk.core.chunking import ParsedChunk


class OutputFormatter:
    """Format output for different LLM platforms.

    ``format_type`` selects the renderer ("ollama", "lmstudio"; anything
    else falls back to markdown). ``max_tokens`` is the budget consulted
    by :meth:`check_token_limit`.
    """

    # Languages emitted verbatim as fenced-code-block lexer hints.
    # Anything else gets an un-hinted ``` fence.
    _KNOWN_LEXERS = frozenset({
        "python", "javascript", "typescript", "go", "rust", "java",
        "cpp", "c", "csharp", "ruby", "php", "swift", "kotlin",
        "scala", "lua", "perl", "haskell", "elixir", "erlang",
    })

    def __init__(self, format_type: str = "markdown", max_tokens: int = 8192):
        self.format_type = format_type
        self.max_tokens = max_tokens
        # (info, warning, critical) fractions of max_tokens, consumed by
        # check_token_limit().
        self.token_warning_thresholds = [0.7, 0.9, 1.0]

    def format(self, chunks: List["ParsedChunk"]) -> str:
        """Format chunks for output using the configured format_type."""
        if self.format_type == "ollama":
            return self._format_ollama(chunks)
        if self.format_type == "lmstudio":
            return self._format_lmstudio(chunks)
        return self._format_markdown(chunks)

    def _format_ollama(self, chunks: List["ParsedChunk"]) -> str:
        """Format for Ollama: a System/User prompt with bold metadata lines."""
        lines = [
            "### System",
            "You are a helpful programming assistant analyzing a codebase.",
            "",
            "### User",
            "Analyze the following code:",
            "",
        ]

        for chunk in chunks:
            lines.append(f"**File: {chunk.metadata.file_path}**")
            lines.append(f"**Type: {chunk.chunk_type}**")
            lines.append(f"**Name: {chunk.name}**")

            # Optional metadata: only emitted when present.
            if chunk.metadata.docstring:
                lines.append(f"**Description: {chunk.metadata.docstring}**")
            if chunk.metadata.parameters:
                lines.append(f"**Parameters: {', '.join(chunk.metadata.parameters)}**")
            if chunk.metadata.return_type:
                lines.append(f"**Returns: {chunk.metadata.return_type}**")
            # NOTE(review): 75 looks like a cutoff on a 0-100 priority
            # scale — confirm against the chunking module.
            if chunk.priority >= 75:
                lines.append("**Priority: HIGH**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _format_lmstudio(self, chunks: List["ParsedChunk"]) -> str:
        """Format for LM Studio: a JSON chat transcript (system + user)."""
        system_content = """You are a helpful programming assistant analyzing a codebase.
Provide clear, accurate code analysis and assistance."""

        messages = [{
            "role": "system",
            "content": system_content,
        }]

        content_parts = [
            "# Codebase Analysis",
            "",
            f"Total files/chunks: {len(chunks)}",
            "",
        ]

        for chunk in chunks:
            content_parts.append(f"## {chunk.metadata.file_name}")
            content_parts.append(f"**Type:** {chunk.chunk_type}")
            content_parts.append(f"**Name:** {chunk.name}")
            content_parts.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                content_parts.append(f"**Description:** {chunk.metadata.docstring}")

            content_parts.append("")
            content_parts.append("```" + self._get_language_lexer(chunk.metadata.language))
            content_parts.append(chunk.content)
            content_parts.append("```")
            content_parts.append("")

        messages.append({
            "role": "user",
            "content": "\n".join(content_parts),
        })

        return json.dumps(messages, indent=2)

    def _format_markdown(self, chunks: List["ParsedChunk"]) -> str:
        """Format as a plain markdown document."""
        lines = [
            "# Code Context",
            "",
            f"_Generated by CodeChunk CLI | {len(chunks)} chunks_",
            "",
        ]

        for chunk in chunks:
            lines.append(f"## {chunk.name}")
            lines.append("")
            lines.append(f"**File:** `{chunk.metadata.file_path}`")
            lines.append(f"**Type:** {chunk.chunk_type}")
            lines.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                lines.append("")
                lines.append(f"> {chunk.metadata.docstring}")
                # Bug fix: a blank line must terminate the blockquote,
                # otherwise the next metadata line is lazily absorbed
                # into the quote by markdown renderers.
                lines.append("")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters:** `{'`, `'.join(chunk.metadata.parameters)}`")
            if chunk.metadata.return_type:
                lines.append(f"**Returns:** `{chunk.metadata.return_type}`")

            if chunk.priority >= 75:
                lines.append("")
                lines.append("⭐ **High Priority**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _get_language_lexer(self, language: str) -> str:
        """Return the fence lexer for *language*, or '' when unknown."""
        # The original mapping was an identity map, so a membership test
        # is equivalent and avoids duplicating every name.
        return language if language in self._KNOWN_LEXERS else ""

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count for *text* (heuristic: ~4 chars/token)."""
        avg_chars_per_token = 4
        return len(text) // avg_chars_per_token

    def check_token_limit(self, text: str) -> tuple[bool, float, str]:
        """Check *text* against the token budget.

        Returns ``(fits, ratio, severity)``: ``fits`` is False once the
        estimate reaches the warning threshold, and ``severity`` is one
        of "OK", "INFO", "WARNING", "CRITICAL".
        """
        token_count = self.estimate_tokens(text)
        ratio = token_count / self.max_tokens

        # Bug fix: the thresholds configured in __init__ were previously
        # ignored here and duplicated as magic numbers.
        info, warning, critical = self.token_warning_thresholds
        if ratio >= critical:
            return False, ratio, "CRITICAL"
        if ratio >= warning:
            return False, ratio, "WARNING"
        if ratio >= info:
            return True, ratio, "INFO"
        return True, ratio, "OK"

    def prune_for_limit(self, chunks: List["ParsedChunk"], max_tokens: int) -> List["ParsedChunk"]:
        """Greedily keep chunks, in order, within *max_tokens*.

        A chunk that does not fit is skipped, but later (smaller) chunks
        may still be included. NOTE(review): only raw chunk content is
        counted — the formatting overhead added by format() is not.
        """
        kept: List["ParsedChunk"] = []
        used = 0

        for chunk in chunks:
            cost = self.estimate_tokens(chunk.content)
            if used + cost <= max_tokens:
                kept.append(chunk)
                used += cost

        return kept