Add core modules (parser, chunking, formatter, dependency, summarizer)
codechunk/core/formatter.py (new file)
@@ -0,0 +1,197 @@
import json
from typing import List

from codechunk.core.chunking import ParsedChunk


class OutputFormatter:
    """Format output for different LLM platforms."""

    def __init__(self, format_type: str = "markdown", max_tokens: int = 8192):
        self.format_type = format_type
        self.max_tokens = max_tokens
        # Ratios at which check_token_limit escalates its severity label.
        self.token_warning_thresholds = [0.7, 0.9, 1.0]

    def format(self, chunks: List[ParsedChunk]) -> str:
        """Format chunks for output."""
        if self.format_type == "ollama":
            return self._format_ollama(chunks)
        elif self.format_type == "lmstudio":
            return self._format_lmstudio(chunks)
        else:
            return self._format_markdown(chunks)

    def _format_ollama(self, chunks: List[ParsedChunk]) -> str:
        """Format for Ollama."""
        lines = []
        lines.append("### System")
        lines.append("You are a helpful programming assistant analyzing a codebase.")
        lines.append("")
        lines.append("### User")
        lines.append("Analyze the following code:")
        lines.append("")

        for chunk in chunks:
            lines.append(f"**File: {chunk.metadata.file_path}**")
            lines.append(f"**Type: {chunk.chunk_type}**")
            lines.append(f"**Name: {chunk.name}**")

            if chunk.metadata.docstring:
                lines.append(f"**Description: {chunk.metadata.docstring}**")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters: {', '.join(chunk.metadata.parameters)}**")

            if chunk.metadata.return_type:
                lines.append(f"**Returns: {chunk.metadata.return_type}**")

            if chunk.priority >= 75:
                lines.append("**Priority: HIGH**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _format_lmstudio(self, chunks: List[ParsedChunk]) -> str:
        """Format for LM Studio (OpenAI-style chat messages serialized as JSON)."""
        messages = []

        system_content = """You are a helpful programming assistant analyzing a codebase.
Provide clear, accurate code analysis and assistance."""

        messages.append({
            "role": "system",
            "content": system_content
        })

        content_parts = []
        content_parts.append("# Codebase Analysis")
        content_parts.append("")
        content_parts.append(f"Total files/chunks: {len(chunks)}")
        content_parts.append("")

        for chunk in chunks:
            content_parts.append(f"## {chunk.metadata.file_name}")
            content_parts.append(f"**Type:** {chunk.chunk_type}")
            content_parts.append(f"**Name:** {chunk.name}")
            content_parts.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                content_parts.append(f"**Description:** {chunk.metadata.docstring}")

            content_parts.append("")
            content_parts.append("```" + self._get_language_lexer(chunk.metadata.language))
            content_parts.append(chunk.content)
            content_parts.append("```")
            content_parts.append("")

        messages.append({
            "role": "user",
            "content": "\n".join(content_parts)
        })

        return json.dumps(messages, indent=2)

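    # Shape of the LM Studio payload produced above (illustrative values):
    #   [
    #     {"role": "system", "content": "You are a helpful programming assistant..."},
    #     {"role": "user", "content": "# Codebase Analysis\n\nTotal files/chunks: 2\n..."}
    #   ]
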
    def _format_markdown(self, chunks: List[ParsedChunk]) -> str:
        """Format as markdown."""
        lines = []
        lines.append("# Code Context")
        lines.append("")
        lines.append(f"_Generated by CodeChunk CLI | {len(chunks)} chunks_")
        lines.append("")

        for chunk in chunks:
            lines.append(f"## {chunk.name}")
            lines.append("")
            lines.append(f"**File:** `{chunk.metadata.file_path}`")
            lines.append(f"**Type:** {chunk.chunk_type}")
            lines.append(f"**Lines:** {chunk.metadata.start_line}-{chunk.metadata.end_line}")

            if chunk.metadata.docstring:
                lines.append("")
                lines.append(f"> {chunk.metadata.docstring}")

            if chunk.metadata.parameters:
                lines.append(f"**Parameters:** `{'`, `'.join(chunk.metadata.parameters)}`")

            if chunk.metadata.return_type:
                lines.append(f"**Returns:** `{chunk.metadata.return_type}`")

            if chunk.priority >= 75:
                lines.append("")
                lines.append("⭐ **High Priority**")

            lines.append("")
            lines.append("```" + self._get_language_lexer(chunk.metadata.language))
            lines.append(chunk.content)
            lines.append("```")
            lines.append("")
            lines.append("---")
            lines.append("")

        return "\n".join(lines)

    def _get_language_lexer(self, language: str) -> str:
        """Get language lexer for code block."""
        # Keys currently mirror their values; unknown languages fall back
        # to an unlabeled code fence.
        language_map = {
            "python": "python",
            "javascript": "javascript",
            "typescript": "typescript",
            "go": "go",
            "rust": "rust",
            "java": "java",
            "cpp": "cpp",
            "c": "c",
            "csharp": "csharp",
            "ruby": "ruby",
            "php": "php",
            "swift": "swift",
            "kotlin": "kotlin",
            "scala": "scala",
            "lua": "lua",
            "perl": "perl",
            "haskell": "haskell",
            "elixir": "elixir",
            "erlang": "erlang",
        }
        return language_map.get(language, "")

    def estimate_tokens(self, text: str) -> int:
        """Estimate token count for text."""
        # Rough heuristic: ~4 characters per token, a common average for
        # English prose and source code.
        chars = len(text)
        avg_chars_per_token = 4
        return int(chars / avg_chars_per_token)

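    # Worked example of the heuristic above: a 1,000-character chunk
    # estimates to 250 tokens, so the default 8,192-token budget holds
    # roughly 32 KB of formatted text.
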
    def check_token_limit(self, text: str) -> tuple[bool, float, str]:
        """Check if text exceeds token limit.

        Returns (ok, ratio, severity); ok is False once estimated usage
        reaches 90% of max_tokens.
        """
        token_count = self.estimate_tokens(text)
        ratio = token_count / self.max_tokens

        # Tiers correspond to self.token_warning_thresholds ([0.7, 0.9, 1.0]).
        if ratio >= 1.0:
            return False, ratio, "CRITICAL"
        elif ratio >= 0.9:
            return False, ratio, "WARNING"
        elif ratio >= 0.7:
            return True, ratio, "INFO"
        else:
            return True, ratio, "OK"

    def prune_for_limit(self, chunks: List[ParsedChunk], max_tokens: int) -> List[ParsedChunk]:
        """Prune chunks to fit within token limit."""
        result = []
        current_tokens = 0

        # Greedy, order-preserving pass: keep each chunk that still fits
        # the running budget; a smaller chunk later in the list may still
        # be admitted after a large one is skipped.
        for chunk in chunks:
            chunk_tokens = self.estimate_tokens(chunk.content)
            if current_tokens + chunk_tokens <= max_tokens:
                result.append(chunk)
                current_tokens += chunk_tokens

        return result
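
Usage sketch (not part of the commit; the chunk list comes from the chunking module, and the pipeline below is an assumption about intended use):

    from codechunk.core.chunking import ParsedChunk
    from codechunk.core.formatter import OutputFormatter

    chunks: list[ParsedChunk] = ...  # produced by codechunk.core.chunking

    formatter = OutputFormatter(format_type="markdown", max_tokens=8192)
    output = formatter.format(chunks)

    ok, ratio, severity = formatter.check_token_limit(output)
    if not ok:
        # Greedily drop chunks that exceed the budget, then re-render.
        chunks = formatter.prune_for_limit(chunks, formatter.max_tokens)
        output = formatter.format(chunks)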