from collections import Counter
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
    # Imported only for static type checking: keeps this module importable on
    # its own and avoids a potential circular import with the chunking module.
    # All ParsedChunk annotations below are string (forward) references.
    from codechunk.core.chunking import ParsedChunk


class CodeSummarizer:
    """Summarize code chunks for LLM context.

    Produces compact one-line summaries ("Part | Part | ...") for
    function/method/class/file chunks, plus a markdown project overview.
    """

    def __init__(self):
        # NOTE(review): this template table is not consulted by any method in
        # this class; it is retained for backward compatibility with external
        # callers.  The "file" entry looks like a mangled placeholder --
        # confirm its intended format before relying on it.
        self.summary_templates = {
            "function": "Function {name} with {param_count} parameters",
            "class": "Class {name} with {method_count} methods",
            "method": "Method {name} of class {class_name}",
            "file": "File (unknown) in {language}",
        }

    def summarize(self, chunk: "ParsedChunk") -> str:
        """Generate a one-line summary for *chunk*.

        A pre-computed, truthy ``chunk.summary`` is returned as-is; otherwise
        the summary is built from the chunk's metadata according to its
        ``chunk_type`` (unknown types fall back to a generic summary).
        """
        if chunk.summary:
            return chunk.summary

        dispatch = {
            "function": self._summarize_function,
            "method": self._summarize_method,
            "class": self._summarize_class,
            "file": self._summarize_file,
        }
        return dispatch.get(chunk.chunk_type, self._summarize_generic)(chunk)

    @staticmethod
    def _truncate_doc(docstring: str, limit: int = 100) -> str:
        """Return *docstring* clipped to *limit* chars, appending "..." when clipped."""
        if len(docstring) > limit:
            return docstring[:limit] + "..."
        return docstring

    def _summarize_function(self, chunk: "ParsedChunk") -> str:
        """Summarize a function: decorators, signature, doc excerpt, size."""
        parts = []

        if chunk.metadata.decorators:
            # Strip the leading "@" so the list reads "cache, staticmethod".
            decorators = ", ".join(
                d[1:] if d.startswith("@") else d
                for d in chunk.metadata.decorators
            )
            parts.append(f"Decorators: {decorators}")

        parts.append(f"Function: {chunk.name}")

        if chunk.metadata.parameters:
            parts.append(f"Parameters: {', '.join(chunk.metadata.parameters)}")

        if chunk.metadata.return_type:
            parts.append(f"Returns: {chunk.metadata.return_type}")

        if chunk.metadata.docstring:
            parts.append(f"Doc: {self._truncate_doc(chunk.metadata.docstring)}")

        parts.append(f"Lines: {chunk.metadata.line_count}")

        # Only surface complexity when it is noteworthy.
        if chunk.metadata.complexity_score > 5:
            parts.append(f"Complexity: {chunk.metadata.complexity_score}")

        return " | ".join(parts)

    def _summarize_method(self, chunk: "ParsedChunk") -> str:
        """Summarize a method; implicit ``self``/``cls`` parameters are omitted."""
        parts = [f"Method: {chunk.name}"]

        if chunk.metadata.parameters:
            params = [
                p for p in chunk.metadata.parameters if p not in ("self", "cls")
            ]
            if params:
                parts.append(f"Parameters: {', '.join(params)}")

        if chunk.metadata.return_type:
            parts.append(f"Returns: {chunk.metadata.return_type}")

        if chunk.metadata.docstring:
            parts.append(f"Doc: {self._truncate_doc(chunk.metadata.docstring)}")

        return " | ".join(parts)

    def _summarize_class(self, chunk: "ParsedChunk") -> str:
        """Summarize a class: name, doc excerpt (150 chars), size."""
        parts = [f"Class: {chunk.name}"]

        if chunk.metadata.docstring:
            parts.append(
                f"Doc: {self._truncate_doc(chunk.metadata.docstring, limit=150)}"
            )

        parts.append(f"Lines: {chunk.metadata.line_count}")

        return " | ".join(parts)

    def _summarize_file(self, chunk: "ParsedChunk") -> str:
        """Summarize a whole-file chunk: name, language, size, import count."""
        parts = [
            f"File: {chunk.metadata.file_name}",
            f"Language: {chunk.metadata.language}",
            f"Lines: {chunk.metadata.line_count}",
        ]

        if chunk.metadata.imports:
            parts.append(f"Imports: {len(chunk.metadata.imports)}")

        return " | ".join(parts)

    def _summarize_generic(self, chunk: "ParsedChunk") -> str:
        """Fallback summary for chunk types without a dedicated handler."""
        return (
            f"{chunk.chunk_type.capitalize()}: {chunk.name} "
            f"({chunk.metadata.line_count} lines)"
        )

    def batch_summarize(self, chunks: "List[ParsedChunk]") -> List[str]:
        """Generate summaries for multiple chunks, preserving input order."""
        return [self.summarize(chunk) for chunk in chunks]

    def generate_overview(
        self, chunks: "List[ParsedChunk]", project_name: str = "Project"
    ) -> str:
        """Generate a markdown overview of the project structure.

        Sections: chunk-type statistics, languages by descending frequency,
        and up to 10 high-priority (``priority >= 75``) components.
        """
        # Counter preserves first-seen insertion order, so sorted() tie-breaks
        # behave identically to the previous dict.get()-based counting.
        type_counts = Counter(chunk.chunk_type for chunk in chunks)
        lang_counts = Counter(chunk.metadata.language for chunk in chunks)

        lines = [f"# {project_name} Overview", ""]

        lines.append("## Statistics")
        lines.append(f"- Total chunks: {len(chunks)}")
        for chunk_type, count in sorted(type_counts.items()):
            lines.append(f"  - {chunk_type}: {count}")
        lines.append("")

        lines.append("## Languages")
        for lang, count in sorted(lang_counts.items(), key=lambda x: -x[1]):
            lines.append(f"- {lang}: {count}")
        lines.append("")

        high_priority = [c for c in chunks if c.priority >= 75]
        if high_priority:
            lines.append("## Key Components (High Priority)")
            for chunk in high_priority[:10]:
                lines.append(f"- **{chunk.name}**: {self.summarize(chunk)}")

        return "\n".join(lines)