diff --git a/codesnap/output/llm_exporter.py b/codesnap/output/llm_exporter.py
new file mode 100644
index 0000000..52a7d7c
--- /dev/null
+++ b/codesnap/output/llm_exporter.py
@@ -0,0 +1,107 @@
+"""LLM-optimized export module for CodeSnap."""
+
+from pathlib import Path
+from typing import Any, Optional
+
+from ..core.extractor import ExtractedCode
+
+
+class LLMExporter:
+    """Exports code summaries optimized for LLM context windows."""
+
+    def __init__(self, words_per_token: float = 0.75) -> None:
+        self.words_per_token = words_per_token
+
+    def estimate_tokens(self, text: str) -> int:
+        """Estimate token count for text (roughly 0.75 words per token)."""
+        return int(len(text.split()) / self.words_per_token)
+
+    def export(
+        self,
+        extracted_files: list[ExtractedCode],
+        file_paths: list[Path],
+        dependency_data: dict[str, Any],
+        complexity_data: dict[str, str],
+        max_tokens: int = 8000,
+        output_path: Optional[Path] = None,
+    ) -> str:
+        """Export code summary optimized for LLM context."""
+        lines: list[str] = []
+
+        lines.append("## CODEBASE SUMMARY\n")
+        lines.append(f"Files analyzed: {len(extracted_files)}\n")
+
+        # Tally files per language for the summary header.
+        language_counts: dict[str, int] = {}
+        for extracted in extracted_files:
+            language_counts[extracted.language] = language_counts.get(extracted.language, 0) + 1
+        lang_summary = ", ".join(f"{k}: {v}" for k, v in sorted(language_counts.items()))
+        lines.append(f"Languages: {lang_summary}\n")
+
+        total_funcs = sum(len(f.functions) for f in extracted_files)
+        total_classes = sum(len(f.classes) for f in extracted_files)
+        lines.append(f"Functions: {total_funcs}, Classes: {total_classes}\n")
+
+        lines.append("\n## KEY STRUCTURE\n")
+
+        for extracted in extracted_files:
+            if extracted.classes:
+                lines.append(f"\n{extracted.file_path.name}:\n")
+                for cls in extracted.classes:
+                    method_names = [m.name for m in cls.methods]
+                    if method_names:
+                        lines.append(f" CLASS {cls.name}: {', '.join(method_names)}\n")
+                    else:
+                        lines.append(f" CLASS {cls.name}\n")
+
+        lines.append("\n## FUNCTIONS (Global)\n")
+        for extracted in extracted_files:
+            for func in extracted.functions:
+                if not func.is_method:
+                    params = ", ".join(func.parameters) if func.parameters else ""
+                    lines.append(f"{extracted.file_path.name}:{func.name}({params})\n")
+
+        lines.append("\n## DEPENDENCIES\n")
+        dependencies = dependency_data.get("dependencies", [])
+        for dep in dependencies[:30]:
+            lines.append(f"{dep.source.name} → {dep.target.name}\n")
+
+        orphaned = dependency_data.get("orphaned", [])
+        if orphaned:
+            lines.append(f"\nOrphaned files: {', '.join(f.name for f in orphaned)}\n")
+
+        cycles = dependency_data.get("cycles", [])
+        if cycles:
+            lines.append("\nCircular dependencies detected\n")
+
+        lines.append("\n## FILE LIST\n")
+        for path in file_paths:
+            complexity = complexity_data.get(str(path), "?")
+            try:
+                display = path.relative_to(file_paths[0].parent)
+            except ValueError:
+                # Files outside the first path's directory are listed as-is.
+                display = path
+            lines.append(f"{display} [{complexity}]\n")
+
+        # Each entry already ends with a newline, so join without a separator.
+        result = "".join(lines)
+
+        if self.estimate_tokens(result) > max_tokens:
+            result = self._compress_output(result, max_tokens)
+
+        if output_path:
+            output_path.write_text(result, encoding="utf-8")
+
+        return result
+
+    def _compress_output(self, content: str, max_tokens: int) -> str:
+        """Compress output to fit within token limit."""
+        while self.estimate_tokens(content) > max_tokens and len(content) > 100:
+            lines = content.split("\n")
+            if len(lines) > 10:
+                # Trim a few lines at a time until the estimate fits.
+                content = "\n".join(lines[:-5]) + "\n...[truncated]..."
+            else:
+                content = content[: int(len(content) * 0.8)]
+        return content
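
A minimal usage sketch for the exporter above, assuming the rest of the CodeSnap pipeline is installed; the empty analysis inputs and the output filename are placeholders, and only the LLMExporter API itself comes from this patch:

    from pathlib import Path

    from codesnap.output.llm_exporter import LLMExporter

    # In a real run, extracted_files, dependency_data and complexity_data come
    # from CodeSnap's extractor/analyzer stages; empty inputs keep the sketch
    # self-contained while still exercising the export and token-budget path.
    exporter = LLMExporter()
    summary = exporter.export(
        extracted_files=[],
        file_paths=[Path("codesnap/output/llm_exporter.py")],
        dependency_data={"dependencies": [], "orphaned": [], "cycles": []},
        complexity_data={},
        max_tokens=4000,
        output_path=Path("summary_for_llm.txt"),
    )
    print(exporter.estimate_tokens(summary), "estimated tokens")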