Add naming, style, and documentation analyzers

2026-01-29 13:23:39 +00:00
parent 160df350d7
commit 8a302a291e
1 changed files with 189 additions and 0 deletions
--- a/src/contextgen/analyzers/style_analyzer.py
+++ b/src/contextgen/analyzers/style_analyzer.py
@@ -0,0 +1,189 @@
+"""Style analyzer for indentation, quotes, and formatting patterns."""
+
+import re
+from pathlib import Path
+from typing import Any
+
+
+class StyleAnalyzer:
+    """Analyzes code style patterns."""
+
+    def __init__(self, project_path: Path) -> None:
+        self.project_path = project_path  # root directory searched by _get_code_files
+
+    def analyze(self) -> dict[str, Any]:
+        """Analyze style patterns across project files."""
+        indentation = self._detect_indentation()
+        quote_style = self._detect_quote_style()
+        line_endings = self._detect_line_endings()
+        max_line_length = self._detect_line_length()
+        trailing_newline = self._detect_trailing_newline()
+
+        return {
+            "indentation": indentation,
+            "quote_style": quote_style,
+            "line_endings": line_endings,
+            "max_line_length": max_line_length,
+            "trailing_newline": trailing_newline,
+        }
+
+    def _detect_indentation(self) -> dict[str, Any]:
+        """Detect indentation style and width."""
+        indentations: dict[int, int] = {}
+        
+        code_files = self._get_code_files()
+        
+        for file_path in code_files[:30]:
+            content = self._safe_read_file(file_path)
+            if content:
+                indent_width = self._analyze_indent_width(content)
+                if indent_width:
+                    indentations[indent_width] = indentations.get(indent_width, 0) + 1
+        
+        if not indentations:
+            return {"style": "unknown", "width": None}
+        
+        dominant_width: int = max(indentations.keys(), key=lambda k: indentations[k])
+        
+        style = "spaces" if dominant_width in [2, 4] else "tabs"
+        
+        return {"style": style, "width": dominant_width}
+
+    def _analyze_indent_width(self, content: str) -> int | None:
+        """Analyze the indentation width from content."""
+        lines = content.split("\n")
+        indent_counts: dict[int, int] = {}
+        
+        for line in lines:
+            if not line.strip():
+                continue
+            leading_spaces = len(line) - len(line.lstrip())
+            leading_tabs = len(line) - len(line.lstrip("\t"))
+            
+            if leading_spaces > 0 and leading_spaces % 2 == 0:
+                indent_counts[leading_spaces] = indent_counts.get(leading_spaces, 0) + 1
+            elif leading_tabs > 0:
+                return 1
+        
+        if indent_counts:
+            return min(indent_counts.keys(), key=lambda k: indent_counts[k])
+        return None
+
+    def _detect_quote_style(self) -> dict[str, Any]:
+        """Detect quote style (single vs double)."""
+        single_count = 0
+        double_count = 0
+        
+        code_files = self._get_code_files()
+        
+        for file_path in code_files[:30]:
+            content = self._safe_read_file(file_path)
+            if content:
+                content = self._remove_string_literals(content)
+                single_count += content.count("'") - content.count("\\'")
+                double_count += content.count('"') - content.count('\\"')
+        
+        total = single_count + double_count
+        if total == 0:
+            return {"style": "unknown", "ratio": None}
+        
+        single_ratio = single_count / total
+        
+        if single_ratio > 0.6:
+            style = "single"
+        elif double_ratio := 1 - single_ratio > 0.6:
+            style = "double"
+        else:
+            style = "mixed"
+        
+        return {"style": style, "single_ratio": round(single_ratio, 2)}
+
+    def _remove_string_literals(self, content: str) -> str:
+        """Remove string literals from content to avoid false positives."""
+        pattern = r'(?:"(?:[^"\\]|\\.)*")|(?:\'(?:[^\'\\]|\\.)*\')'
+        return re.sub(pattern, '""', content)
+
+    def _detect_line_endings(self) -> dict[str, Any]:
+        """Detect line ending style (LF vs CRLF)."""
+        crlf_count = 0
+        lf_count = 0
+        
+        code_files = self._get_code_files()
+        
+        for file_path in code_files[:20]:
+            content = self._safe_read_file(file_path)
+            if content:
+                crlf_count += content.count("\r\n")
+                lf_count += content.count("\n") - crlf_count
+        
+        total = crlf_count + lf_count
+        if total == 0:
+            return {"style": "unknown"}
+        
+        if crlf_count > lf_count:
+            return {"style": "CRLF", "ratio": round(crlf_count / total, 2)}
+        else:
+            return {"style": "LF", "ratio": round(lf_count / total, 2)}
+
+    def _detect_line_length(self) -> dict[str, Any]:
+        """Detect preferred line length."""
+        lengths: dict[int, int] = {}
+        
+        code_files = self._get_code_files()
+        
+        for file_path in code_files[:20]:
+            content = self._safe_read_file(file_path)
+            if content:
+                for line in content.split("\n"):
+                    line_len = len(line.rstrip())
+                    if line_len > 0:
+                        bucket = (line_len // 10) * 10
+                        lengths[bucket] = lengths.get(bucket, 0) + 1
+        
+        if not lengths:
+            return {"max": None, "preferred": None}
+        
+        max_bucket: int = max(lengths.keys(), key=lambda k: lengths[k])
+        return {"max": max_bucket + 10, "preferred": max_bucket}
+
+    def _detect_trailing_newline(self) -> dict[str, Any]:
+        """Detect if files typically have trailing newlines."""
+        with_newline = 0
+        without_newline = 0
+        
+        code_files = self._get_code_files()
+        
+        for file_path in code_files[:20]:
+            content = self._safe_read_file(file_path)
+            if content:
+                if content.endswith("\n"):
+                    with_newline += 1
+                else:
+                    without_newline += 1
+        
+        total = with_newline + without_newline
+        if total == 0:
+            return {"has_trailing_newline": None}
+        
+        return {
+            "has_trailing_newline": with_newline > without_newline,
+            "ratio": round(with_newline / total, 2),
+        }
+
+    def _get_code_files(self) -> list[Path]:
+        """Get list of code files to analyze."""
+        extensions = [".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java", ".rb", ".php", ".c", ".cpp"]
+        files = []
+        try:
+            for ext in extensions:
+                files.extend(self.project_path.rglob(f"*{ext}"))
+        except PermissionError:
+            pass
+        return sorted(set(files))
+
+    def _safe_read_file(self, path: Path) -> str | None:
+        """Safely read a file."""
+        try:
+            return path.read_text(encoding="utf-8")
+        except (IOError, UnicodeDecodeError):
+            return None