diff --git a/src/contextgen/analyzers/style_analyzer.py b/src/contextgen/analyzers/style_analyzer.py new file mode 100644 index 0000000..1492a28 --- /dev/null +++ b/src/contextgen/analyzers/style_analyzer.py @@ -0,0 +1,189 @@ +"""Style analyzer for indentation, quotes, and formatting patterns.""" + +import re +from pathlib import Path +from typing import Any + + +class StyleAnalyzer: + """Analyzes code style patterns.""" + + def __init__(self, project_path: Path): + self.project_path = project_path + + def analyze(self) -> dict[str, Any]: + """Analyze style patterns across project files.""" + indentation = self._detect_indentation() + quote_style = self._detect_quote_style() + line_endings = self._detect_line_endings() + max_line_length = self._detect_line_length() + trailing_newline = self._detect_trailing_newline() + + return { + "indentation": indentation, + "quote_style": quote_style, + "line_endings": line_endings, + "max_line_length": max_line_length, + "trailing_newline": trailing_newline, + } + + def _detect_indentation(self) -> dict[str, Any]: + """Detect indentation style and width.""" + indentations: dict[int, int] = {} + + code_files = self._get_code_files() + + for file_path in code_files[:30]: + content = self._safe_read_file(file_path) + if content: + indent_width = self._analyze_indent_width(content) + if indent_width: + indentations[indent_width] = indentations.get(indent_width, 0) + 1 + + if not indentations: + return {"style": "unknown", "width": None} + + dominant_width: int = max(indentations.keys(), key=lambda k: indentations[k]) + + style = "spaces" if dominant_width in [2, 4] else "tabs" + + return {"style": style, "width": dominant_width} + + def _analyze_indent_width(self, content: str) -> int | None: + """Analyze the indentation width from content.""" + lines = content.split("\n") + indent_counts: dict[int, int] = {} + + for line in lines: + if not line.strip(): + continue + leading_spaces = len(line) - len(line.lstrip()) + leading_tabs = len(line) - len(line.lstrip("\t")) + + if leading_spaces > 0 and leading_spaces % 2 == 0: + indent_counts[leading_spaces] = indent_counts.get(leading_spaces, 0) + 1 + elif leading_tabs > 0: + return 1 + + if indent_counts: + return min(indent_counts.keys(), key=lambda k: indent_counts[k]) + return None + + def _detect_quote_style(self) -> dict[str, Any]: + """Detect quote style (single vs double).""" + single_count = 0 + double_count = 0 + + code_files = self._get_code_files() + + for file_path in code_files[:30]: + content = self._safe_read_file(file_path) + if content: + content = self._remove_string_literals(content) + single_count += content.count("'") - content.count("\\'") + double_count += content.count('"') - content.count('\\"') + + total = single_count + double_count + if total == 0: + return {"style": "unknown", "ratio": None} + + single_ratio = single_count / total + + if single_ratio > 0.6: + style = "single" + elif double_ratio := 1 - single_ratio > 0.6: + style = "double" + else: + style = "mixed" + + return {"style": style, "single_ratio": round(single_ratio, 2)} + + def _remove_string_literals(self, content: str) -> str: + """Remove string literals from content to avoid false positives.""" + pattern = r'(?:"(?:[^"\\]|\\.)*")|(?:\'(?:[^\'\\]|\\.)*\')' + return re.sub(pattern, '""', content) + + def _detect_line_endings(self) -> dict[str, Any]: + """Detect line ending style (LF vs CRLF).""" + crlf_count = 0 + lf_count = 0 + + code_files = self._get_code_files() + + for file_path in code_files[:20]: + content = self._safe_read_file(file_path) + if content: + crlf_count += content.count("\r\n") + lf_count += content.count("\n") - crlf_count + + total = crlf_count + lf_count + if total == 0: + return {"style": "unknown"} + + if crlf_count > lf_count: + return {"style": "CRLF", "ratio": round(crlf_count / total, 2)} + else: + return {"style": "LF", "ratio": round(lf_count / total, 2)} + + def _detect_line_length(self) -> dict[str, Any]: + """Detect preferred line length.""" + lengths: dict[int, int] = {} + + code_files = self._get_code_files() + + for file_path in code_files[:20]: + content = self._safe_read_file(file_path) + if content: + for line in content.split("\n"): + line_len = len(line.rstrip()) + if line_len > 0: + bucket = (line_len // 10) * 10 + lengths[bucket] = lengths.get(bucket, 0) + 1 + + if not lengths: + return {"max": None, "preferred": None} + + max_bucket: int = max(lengths.keys(), key=lambda k: lengths[k]) + return {"max": max_bucket + 10, "preferred": max_bucket} + + def _detect_trailing_newline(self) -> dict[str, Any]: + """Detect if files typically have trailing newlines.""" + with_newline = 0 + without_newline = 0 + + code_files = self._get_code_files() + + for file_path in code_files[:20]: + content = self._safe_read_file(file_path) + if content: + if content.endswith("\n"): + with_newline += 1 + else: + without_newline += 1 + + total = with_newline + without_newline + if total == 0: + return {"has_trailing_newline": None} + + return { + "has_trailing_newline": with_newline > without_newline, + "ratio": round(with_newline / total, 2), + } + + def _get_code_files(self) -> list[Path]: + """Get list of code files to analyze.""" + extensions = [".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java", ".rb", ".php", ".c", ".cpp"] + files = [] + try: + for ext in extensions: + files.extend(self.project_path.rglob(f"*{ext}")) + except PermissionError: + pass + return sorted(set(files)) + + def _safe_read_file(self, path: Path) -> str | None: + """Safely read a file.""" + try: + return path.read_text(encoding="utf-8") + except (IOError, UnicodeDecodeError): + return None