Add naming, style, and documentation analyzers
This commit is contained in:
189
src/contextgen/analyzers/style_analyzer.py
Normal file
189
src/contextgen/analyzers/style_analyzer.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""Style analyzer for indentation, quotes, and formatting patterns."""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
class StyleAnalyzer:
|
||||
"""Analyzes code style patterns."""
|
||||
|
||||
def __init__(self, project_path: Path):
|
||||
self.project_path = project_path
|
||||
|
||||
def analyze(self) -> dict[str, Any]:
|
||||
"""Analyze style patterns across project files."""
|
||||
indentation = self._detect_indentation()
|
||||
quote_style = self._detect_quote_style()
|
||||
line_endings = self._detect_line_endings()
|
||||
max_line_length = self._detect_line_length()
|
||||
trailing_newline = self._detect_trailing_newline()
|
||||
|
||||
return {
|
||||
"indentation": indentation,
|
||||
"quote_style": quote_style,
|
||||
"line_endings": line_endings,
|
||||
"max_line_length": max_line_length,
|
||||
"trailing_newline": trailing_newline,
|
||||
}
|
||||
|
||||
def _detect_indentation(self) -> dict[str, Any]:
|
||||
"""Detect indentation style and width."""
|
||||
indentations: dict[int, int] = {}
|
||||
|
||||
code_files = self._get_code_files()
|
||||
|
||||
for file_path in code_files[:30]:
|
||||
content = self._safe_read_file(file_path)
|
||||
if content:
|
||||
indent_width = self._analyze_indent_width(content)
|
||||
if indent_width:
|
||||
indentations[indent_width] = indentations.get(indent_width, 0) + 1
|
||||
|
||||
if not indentations:
|
||||
return {"style": "unknown", "width": None}
|
||||
|
||||
dominant_width: int = max(indentations.keys(), key=lambda k: indentations[k])
|
||||
|
||||
style = "spaces" if dominant_width in [2, 4] else "tabs"
|
||||
|
||||
return {"style": style, "width": dominant_width}
|
||||
|
||||
def _analyze_indent_width(self, content: str) -> int | None:
|
||||
"""Analyze the indentation width from content."""
|
||||
lines = content.split("\n")
|
||||
indent_counts: dict[int, int] = {}
|
||||
|
||||
for line in lines:
|
||||
if not line.strip():
|
||||
continue
|
||||
leading_spaces = len(line) - len(line.lstrip())
|
||||
leading_tabs = len(line) - len(line.lstrip("\t"))
|
||||
|
||||
if leading_spaces > 0 and leading_spaces % 2 == 0:
|
||||
indent_counts[leading_spaces] = indent_counts.get(leading_spaces, 0) + 1
|
||||
elif leading_tabs > 0:
|
||||
return 1
|
||||
|
||||
if indent_counts:
|
||||
return min(indent_counts.keys(), key=lambda k: indent_counts[k])
|
||||
return None
|
||||
|
||||
def _detect_quote_style(self) -> dict[str, Any]:
|
||||
"""Detect quote style (single vs double)."""
|
||||
single_count = 0
|
||||
double_count = 0
|
||||
|
||||
code_files = self._get_code_files()
|
||||
|
||||
for file_path in code_files[:30]:
|
||||
content = self._safe_read_file(file_path)
|
||||
if content:
|
||||
content = self._remove_string_literals(content)
|
||||
single_count += content.count("'") - content.count("\\'")
|
||||
double_count += content.count('"') - content.count('\\"')
|
||||
|
||||
total = single_count + double_count
|
||||
if total == 0:
|
||||
return {"style": "unknown", "ratio": None}
|
||||
|
||||
single_ratio = single_count / total
|
||||
|
||||
if single_ratio > 0.6:
|
||||
style = "single"
|
||||
elif double_ratio := 1 - single_ratio > 0.6:
|
||||
style = "double"
|
||||
else:
|
||||
style = "mixed"
|
||||
|
||||
return {"style": style, "single_ratio": round(single_ratio, 2)}
|
||||
|
||||
def _remove_string_literals(self, content: str) -> str:
|
||||
"""Remove string literals from content to avoid false positives."""
|
||||
pattern = r'(?:"(?:[^"\\]|\\.)*")|(?:\'(?:[^\'\\]|\\.)*\')'
|
||||
return re.sub(pattern, '""', content)
|
||||
|
||||
def _detect_line_endings(self) -> dict[str, Any]:
|
||||
"""Detect line ending style (LF vs CRLF)."""
|
||||
crlf_count = 0
|
||||
lf_count = 0
|
||||
|
||||
code_files = self._get_code_files()
|
||||
|
||||
for file_path in code_files[:20]:
|
||||
content = self._safe_read_file(file_path)
|
||||
if content:
|
||||
crlf_count += content.count("\r\n")
|
||||
lf_count += content.count("\n") - crlf_count
|
||||
|
||||
total = crlf_count + lf_count
|
||||
if total == 0:
|
||||
return {"style": "unknown"}
|
||||
|
||||
if crlf_count > lf_count:
|
||||
return {"style": "CRLF", "ratio": round(crlf_count / total, 2)}
|
||||
else:
|
||||
return {"style": "LF", "ratio": round(lf_count / total, 2)}
|
||||
|
||||
def _detect_line_length(self) -> dict[str, Any]:
|
||||
"""Detect preferred line length."""
|
||||
lengths: dict[int, int] = {}
|
||||
|
||||
code_files = self._get_code_files()
|
||||
|
||||
for file_path in code_files[:20]:
|
||||
content = self._safe_read_file(file_path)
|
||||
if content:
|
||||
for line in content.split("\n"):
|
||||
line_len = len(line.rstrip())
|
||||
if line_len > 0:
|
||||
bucket = (line_len // 10) * 10
|
||||
lengths[bucket] = lengths.get(bucket, 0) + 1
|
||||
|
||||
if not lengths:
|
||||
return {"max": None, "preferred": None}
|
||||
|
||||
max_bucket: int = max(lengths.keys(), key=lambda k: lengths[k])
|
||||
return {"max": max_bucket + 10, "preferred": max_bucket}
|
||||
|
||||
def _detect_trailing_newline(self) -> dict[str, Any]:
|
||||
"""Detect if files typically have trailing newlines."""
|
||||
with_newline = 0
|
||||
without_newline = 0
|
||||
|
||||
code_files = self._get_code_files()
|
||||
|
||||
for file_path in code_files[:20]:
|
||||
content = self._safe_read_file(file_path)
|
||||
if content:
|
||||
if content.endswith("\n"):
|
||||
with_newline += 1
|
||||
else:
|
||||
without_newline += 1
|
||||
|
||||
total = with_newline + without_newline
|
||||
if total == 0:
|
||||
return {"has_trailing_newline": None}
|
||||
|
||||
return {
|
||||
"has_trailing_newline": with_newline > without_newline,
|
||||
"ratio": round(with_newline / total, 2),
|
||||
}
|
||||
|
||||
def _get_code_files(self) -> list[Path]:
|
||||
"""Get list of code files to analyze."""
|
||||
extensions = [".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java", ".rb", ".php", ".c", ".cpp"]
|
||||
files = []
|
||||
try:
|
||||
for ext in extensions:
|
||||
files.extend(self.project_path.rglob(f"*{ext}"))
|
||||
except PermissionError:
|
||||
pass
|
||||
return sorted(set(files))
|
||||
|
||||
def _safe_read_file(self, path: Path) -> str | None:
|
||||
"""Safely read a file."""
|
||||
try:
|
||||
return path.read_text(encoding="utf-8")
|
||||
except (IOError, UnicodeDecodeError):
|
||||
return None
|
||||
Reference in New Issue
Block a user