Add naming, style, and documentation analyzers
This commit is contained in:
189
src/contextgen/analyzers/style_analyzer.py
Normal file
189
src/contextgen/analyzers/style_analyzer.py
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
"""Style analyzer for indentation, quotes, and formatting patterns."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class StyleAnalyzer:
|
||||||
|
"""Analyzes code style patterns."""
|
||||||
|
|
||||||
|
def __init__(self, project_path: Path):
|
||||||
|
self.project_path = project_path
|
||||||
|
|
||||||
|
def analyze(self) -> dict[str, Any]:
|
||||||
|
"""Analyze style patterns across project files."""
|
||||||
|
indentation = self._detect_indentation()
|
||||||
|
quote_style = self._detect_quote_style()
|
||||||
|
line_endings = self._detect_line_endings()
|
||||||
|
max_line_length = self._detect_line_length()
|
||||||
|
trailing_newline = self._detect_trailing_newline()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"indentation": indentation,
|
||||||
|
"quote_style": quote_style,
|
||||||
|
"line_endings": line_endings,
|
||||||
|
"max_line_length": max_line_length,
|
||||||
|
"trailing_newline": trailing_newline,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _detect_indentation(self) -> dict[str, Any]:
|
||||||
|
"""Detect indentation style and width."""
|
||||||
|
indentations: dict[int, int] = {}
|
||||||
|
|
||||||
|
code_files = self._get_code_files()
|
||||||
|
|
||||||
|
for file_path in code_files[:30]:
|
||||||
|
content = self._safe_read_file(file_path)
|
||||||
|
if content:
|
||||||
|
indent_width = self._analyze_indent_width(content)
|
||||||
|
if indent_width:
|
||||||
|
indentations[indent_width] = indentations.get(indent_width, 0) + 1
|
||||||
|
|
||||||
|
if not indentations:
|
||||||
|
return {"style": "unknown", "width": None}
|
||||||
|
|
||||||
|
dominant_width: int = max(indentations.keys(), key=lambda k: indentations[k])
|
||||||
|
|
||||||
|
style = "spaces" if dominant_width in [2, 4] else "tabs"
|
||||||
|
|
||||||
|
return {"style": style, "width": dominant_width}
|
||||||
|
|
||||||
|
def _analyze_indent_width(self, content: str) -> int | None:
|
||||||
|
"""Analyze the indentation width from content."""
|
||||||
|
lines = content.split("\n")
|
||||||
|
indent_counts: dict[int, int] = {}
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
if not line.strip():
|
||||||
|
continue
|
||||||
|
leading_spaces = len(line) - len(line.lstrip())
|
||||||
|
leading_tabs = len(line) - len(line.lstrip("\t"))
|
||||||
|
|
||||||
|
if leading_spaces > 0 and leading_spaces % 2 == 0:
|
||||||
|
indent_counts[leading_spaces] = indent_counts.get(leading_spaces, 0) + 1
|
||||||
|
elif leading_tabs > 0:
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if indent_counts:
|
||||||
|
return min(indent_counts.keys(), key=lambda k: indent_counts[k])
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _detect_quote_style(self) -> dict[str, Any]:
|
||||||
|
"""Detect quote style (single vs double)."""
|
||||||
|
single_count = 0
|
||||||
|
double_count = 0
|
||||||
|
|
||||||
|
code_files = self._get_code_files()
|
||||||
|
|
||||||
|
for file_path in code_files[:30]:
|
||||||
|
content = self._safe_read_file(file_path)
|
||||||
|
if content:
|
||||||
|
content = self._remove_string_literals(content)
|
||||||
|
single_count += content.count("'") - content.count("\\'")
|
||||||
|
double_count += content.count('"') - content.count('\\"')
|
||||||
|
|
||||||
|
total = single_count + double_count
|
||||||
|
if total == 0:
|
||||||
|
return {"style": "unknown", "ratio": None}
|
||||||
|
|
||||||
|
single_ratio = single_count / total
|
||||||
|
|
||||||
|
if single_ratio > 0.6:
|
||||||
|
style = "single"
|
||||||
|
elif double_ratio := 1 - single_ratio > 0.6:
|
||||||
|
style = "double"
|
||||||
|
else:
|
||||||
|
style = "mixed"
|
||||||
|
|
||||||
|
return {"style": style, "single_ratio": round(single_ratio, 2)}
|
||||||
|
|
||||||
|
def _remove_string_literals(self, content: str) -> str:
|
||||||
|
"""Remove string literals from content to avoid false positives."""
|
||||||
|
pattern = r'(?:"(?:[^"\\]|\\.)*")|(?:\'(?:[^\'\\]|\\.)*\')'
|
||||||
|
return re.sub(pattern, '""', content)
|
||||||
|
|
||||||
|
def _detect_line_endings(self) -> dict[str, Any]:
|
||||||
|
"""Detect line ending style (LF vs CRLF)."""
|
||||||
|
crlf_count = 0
|
||||||
|
lf_count = 0
|
||||||
|
|
||||||
|
code_files = self._get_code_files()
|
||||||
|
|
||||||
|
for file_path in code_files[:20]:
|
||||||
|
content = self._safe_read_file(file_path)
|
||||||
|
if content:
|
||||||
|
crlf_count += content.count("\r\n")
|
||||||
|
lf_count += content.count("\n") - crlf_count
|
||||||
|
|
||||||
|
total = crlf_count + lf_count
|
||||||
|
if total == 0:
|
||||||
|
return {"style": "unknown"}
|
||||||
|
|
||||||
|
if crlf_count > lf_count:
|
||||||
|
return {"style": "CRLF", "ratio": round(crlf_count / total, 2)}
|
||||||
|
else:
|
||||||
|
return {"style": "LF", "ratio": round(lf_count / total, 2)}
|
||||||
|
|
||||||
|
def _detect_line_length(self) -> dict[str, Any]:
|
||||||
|
"""Detect preferred line length."""
|
||||||
|
lengths: dict[int, int] = {}
|
||||||
|
|
||||||
|
code_files = self._get_code_files()
|
||||||
|
|
||||||
|
for file_path in code_files[:20]:
|
||||||
|
content = self._safe_read_file(file_path)
|
||||||
|
if content:
|
||||||
|
for line in content.split("\n"):
|
||||||
|
line_len = len(line.rstrip())
|
||||||
|
if line_len > 0:
|
||||||
|
bucket = (line_len // 10) * 10
|
||||||
|
lengths[bucket] = lengths.get(bucket, 0) + 1
|
||||||
|
|
||||||
|
if not lengths:
|
||||||
|
return {"max": None, "preferred": None}
|
||||||
|
|
||||||
|
max_bucket: int = max(lengths.keys(), key=lambda k: lengths[k])
|
||||||
|
return {"max": max_bucket + 10, "preferred": max_bucket}
|
||||||
|
|
||||||
|
def _detect_trailing_newline(self) -> dict[str, Any]:
|
||||||
|
"""Detect if files typically have trailing newlines."""
|
||||||
|
with_newline = 0
|
||||||
|
without_newline = 0
|
||||||
|
|
||||||
|
code_files = self._get_code_files()
|
||||||
|
|
||||||
|
for file_path in code_files[:20]:
|
||||||
|
content = self._safe_read_file(file_path)
|
||||||
|
if content:
|
||||||
|
if content.endswith("\n"):
|
||||||
|
with_newline += 1
|
||||||
|
else:
|
||||||
|
without_newline += 1
|
||||||
|
|
||||||
|
total = with_newline + without_newline
|
||||||
|
if total == 0:
|
||||||
|
return {"has_trailing_newline": None}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"has_trailing_newline": with_newline > without_newline,
|
||||||
|
"ratio": round(with_newline / total, 2),
|
||||||
|
}
|
||||||
|
|
||||||
|
def _get_code_files(self) -> list[Path]:
|
||||||
|
"""Get list of code files to analyze."""
|
||||||
|
extensions = [".py", ".js", ".ts", ".tsx", ".go", ".rs", ".java", ".rb", ".php", ".c", ".cpp"]
|
||||||
|
files = []
|
||||||
|
try:
|
||||||
|
for ext in extensions:
|
||||||
|
files.extend(self.project_path.rglob(f"*{ext}"))
|
||||||
|
except PermissionError:
|
||||||
|
pass
|
||||||
|
return sorted(set(files))
|
||||||
|
|
||||||
|
def _safe_read_file(self, path: Path) -> str | None:
|
||||||
|
"""Safely read a file."""
|
||||||
|
try:
|
||||||
|
return path.read_text(encoding="utf-8")
|
||||||
|
except (IOError, UnicodeDecodeError):
|
||||||
|
return None
|
||||||
Reference in New Issue
Block a user