fix: resolve CI mypy type checking issues
This commit is contained in:
253
patternforge/src/patternforge/analyzer.py
Normal file
253
patternforge/src/patternforge/analyzer.py
Normal file
@@ -0,0 +1,253 @@
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import tree_sitter
|
||||
from tree_sitter_languages import get_language
|
||||
|
||||
from patternforge.config import Config
|
||||
|
||||
|
||||
@dataclass
class NamingPattern:
    """One identifier-naming convention together with sample identifiers.

    Only ``convention`` is required; each list field starts out as its own
    empty list so instances never share state.
    """

    # Label of the convention, e.g. "camelCase" or "snake_case".
    convention: str
    # Prefixes associated with the convention (empty unless filled by the caller).
    prefixes: list[str] = field(default_factory=list)
    # Suffixes associated with the convention (empty unless filled by the caller).
    suffixes: list[str] = field(default_factory=list)
    # Sample identifiers that follow the convention.
    examples: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class CodeStructure:
    """Container for the structural elements collected from a piece of source text.

    Every field defaults to a fresh empty list, so a bare ``CodeStructure()``
    can be filled incrementally.
    """

    # One dict per discovered class.
    class_patterns: list[dict[str, Any]] = field(default_factory=list)
    # One dict per discovered function or method.
    function_patterns: list[dict[str, Any]] = field(default_factory=list)
    # Raw import / include statements.
    import_patterns: list[str] = field(default_factory=list)
    # One string-to-string dict per type definition.
    type_definitions: list[dict[str, str]] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
class StylePattern:
    """Formatting style of a source file, with defaults for each setting."""

    # Indentation character: "space" or "tab".
    indent_style: str = "space"
    # Width of one indentation level.
    indent_size: int = 4
    # Line terminator: "lf" or "crlf".
    line_endings: str = "lf"
    # Bracket placement label; only the default value appears in this file.
    bracket_style: str = "same-line"
|
||||
|
||||
|
||||
class CodeAnalyzer:
    """Regex-based analyzer that summarises naming, structure and style of source files.

    The analyzer walks a file or directory, keeps the files whose extension
    belongs to the configured language, and aggregates per-file findings into
    a single report (see :meth:`analyze`). A tree-sitter parser is initialised
    on a best-effort basis; the heuristics in this class do not depend on it.
    """

    # Accepted language names; names not listed here pass through unchanged.
    LANGUAGE_MAP = {
        "python": "python",
        "javascript": "javascript",
        "typescript": "typescript",
        "java": "java",
        "cpp": "cpp",
        "c": "c",
        "rust": "rust",
        "go": "go",
        "ruby": "ruby",
    }

    # File extensions that count as source code for each known language.
    _EXTENSIONS = {
        "python": (".py", ".pyi"),
        "javascript": (".js", ".mjs"),
        "typescript": (".ts", ".tsx"),
        "java": (".java",),
        "cpp": (".cpp", ".cc", ".cxx", ".hpp"),
        "c": (".c", ".h"),
        "rust": (".rs",),
        "go": (".go",),
        "ruby": (".rb",),
    }

    # Word boundaries make each pattern match whole identifiers only, so
    # "my_var" is not also reported as the camelCase fragments "my" and "var".
    # Single-word names stay ambiguous by design: "foo" is both camelCase and
    # snake_case, "FOO" both PascalCase and SCREAMING_SNAKE_CASE.
    _NAMING_REGEXES = {
        "camelCase": re.compile(r"\b[a-z][a-zA-Z0-9]*\b"),
        "PascalCase": re.compile(r"\b[A-Z][a-zA-Z0-9]*\b"),
        "snake_case": re.compile(r"\b[a-z][a-z0-9_]*\b"),
        "SCREAMING_SNAKE_CASE": re.compile(r"\b[A-Z][A-Z0-9_]*\b"),
    }

    # The leading \b stops keywords from matching inside longer words
    # (for example "subclass Foo" in a comment).
    _CLASS_RE = re.compile(r"\bclass\s+(\w+)")
    _FUNC_RE = re.compile(r"\bdef\s+(\w+)|\bfunction\s+(\w+)|\bpublic\s+\w+\s+(\w+)")
    _IMPORT_RE = re.compile(r"^import\s+.*|^from\s+.*|^#include\s+.*", re.MULTILINE)

    # Number of leading characters of a file inspected for style detection.
    _STYLE_SAMPLE_CHARS = 1000
    # Per-file cap on the number of examples kept for each naming convention.
    _MAX_EXAMPLES = 10

    def __init__(self, language: str, config: Config) -> None:
        """Store the settings and try to set up a tree-sitter parser.

        Args:
            language: Language name; resolved through ``LANGUAGE_MAP`` and
                used as-is when it is not listed there.
            config: Project configuration, stored on the instance.
        """
        self.language = self.LANGUAGE_MAP.get(language, language)
        self.config = config
        self._parser: tree_sitter.Parser | None = None
        self._language: Any = None
        self._try_init_language()

    def _try_init_language(self) -> None:
        """Initialise the tree-sitter parser, leaving it as ``None`` on any failure."""
        try:
            parser = tree_sitter.Parser()
            grammar = get_language(self.language)
            parser.language = grammar
        except Exception:
            # Deliberately broad: a missing grammar or an incompatible
            # tree-sitter installation must not prevent regex-based analysis.
            self._parser = None
            self._language = None
        else:
            self._parser = parser
            self._language = grammar

    def _file_extensions(self) -> set[str]:
        """Return the extensions for the language; unknown languages map to ``.<language>``."""
        return set(self._EXTENSIONS.get(self.language, (f".{self.language}",)))

    def _is_code_file(self, path: Path) -> bool:
        """Return True when ``path`` has one of the language's extensions."""
        return path.suffix in self._file_extensions()

    def _collect_files(self, path: Path, recursive: bool) -> list[Path]:
        """Return the matching source files at ``path`` in sorted order.

        Args:
            path: A single file or a directory.
            recursive: When ``path`` is a directory, also descend into
                subdirectories.

        Returns:
            A one-element list for a matching file, an empty list for a
            non-matching file, otherwise the sorted matching files under
            the directory.
        """
        if path.is_file():
            return [path] if self._is_code_file(path) else []
        pattern = "**/*" if recursive else "*"
        # Sorting makes the result (and everything derived from it, such as
        # the "file_details" sample) independent of filesystem ordering.
        return [
            candidate
            for candidate in sorted(path.glob(pattern))
            if candidate.is_file() and self._is_code_file(candidate)
        ]

    @classmethod
    def _find_naming_examples(cls, content: str) -> dict[str, list[str]]:
        """Map each convention found in ``content`` to up to 10 sorted unique identifiers."""
        found: dict[str, list[str]] = {}
        for name, regex in cls._NAMING_REGEXES.items():
            unique = sorted(set(regex.findall(content)))
            if unique:
                found[name] = unique[: cls._MAX_EXAMPLES]
        return found

    def _extract_naming_conventions(self, content: str) -> dict[str, NamingPattern]:
        """Return a ``NamingPattern`` for every convention that has a match in ``content``."""
        return {
            name: NamingPattern(convention=name, examples=examples)
            for name, examples in self._find_naming_examples(content).items()
        }

    def _extract_structure(self, content: str) -> CodeStructure:
        """Collect class names, function names and import lines from ``content``.

        At most the first 20 import / include lines are kept.
        """
        structure = CodeStructure()
        for match in self._CLASS_RE.finditer(content):
            structure.class_patterns.append({"name": match.group(1)})
        for match in self._FUNC_RE.finditer(content):
            # Exactly one alternative of the pattern captured the name.
            name = match.group(1) or match.group(2) or match.group(3)
            if name:
                structure.function_patterns.append({"name": name})
        structure.import_patterns = self._IMPORT_RE.findall(content)[:20]
        return structure

    @staticmethod
    def _sniff_indent(sample: str) -> tuple[str, int] | None:
        """Infer ``(indent_style, indent_size)`` from leading whitespace in ``sample``.

        Returns ``("tab", 1)`` as soon as a non-blank line starts with a tab.
        Otherwise the shallowest space indentation of at least two columns
        decides between a size of 2 and 4. Returns ``None`` when no line is
        indented, so the caller can keep its defaults.
        """
        widths: list[int] = []
        for line in sample.splitlines():
            if not line.strip():
                continue
            if line[0] == "\t":
                return ("tab", 1)
            width = len(line) - len(line.lstrip(" "))
            # Single-space offsets are ignored; they are not treated as an
            # indentation level.
            if width >= 2:
                widths.append(width)
        if not widths:
            return None
        # Looking at the shallowest level keeps nested 2-space code (whose
        # inner lines contain four spaces) from being reported as 4-space.
        return ("space", 2 if min(widths) < 4 else 4)

    def _detect_style(self, content: str) -> StylePattern:
        """Detect indentation and line endings from the first 1000 characters of ``content``."""
        style = StylePattern()
        sample = content[: self._STYLE_SAMPLE_CHARS]
        detected = self._sniff_indent(sample)
        if detected is not None:
            style.indent_style, style.indent_size = detected
        style.line_endings = "crlf" if "\r\n" in sample else "lf"
        return style

    def _analyze_file(self, path: Path) -> dict[str, Any]:
        """Analyse one file and return its findings, or ``{}`` if it cannot be read.

        Undecodable bytes are dropped rather than raising.
        """
        try:
            # newline="" keeps the original line terminators; the default
            # mode rewrites "\r\n" to "\n", which hides CRLF files from
            # style detection.
            with open(path, encoding="utf-8", errors="ignore", newline="") as handle:
                raw = handle.read()
        except (OSError, ValueError):
            return {}

        # Everything except style detection works on newline-normalised
        # text, matching what a default text-mode read would have produced.
        content = raw.replace("\r\n", "\n").replace("\r", "\n")
        structure = self._extract_structure(content)
        return {
            "path": str(path),
            "naming_conventions": self._extract_naming_conventions(content),
            "structure": {
                "classes": structure.class_patterns,
                "functions": structure.function_patterns,
                "imports": structure.import_patterns,
            },
            "style": vars(self._detect_style(raw)),
            "size": len(content),
            "lines": content.count("\n"),
        }

    def analyze(self, path: str, recursive: bool = True) -> dict[str, Any]:
        """Analyse a file or directory and return an aggregated report.

        Args:
            path: File or directory to analyse.
            recursive: Descend into subdirectories when ``path`` is a directory.

        Returns:
            ``{"error": ..., "language": ...}`` when no file matches.
            Otherwise a dict with the keys ``language``, ``files_analyzed``,
            ``file_details`` (first five per-file results),
            ``naming_conventions``, ``entity_counts``, ``style`` and
            ``summary``.
        """
        files = self._collect_files(Path(path), recursive)
        if not files:
            return {"error": "No matching files found", "language": self.language}

        file_analyses: list[dict[str, Any]] = []
        all_naming: dict[str, set[str]] = {}
        all_classes: list[str] = []
        all_functions: list[str] = []
        all_imports: list[str] = []
        style_votes = {"space": 0, "tab": 0}
        indent_sizes: dict[int, int] = {}

        for source_file in files:
            analysis = self._analyze_file(source_file)
            if not analysis:
                continue  # unreadable file
            file_analyses.append(analysis)

            for pattern in analysis.get("naming_conventions", {}).values():
                if pattern.examples:
                    all_naming.setdefault(pattern.convention, set()).update(pattern.examples)

            structure = analysis.get("structure", {})
            all_classes.extend(cls.get("name", "") for cls in structure.get("classes", []))
            all_functions.extend(func.get("name", "") for func in structure.get("functions", []))
            all_imports.extend(structure.get("imports", []))

            style = analysis.get("style", {})
            indent_style = style.get("indent_style")
            if indent_style:
                style_votes[indent_style] = style_votes.get(indent_style, 0) + 1
            indent = style.get("indent_size", 0)
            if indent > 0:
                indent_sizes[indent] = indent_sizes.get(indent, 0) + 1

        dominant_style = "space" if style_votes["space"] >= style_votes["tab"] else "tab"
        dominant_indent = max(indent_sizes.items(), key=lambda item: item[1], default=(4, 0))[0]
        # The convention with the most distinct examples wins; ties keep the
        # convention that was seen first.
        primary_naming = (
            max(all_naming, key=lambda name: len(all_naming[name])) if all_naming else "unknown"
        )

        return {
            "language": self.language,
            "files_analyzed": len(file_analyses),
            "file_details": file_analyses[:5],
            "naming_conventions": {
                # Sorted so that the truncated sample is reproducible.
                name: sorted(examples)[:20]
                for name, examples in all_naming.items()
            },
            "entity_counts": {
                "classes": len(all_classes),
                "functions": len(all_functions),
                "imports": len(all_imports),
            },
            "style": {
                "indent_style": dominant_style,
                "indent_size": dominant_indent,
            },
            "summary": {
                "files": len(file_analyses),
                "classes": len(all_classes),
                "functions": len(all_functions),
                "primary_naming": primary_naming,
            },
        }

    def save_patterns(self, output_path: str, patterns: dict[str, Any]) -> None:
        """Write ``patterns`` to ``output_path`` as YAML, creating parent directories.

        Objects carrying a ``__dict__`` (such as the dataclasses used in the
        report) are converted to plain dicts of their public attributes
        before dumping.
        """
        import yaml

        def convert_dataclass(obj: Any) -> Any:
            # Containers are checked first so that dict/list subclasses keep
            # their items instead of being reduced to instance attributes.
            if isinstance(obj, dict):
                return {key: convert_dataclass(value) for key, value in obj.items()}
            if isinstance(obj, (list, tuple)):
                return [convert_dataclass(item) for item in obj]
            if isinstance(obj, (set, frozenset)):
                # Sets become sorted lists so the output is plain, stable YAML.
                return sorted((convert_dataclass(item) for item in obj), key=str)
            if hasattr(obj, "__dict__"):
                return {
                    key: convert_dataclass(value)
                    for key, value in obj.__dict__.items()
                    if not key.startswith("_")
                }
            return obj

        path = Path(output_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        clean_patterns = convert_dataclass(patterns)
        # Explicit encoding keeps the output identical across platforms.
        with open(path, "w", encoding="utf-8") as handle:
            yaml.dump(clean_patterns, handle, default_flow_style=False, indent=2)
|
||||
Reference in New Issue
Block a user