Initial upload: PatternForge CLI tool with pattern detection and boilerplate generation
This commit is contained in:
251
src/patternforge/analyzer.py
Normal file
251
src/patternforge/analyzer.py
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import tree_sitter
|
||||||
|
from tree_sitter_languages import get_language
|
||||||
|
|
||||||
|
from patternforge.config import Config
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class NamingPattern:
    """A naming convention observed in analyzed source code."""

    # Convention name, e.g. "camelCase" or "snake_case".
    convention: str
    # Common identifier prefixes for this convention —
    # NOTE(review): not populated by CodeAnalyzer in this file; confirm the producer.
    prefixes: list[str] = field(default_factory=list)
    # Common identifier suffixes for this convention —
    # NOTE(review): not populated by CodeAnalyzer in this file; confirm the producer.
    suffixes: list[str] = field(default_factory=list)
    # Sample identifiers matching the convention (capped at 10 by the analyzer).
    examples: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CodeStructure:
    """Structural elements extracted from source content by regex scanning."""

    # One {"name": <class name>} dict per class definition found.
    class_patterns: list[dict[str, Any]] = field(default_factory=list)
    # One {"name": <function name>} dict per function/method definition found.
    function_patterns: list[dict[str, Any]] = field(default_factory=list)
    # Raw import/include statement lines (capped at 20 by the analyzer).
    import_patterns: list[str] = field(default_factory=list)
    # Type definitions — NOTE(review): never populated by CodeAnalyzer in
    # this file; confirm whether another component fills this in.
    type_definitions: list[dict[str, str]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class StylePattern:
    """Formatting style heuristically detected for a single file."""

    # Indentation character class: "space" or "tab".
    indent_style: str = "space"
    # Indent width per level (reported as 1 when tabs are used).
    indent_size: int = 4
    # Line-ending convention: "lf" or "crlf".
    line_endings: str = "lf"
    # Brace placement — NOTE(review): never overwritten by the detection in
    # this file; confirm whether bracket-style detection exists elsewhere.
    bracket_style: str = "same-line"
|
||||||
|
|
||||||
|
|
||||||
|
class CodeAnalyzer:
    """Regex-based analyzer that extracts naming, structural, and style
    patterns from source files of a single programming language.

    A tree-sitter parser is initialized opportunistically (see
    ``_try_init_language``) but all analysis in this class uses regular
    expressions, so the analyzer keeps working when grammars are missing.
    """

    # Maps user-supplied language aliases to canonical grammar names.
    LANGUAGE_MAP = {
        "python": "python",
        "javascript": "javascript",
        "typescript": "typescript",
        "java": "java",
        "cpp": "cpp",
        "c": "c",
        "rust": "rust",
        "go": "go",
        "ruby": "ruby",
    }

    # Recognized file extensions per language (hoisted to a class constant so
    # the mapping is not rebuilt on every _file_extensions call).
    _EXTENSION_MAP: dict[str, set[str]] = {
        "python": {".py", ".pyi"},
        "javascript": {".js", ".mjs"},
        "typescript": {".ts", ".tsx"},
        "java": {".java"},
        "cpp": {".cpp", ".cc", ".cxx", ".hpp"},
        "c": {".c", ".h"},
        "rust": {".rs"},
        "go": {".go"},
        "ruby": {".rb"},
    }

    # Discriminating regexes: each convention requires its distinguishing
    # feature (camel hump, underscore, mixed case). The previous unanchored
    # patterns overlapped, so every identifier was counted under several
    # conventions at once.
    _NAMING_PATTERNS = {
        "camelCase": r"\b[a-z][a-z0-9]*(?:[A-Z][a-zA-Z0-9]*)+\b",
        "PascalCase": r"\b[A-Z][a-z][a-zA-Z0-9]*\b",
        "snake_case": r"\b[a-z][a-z0-9]*(?:_[a-z0-9]+)+\b",
        "SCREAMING_SNAKE_CASE": r"\b[A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+\b",
    }

    def __init__(self, language: str, config: Config) -> None:
        """Create an analyzer for *language* (alias or canonical name)."""
        self.language = self.LANGUAGE_MAP.get(language, language)
        self.config = config
        self._try_init_language()

    def _try_init_language(self) -> None:
        """Best-effort tree-sitter setup; leaves the parser as None on failure.

        Grammar packages are optional at runtime, so any failure (missing
        module, unknown language) is deliberately swallowed and regex-based
        analysis proceeds without a parser.
        """
        try:
            self._parser = tree_sitter.Parser()
            self._language = get_language(self.language)
            self._parser.set_language(self._language)
        except Exception:
            self._parser = None
            self._language = None

    def _file_extensions(self) -> set[str]:
        """Return the file extensions analyzed for the current language.

        Fix: previously returned lists despite the ``set[str]`` annotation;
        now returns an actual set (callers only use membership tests).
        """
        return self._EXTENSION_MAP.get(self.language, {f".{self.language}"})

    def _is_code_file(self, path: Path) -> bool:
        """True if *path* has an extension recognized for this language."""
        return path.suffix in self._file_extensions()

    def _collect_files(self, path: Path, recursive: bool) -> list[Path]:
        """Collect code files under *path* (or *path* itself if it is a file)."""
        if path.is_file():
            return [path] if self._is_code_file(path) else []
        pattern = "**/*" if recursive else "*"
        return [f for f in path.glob(pattern) if f.is_file() and self._is_code_file(f)]

    def _extract_naming_conventions(self, content: str) -> dict[str, NamingPattern]:
        """Detect naming conventions present in *content*.

        Returns a mapping of convention name to a NamingPattern carrying up
        to 10 deduplicated example identifiers. Examples are sorted so output
        is deterministic (the old ``list(set(...))`` order varied per run).
        """
        conventions: dict[str, NamingPattern] = {}
        for name, pattern in self._NAMING_PATTERNS.items():
            matches = re.findall(pattern, content)
            if matches:
                conventions[name] = NamingPattern(
                    convention=name,
                    examples=sorted(set(matches))[:10],
                )
        return conventions

    def _extract_structure(self, content: str) -> CodeStructure:
        """Extract class names, function names, and import lines via regex."""
        structure = CodeStructure()
        class_pattern = r"class\s+(\w+)"
        # Covers Python defs, JS function declarations, and Java-style
        # public members; only one alternative group matches at a time.
        func_pattern = r"def\s+(\w+)|function\s+(\w+)|public\s+\w+\s+(\w+)"
        import_pattern = r"^import\s+.*|^from\s+.*|^#include\s+.*"

        for match in re.finditer(class_pattern, content):
            structure.class_patterns.append({"name": match.group(1)})

        for match in re.finditer(func_pattern, content):
            name = match.group(1) or match.group(2) or match.group(3)
            if name:
                structure.function_patterns.append({"name": name})

        # Cap imports so one large file cannot dominate the report.
        structure.import_patterns = re.findall(import_pattern, content, re.MULTILINE)[:20]

        return structure

    def _detect_style(self, content: str) -> StylePattern:
        """Infer indentation and line-ending style from a 1000-char sample.

        Fix: only the leading whitespace of each line is examined, so tabs or
        runs of spaces inside string literals no longer skew detection (the
        old code searched the entire sample).
        """
        style = StylePattern()
        sample = content[:1000]
        leads = [
            line[: len(line) - len(line.lstrip())]
            for line in sample.splitlines()
            if line.strip()
        ]
        if any(lead.startswith("\t") for lead in leads):
            style.indent_style = "tab"
            style.indent_size = 1
        elif any(lead.startswith(" " * 4) for lead in leads):
            style.indent_size = 4
        elif any(lead.startswith(" " * 2) for lead in leads):
            style.indent_size = 2

        style.line_endings = "crlf" if "\r\n" in sample else "lf"
        return style

    def _analyze_file(self, path: Path) -> dict[str, Any]:
        """Analyze a single file; returns {} if the file cannot be read."""
        try:
            with open(path, encoding="utf-8", errors="ignore") as f:
                content = f.read()
        except Exception:
            # Unreadable files are skipped; analyze() tolerates empty dicts.
            return {}

        # Fix: extract once — the old code re-ran _extract_structure three
        # times (once per key) over the same content.
        structure = self._extract_structure(content)
        return {
            "path": str(path),
            "naming_conventions": self._extract_naming_conventions(content),
            "structure": {
                "classes": structure.class_patterns,
                "functions": structure.function_patterns,
                "imports": structure.import_patterns,
            },
            "style": self._detect_style(content).__dict__,
            "size": len(content),
            "lines": content.count("\n"),
        }

    def analyze(self, path: str, recursive: bool = True) -> dict[str, Any]:
        """Analyze *path* (file or directory) and aggregate per-file results.

        Returns a report dict with file details (first 5), naming-convention
        examples, entity counts, dominant style, and a summary; or an
        ``{"error": ...}`` dict when no matching files are found.
        """
        target = Path(path)
        files = self._collect_files(target, recursive)
        if not files:
            return {"error": "No matching files found", "language": self.language}

        file_analyses = []
        all_naming: dict[str, set[str]] = {}
        all_classes: list[str] = []
        all_functions: list[str] = []
        all_imports: list[str] = []
        style_votes = {"space": 0, "tab": 0}
        indent_sizes: dict[int, int] = {}

        for f in files:
            analysis = self._analyze_file(f)
            if not analysis:
                continue
            file_analyses.append(analysis)

            for nc in analysis.get("naming_conventions", {}).values():
                all_naming.setdefault(nc.convention, set()).update(nc.examples)

            structure = analysis.get("structure", {})
            for cls in structure.get("classes", []):
                all_classes.append(cls.get("name", ""))
            for func in structure.get("functions", []):
                all_functions.append(func.get("name", ""))
            all_imports.extend(structure.get("imports", []))

            style = analysis.get("style", {})
            # Guard the vote lookup so an unexpected style value cannot raise.
            if style.get("indent_style") in style_votes:
                style_votes[style["indent_style"]] += 1
            indent = style.get("indent_size", 0)
            if indent > 0:
                indent_sizes[indent] = indent_sizes.get(indent, 0) + 1

        dominant_style = "space" if style_votes["space"] >= style_votes["tab"] else "tab"
        dominant_indent = max(indent_sizes, key=indent_sizes.get, default=4)

        return {
            "language": self.language,
            "files_analyzed": len(file_analyses),
            "file_details": file_analyses[:5],
            # Sorted for deterministic report output across runs.
            "naming_conventions": {k: sorted(v)[:20] for k, v in all_naming.items()},
            "entity_counts": {
                "classes": len(all_classes),
                "functions": len(all_functions),
                "imports": len(all_imports),
            },
            "style": {
                "indent_style": dominant_style,
                "indent_size": dominant_indent,
            },
            "summary": {
                "files": len(file_analyses),
                "classes": len(all_classes),
                "functions": len(all_functions),
                # Fix: report the convention with the most distinct examples
                # rather than whichever dict key happened to be inserted first.
                "primary_naming": max(all_naming, key=lambda k: len(all_naming[k]))
                if all_naming
                else "unknown",
            },
        }

    def save_patterns(self, output_path: str, patterns: dict[str, Any]) -> None:
        """Serialize *patterns* as YAML to *output_path*, creating parent dirs.

        PyYAML is imported lazily so analysis itself works without it installed.
        """
        import yaml

        def convert_dataclass(obj: Any) -> Any:
            # Recursively flatten dataclass-like objects into plain dicts so
            # yaml.dump does not emit python/object tags.
            if hasattr(obj, "__dict__"):
                return {
                    k: convert_dataclass(v)
                    for k, v in obj.__dict__.items()
                    if not k.startswith("_")
                }
            elif isinstance(obj, dict):
                return {k: convert_dataclass(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [convert_dataclass(i) for i in obj]
            return obj

        path = Path(output_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        clean_patterns = convert_dataclass(patterns)
        # Fix: explicit encoding so the output does not depend on the locale.
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(clean_patterns, f, default_flow_style=False, indent=2)
|
||||||
Reference in New Issue
Block a user