Add language detector, code analyzer, and issue detector
This commit is contained in:
173
src/gdiffer/code_analyzer.py
Normal file
173
src/gdiffer/code_analyzer.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""Code analyzer using tree-sitter for AST-based analysis."""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from gdiffer.language_detector import LanguageDetector
|
||||
|
||||
|
||||
# Mapping from detected language name to its tree-sitter grammar identifier.
# For these languages the two happen to coincide, but keeping the explicit
# table lets the grammar name diverge from the detector's label later.
LANGUAGE_GRAMMARS = {
    'python': 'python',
    'javascript': 'javascript',
    'typescript': 'typescript',
    'java': 'java',
    'go': 'go',
    'rust': 'rust',
    'c': 'c',
    'cpp': 'cpp',
    'ruby': 'ruby',
    'php': 'php',
}
|
||||
|
||||
|
||||
class CodeAnalyzer:
    """Analyze source code structure via tree-sitter, with a regex fallback.

    When a tree-sitter parser cannot be created for a language (library
    missing, grammar unavailable), analysis degrades to lightweight regex
    heuristics in ``_analyze_without_parser`` instead of failing.
    """

    def __init__(self):
        self.language_detector = LanguageDetector()
        # Cache of language -> parser (or None when unavailable) so parser
        # construction is attempted at most once per language.
        self._parsers = {}

    def _get_parser(self, language: str):
        """Return a cached tree-sitter parser for *language*, or None.

        None is cached on any failure so later calls short-circuit.
        """
        if language not in self._parsers:
            try:
                import tree_sitter
                lang_name = LANGUAGE_GRAMMARS.get(language, language)
                # NOTE(review): modern tree_sitter bindings expect a
                # Language object here, not a grammar-name string, so this
                # call likely raises and we fall back to regex analysis —
                # confirm against the pinned tree-sitter version.
                self._parsers[language] = tree_sitter.Parser(lang_name)
            except Exception:
                # Missing package or bad grammar: disable AST analysis
                # for this language rather than propagating the error.
                self._parsers[language] = None
        return self._parsers[language]

    def analyze_code(self, code: str, language: str = "text") -> dict:
        """Analyze *code* and return a dict of structural findings.

        Returned keys: 'language', 'functions', 'classes', 'imports',
        'function_calls', 'change_summary', plus 'ast_info' when AST
        parsing succeeds. Plain text or empty input yields empty results.
        """
        result = {
            'language': language, 'functions': [], 'classes': [],
            'imports': [], 'function_calls': [], 'change_summary': "",
        }
        if language == "text" or not code.strip():
            return result
        parser = self._get_parser(language)
        if parser is None:
            result['change_summary'] = self._analyze_without_parser(code)
            return result
        try:
            tree = parser.parse(code.encode() if isinstance(code, str) else code)
            ast_info = self._extract_ast_info(tree.root_node, language)
            result['ast_info'] = ast_info
            # Bug fix: surface the extracted entities at the top level.
            # Previously only 'ast_info' was populated, so callers such as
            # summarize_change (which reads result['functions'] etc.) never
            # saw AST-derived functions/classes.
            result['functions'] = ast_info['functions']
            result['classes'] = ast_info['classes']
            result['imports'] = ast_info['imports']
            result['function_calls'] = ast_info['function_calls']
            result['change_summary'] = self._generate_summary(ast_info)
        except Exception:
            result['change_summary'] = self._analyze_without_parser(code)
        return result

    def _extract_ast_info(self, node, language: str) -> dict:
        """Recursively collect functions, classes and imports from *node*.

        Node-type keyword lists cover several grammars at once; a node
        matching none of them still has its children visited.
        """
        info = {'functions': [], 'classes': [], 'imports': [], 'function_calls': [], 'nested_nodes': []}
        if node is None:
            return info
        node_type = node.type
        node_text = node.text.decode() if isinstance(node.text, bytes) else node.text
        function_keywords = ['function_definition', 'function_declaration', 'method_definition', 'func']
        class_keywords = ['class_definition', 'class_declaration', 'struct', 'impl']
        import_keywords = ['import_statement', 'import_from_statement', 'import', 'require']
        if node_type in function_keywords:
            info['functions'].append(self._extract_function_info(node, language))
        if node_type in class_keywords:
            info['classes'].append(self._extract_class_info(node, language))
        if node_type in import_keywords:
            info['imports'].append(node_text)
        # Merge results from all children; 'nested_nodes' is intentionally
        # not propagated (only top-level aggregation is needed).
        for child in node.children:
            child_info = self._extract_ast_info(child, language)
            info['functions'].extend(child_info['functions'])
            info['classes'].extend(child_info['classes'])
            info['imports'].extend(child_info['imports'])
            info['function_calls'].extend(child_info['function_calls'])
        return info

    def _extract_function_info(self, node, language: str) -> dict:
        """Pull name, parameters, and start line from a function node."""
        name = ""
        params = []
        # tree-sitter points are 0-based; report 1-based line numbers.
        start_line = node.start_point[0] + 1 if node.start_point else 0
        for child in node.children:
            if child.type in ['identifier', 'function_name', 'name']:
                name = child.text.decode() if isinstance(child.text, bytes) else child.text
            elif child.type in ['parameters', 'parameter_list', 'formal_parameters']:
                params = self._extract_parameters(child)
        return {'name': name, 'parameters': params, 'start_line': start_line}

    def _extract_class_info(self, node, language: str) -> dict:
        """Pull name and start line from a class/struct node."""
        name = ""
        start_line = node.start_point[0] + 1 if node.start_point else 0
        for child in node.children:
            if child.type in ['identifier', 'name', 'type_identifier']:
                # Keep only the first identifier (the class name, not e.g.
                # a base-class identifier that appears later).
                if not name:
                    name = child.text.decode() if isinstance(child.text, bytes) else child.text
        return {'name': name, 'start_line': start_line, 'methods': []}

    def _extract_parameters(self, node) -> list[str]:
        """Collect parameter names from a parameter-list node."""
        params = []
        for child in node.children:
            if child.type in ['identifier', 'parameter', 'positional_argument']:
                param_name = child.text.decode() if isinstance(child.text, bytes) else child.text
                # Skip punctuation tokens some grammars expose as children.
                if param_name and param_name not in [',', '(', ')']:
                    params.append(param_name)
        return params

    def _analyze_without_parser(self, code: str) -> str:
        """Regex-based fallback summary used when no AST parser exists.

        Counts diff-style +/- lines and probes per-language function and
        class patterns, stopping at the first language that matches.
        """
        summary_parts = []
        added_lines = [l for l in code.splitlines() if l.strip().startswith('+') and not l.strip().startswith('+++')]
        removed_lines = [l for l in code.splitlines() if l.strip().startswith('-') and not l.strip().startswith('---')]
        if added_lines or removed_lines:
            summary_parts.append(f"Added {len(added_lines)} lines, removed {len(removed_lines)} lines")
        func_patterns = {
            'python': r'^def\s+(\w+)', 'javascript': r'^function\s+(\w+)', 'java': r'\w+\s+\w+\s*\(',
            'go': r'^func\s+(\w+)', 'rust': r'^fn\s+(\w+)',
        }
        for lang, pattern in func_patterns.items():
            funcs = re.findall(pattern, code, re.MULTILINE)
            if funcs:
                # Matches may be strings or tuples (multi-group patterns);
                # normalize to the first non-empty group.
                func_names = [f if isinstance(f, str) else next((x for x in f if x), '') for f in funcs if f]
                if func_names:
                    summary_parts.append(f"Functions: {', '.join(func_names[:5])}")
                break
        class_patterns = {'python': r'^class\s+(\w+)', 'javascript': r'^class\s+(\w+)', 'java': r'^\s*class\s+(\w+)'}
        for lang, pattern in class_patterns.items():
            classes = re.findall(pattern, code, re.MULTILINE)
            if classes:
                summary_parts.append(f"Classes/Structs: {', '.join(classes[:3])}")
                break
        return '. '.join(summary_parts) if summary_parts else "Code changes detected"

    def _generate_summary(self, ast_info: dict) -> str:
        """Build a short human-readable summary from extracted AST info."""
        summary_parts = []
        funcs = ast_info.get('functions', [])
        if funcs:
            func_names = [f['name'] for f in funcs if f.get('name')]
            if func_names:
                summary_parts.append(f"Functions: {', '.join(func_names[:5])}")
        classes = ast_info.get('classes', [])
        if classes:
            class_names = [c['name'] for c in classes if c.get('name')]
            if class_names:
                summary_parts.append(f"Classes: {', '.join(class_names[:3])}")
        return '. '.join(summary_parts) if summary_parts else "Code changes detected"

    def summarize_change(self, old_code: str, new_code: str, language: str = "text") -> str:
        """Describe the structural diff between *old_code* and *new_code*.

        Reports added/removed function and class names (by set difference
        on names) and the net line-count change.
        """
        old_analysis = self.analyze_code(old_code, language)
        new_analysis = self.analyze_code(new_code, language)
        summary_parts = []
        old_funcs = set(f['name'] for f in old_analysis.get('functions', []) if f.get('name'))
        new_funcs = set(f['name'] for f in new_analysis.get('functions', []) if f.get('name'))
        added_funcs = new_funcs - old_funcs
        removed_funcs = old_funcs - new_funcs
        if added_funcs:
            summary_parts.append(f"Added functions: {', '.join(sorted(added_funcs))}")
        if removed_funcs:
            summary_parts.append(f"Removed functions: {', '.join(sorted(removed_funcs))}")
        old_classes = set(c['name'] for c in old_analysis.get('classes', []) if c.get('name'))
        new_classes = set(c['name'] for c in new_analysis.get('classes', []) if c.get('name'))
        added_classes = new_classes - old_classes
        removed_classes = old_classes - new_classes
        if added_classes:
            summary_parts.append(f"Added classes: {', '.join(sorted(added_classes))}")
        if removed_classes:
            summary_parts.append(f"Removed classes: {', '.join(sorted(removed_classes))}")
        line_diff = len(new_code.splitlines()) - len(old_code.splitlines())
        if line_diff != 0:
            summary_parts.append(f"Line count: {'+' if line_diff > 0 else ''}{line_diff}")
        return '. '.join(summary_parts) if summary_parts else "Code modified"
|
||||
|
||||
|
||||
def analyze_code(code: str, language: str = "text") -> dict:
    """Module-level convenience wrapper: analyze *code* with a fresh CodeAnalyzer."""
    return CodeAnalyzer().analyze_code(code, language)
|
||||
|
||||
|
||||
def summarize_change(old_code: str, new_code: str, language: str = "text") -> str:
    """Module-level convenience wrapper: summarize a change with a fresh CodeAnalyzer."""
    return CodeAnalyzer().summarize_change(old_code, new_code, language)
|
||||
Reference in New Issue
Block a user