"""Tree-sitter based pattern scanner for multi-language code analysis."""

import ast
import logging
import re
from pathlib import Path
from typing import Callable, Optional

from ..core.models import Issue, IssueCategory, SeverityLevel

logger = logging.getLogger(__name__)


class TreeSitterScanner:
    """Multi-language pattern scanner using Tree-sitter.

    NOTE(review): every check in this class is a regular-expression (or, for
    function length, ``ast``) heuristic; ``_parsers`` is initialised in
    ``__init__`` but no method here reads it.

    Each ``*_PATTERNS`` table is a list of ``(regex, message)`` pairs. The
    regexes are applied to one source line at a time, so none of them can
    match across line boundaries.
    """

    # Value reported as ``scanner_name`` on every issue and as the plugin name.
    SCANNER_NAME = "tree-sitter"

    # Functions spanning more than this many lines are reported as too long.
    MAX_FUNCTION_LINES = 50

    CREDENTIAL_PATTERNS = [
        (r"(?:api[_-]?key|apikey|secret|password|passwd|pwd|token|auth)\s*[:=]\s*['\"][a-zA-Z0-9_\-]{20,}['\"]",
         "Potential hardcoded credential detected"),
        (r"(?:aws[_-]?(?:access[_-]?key[_-]?id|secret[_-]?access[_-]?key))\s*[:=]\s*['\"][A-Z0-9]{20,}['\"]",
         "Potential AWS credentials hardcoded"),
        (r"['\"][A-Za-z0-9+\/]{40,}['\"]", "Potential API token or key detected"),
        (r"(?:sk[_-]?live[_-]|[a-zA-Z0-9]{20,}[._]?)(?:secret|key|token)",
         "Potential secret key detected"),
        (r"ghp_[a-zA-Z0-9]{36}", "Potential GitHub PAT detected"),
        (r"gho_[a-zA-Z0-9]{36}", "Potential GitHub OAuth token detected"),
        (r"eyJ[a-zA-Z0-9_-]*\.eyJ[a-zA-Z0-9_-]*", "Potential JWT token detected"),
    ]

    SQL_INJECTION_PATTERNS = [
        (r"['\"].*?(?:SELECT|INSERT|UPDATE|DELETE|DROP|ALTER).*?['\"].*?%",
         "Potential SQL injection - string formatting with user input"),
        (r"(?:execute|exec)\s*\(\s*f?['\"].*?(?:SELECT|INSERT|UPDATE|DELETE)",
         "Potential SQL injection - f-string in SQL execution"),
        # The double quotes must be escaped: written unescaped, the raw string
        # literal ends immediately and the module fails to compile.
        (r"\".*?\".*\+.*?(?:user|input|param)",
         "Potential SQL injection - string concatenation"),
        (r"format\s*\(\s*f?['\"].*?(?:SELECT|INSERT|UPDATE|DELETE)",
         "Potential SQL injection - format() with SQL"),
    ]

    COMMAND_INJECTION_PATTERNS = [
        (r"os\.system\s*\(\s*f?['\"].*?['\"]",
         "Potential command injection - os.system with f-string"),
        (r"subprocess\.(?:call|run|Popen)\s*\([^)]*shell\s*=\s*True",
         "Potential command injection - shell=True with user input"),
        # \b keeps names that merely end in "eval"/"exec" (e.g. literal_eval)
        # from being reported.
        (r"\beval\s*\(\s*f?['\"].*?['\"]", "Potential code injection - eval with f-string"),
        (r"\bexec\s*\(\s*f?['\"].*?['\"]", "Potential code injection - exec with f-string"),
    ]

    MUTABLE_DEFAULT_PATTERNS = [
        (r"def\s+\w+\s*\([^)]*=\s*\[", "Mutable default argument - use None and initialize inside"),
        (r"def\s+\w+\s*\([^)]*=\s*\{", "Mutable default argument - use None and initialize inside"),
    ]

    ERROR_HANDLING_PATTERNS = [
        # A trailing comment after the colon is allowed.
        (r"except\s*:\s*(?:#.*)?$", "Bare except clause - catch specific exceptions"),
        # Valid code always has a colon (optionally preceded by "as name").
        (r"except\s+Exception\s*(?:as\s+\w+\s*)?:",
         "Catch-all except Exception - catch specific exceptions"),
        # One-line handlers; the two-line form is handled by
        # _scan_silent_handlers because patterns only ever see a single line.
        (r"except\s*:\s*pass\b", "Silent exception handling - at least log the error"),
        (r"except\s+[^:]+:\s*pass\b", "Silent exception handling - at least log the error"),
    ]

    # Substrings (compared against the lower-cased line) that mark a credential
    # match as a probable placeholder rather than a real secret.
    _FALSE_POSITIVE_INDICATORS = (
        "example", "placeholder", "test", "fake", "demo", "mock",
        "your_", "todo", "fixme", "xxx",
    )

    # File extension -> language name used by scan_content.
    _LANGUAGE_BY_EXTENSION = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
        ".jsx": "javascript",
    }

    # An ``except`` header with nothing but an optional comment after the colon.
    _EXCEPT_HEADER_RE = re.compile(r"^\s*except\b[^:]*:\s*(?:#.*)?$")
    # A line consisting solely of ``pass`` (optionally followed by a comment).
    _PASS_ONLY_RE = re.compile(r"^\s*pass\s*(?:#.*)?$")

    def __init__(self):
        """Initialize the Tree-sitter scanner."""
        self._parsers = {}

    def scan_file(self, file_path: str) -> list[Issue]:
        """Scan a single file for patterns.

        The file is read as UTF-8 with undecodable bytes replaced, and the
        language is derived from the file extension.

        Args:
            file_path: Path of the file to scan.

        Returns:
            The issues found, or an empty list when the file cannot be read
            or scanning fails. This method never raises; failures are logged.
        """
        try:
            content = Path(file_path).read_text(encoding="utf-8", errors="replace")
            language = self._detect_language(file_path)
            return self.scan_content(content, file_path, language)
        except OSError as exc:
            # Missing file, directory, permission problem, ...
            logger.debug("Skipping unreadable file %s: %s", file_path, exc)
        except Exception:
            # Keep the best-effort contract, but do not hide the failure.
            logger.exception("Unexpected error while scanning %s", file_path)
        return []

    def scan_content(self, content: str, file_path: str, language: str) -> list[Issue]:
        """Scan code content for patterns.

        Credential, SQL-injection and command-injection checks run for every
        language; the Python-specific checks run only when ``language`` is
        ``"python"``.

        Args:
            content: Full source text.
            file_path: Path recorded on each reported issue.
            language: Language name as returned by ``_detect_language``.

        Returns:
            All issues found, grouped by check in the order described above.
        """
        # Split on "\n" only so that line numbers match what editors show.
        lines = content.split('\n')

        issues = []
        issues.extend(self._scan_credential_patterns(content, file_path, lines))
        issues.extend(self._scan_sql_injection_patterns(content, file_path, lines))
        issues.extend(self._scan_command_injection_patterns(content, file_path, lines))

        if language == "python":
            issues.extend(self._scan_python_patterns(content, file_path, lines))

        return issues

    def _scan_lines(
        self,
        patterns: list,
        lines: list[str],
        file_path: str,
        *,
        severity,
        category,
        suggestion: str,
        flags: int = 0,
        skip: Optional[Callable[[str], bool]] = None,
    ) -> list[Issue]:
        """Apply each ``(regex, message)`` pair to every line and collect issues.

        Issues are produced pattern by pattern (all hits of the first pattern,
        then all hits of the second, ...), each group in line order.

        Args:
            patterns: ``(regex, message)`` pairs to search for.
            lines: Source lines; reported line numbers are 1-based.
            file_path: Path recorded on each issue.
            severity: Severity assigned to every issue from this call.
            category: Category assigned to every issue from this call.
            suggestion: Remediation hint attached to every issue.
            flags: ``re`` flags used when compiling the patterns.
            skip: Optional predicate; matching lines for which it returns true
                are not reported.

        Returns:
            The list of issues found.
        """
        issues = []
        for pattern, message in patterns:
            regex = re.compile(pattern, flags)
            for line_number, line in enumerate(lines, 1):
                if regex.search(line) is None:
                    continue
                if skip is not None and skip(line):
                    continue
                issues.append(Issue(
                    severity=severity,
                    category=category,
                    file_path=file_path,
                    line_number=line_number,
                    message=message,
                    suggestion=suggestion,
                    scanner_name=self.SCANNER_NAME,
                ))
        return issues

    def _scan_credential_patterns(self, content: str, file_path: str, lines: list[str]) -> list[Issue]:
        """Scan for hardcoded credentials.

        Comment lines are scanned too; a match is dropped only when the line
        looks like a placeholder (see ``_is_false_positive``).
        """
        return self._scan_lines(
            self.CREDENTIAL_PATTERNS,
            lines,
            file_path,
            severity=SeverityLevel.CRITICAL,
            category=IssueCategory.SECURITY,
            suggestion="Move credentials to environment variables or config file",
            flags=re.IGNORECASE,
            skip=self._is_false_positive,
        )

    def _scan_sql_injection_patterns(self, content: str, file_path: str, lines: list[str]) -> list[Issue]:
        """Scan for SQL injection vulnerabilities, ignoring comment lines."""
        return self._scan_lines(
            self.SQL_INJECTION_PATTERNS,
            lines,
            file_path,
            severity=SeverityLevel.HIGH,
            category=IssueCategory.SECURITY,
            suggestion="Use parameterized queries or ORM",
            flags=re.IGNORECASE,
            skip=self._is_comment,
        )

    def _scan_command_injection_patterns(self, content: str, file_path: str, lines: list[str]) -> list[Issue]:
        """Scan for command injection vulnerabilities, ignoring comment lines."""
        return self._scan_lines(
            self.COMMAND_INJECTION_PATTERNS,
            lines,
            file_path,
            severity=SeverityLevel.HIGH,
            category=IssueCategory.SECURITY,
            suggestion="Sanitize user input or use subprocess without shell=True",
            flags=re.IGNORECASE,
            skip=self._is_comment,
        )

    def _scan_python_patterns(self, content: str, file_path: str, lines: list[str]) -> list[Issue]:
        """Scan for Python-specific patterns.

        Runs, in order: mutable default arguments, the single-line
        error-handling patterns, ``except`` blocks whose body is a lone
        ``pass``, and overly long functions.
        """
        issues = self._scan_lines(
            self.MUTABLE_DEFAULT_PATTERNS,
            lines,
            file_path,
            severity=SeverityLevel.MEDIUM,
            category=IssueCategory.ANTI_PATTERN,
            suggestion="Use None as default and initialize inside the function",
            # Commented-out code is not worth reporting.
            skip=self._is_comment,
        )
        issues.extend(self._scan_lines(
            self.ERROR_HANDLING_PATTERNS,
            lines,
            file_path,
            severity=SeverityLevel.LOW,
            category=IssueCategory.ERROR_HANDLING,
            suggestion="Catch specific exceptions and handle them appropriately",
            flags=re.IGNORECASE | re.MULTILINE,
            skip=self._is_comment,
        ))
        issues.extend(self._scan_silent_handlers(file_path, lines))
        issues.extend(self._scan_complex_functions(content, file_path, lines))
        return issues

    def _scan_silent_handlers(self, file_path: str, lines: list[str]) -> list[Issue]:
        """Report ``except ...:`` headers whose next code line is a lone ``pass``.

        This covers the common two-line form that the per-line patterns cannot
        see. Blank and comment lines between the header and the ``pass`` are
        skipped. It is a heuristic: it does not check whether further
        statements follow the ``pass`` inside the same handler.
        """
        issues = []
        total = len(lines)
        for index, line in enumerate(lines):
            if self._EXCEPT_HEADER_RE.match(line) is None:
                continue
            # Find the first line after the header that holds actual code.
            cursor = index + 1
            while cursor < total and (not lines[cursor].strip() or self._is_comment(lines[cursor])):
                cursor += 1
            if cursor < total and self._PASS_ONLY_RE.match(lines[cursor]):
                issues.append(Issue(
                    severity=SeverityLevel.LOW,
                    category=IssueCategory.ERROR_HANDLING,
                    file_path=file_path,
                    line_number=index + 1,
                    message="Silent exception handling - at least log the error",
                    suggestion="Catch specific exceptions and handle them appropriately",
                    scanner_name=self.SCANNER_NAME,
                ))
        return issues

    def _scan_complex_functions(self, content: str, file_path: str, lines: list[str]) -> list[Issue]:
        """Scan for overly complex functions.

        A function (sync or async, at any nesting level) is reported when it
        spans more than ``MAX_FUNCTION_LINES`` lines, counted from its ``def``
        line to its last line inclusive. Issues are returned in source order
        and point at the ``def`` line.

        The source is parsed with ``ast``; content that is not valid Python
        for the running interpreter yields no issues.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError, RecursionError):
            # Unparsable source: function boundaries cannot be determined.
            return []

        functions = sorted(
            (
                node for node in ast.walk(tree)
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
            ),
            key=lambda node: node.lineno,
        )

        issues = []
        for node in functions:
            last_line = getattr(node, "end_lineno", None) or node.lineno
            length = last_line - node.lineno + 1
            if length > self.MAX_FUNCTION_LINES:
                issues.append(Issue(
                    severity=SeverityLevel.LOW,
                    category=IssueCategory.COMPLEXITY,
                    file_path=file_path,
                    line_number=node.lineno,
                    message=f"Function '{node.name}' is very long ({length} lines)",
                    suggestion="Consider breaking this function into smaller ones",
                    scanner_name=self.SCANNER_NAME,
                ))
        return issues

    def _is_false_positive(self, line: str) -> bool:
        """Check if a line is likely a false positive.

        Returns True when the line, compared case-insensitively, contains any
        of the placeholder markers in ``_FALSE_POSITIVE_INDICATORS``.
        """
        line_lower = line.lower()
        return any(indicator in line_lower for indicator in self._FALSE_POSITIVE_INDICATORS)

    def _is_comment(self, line: str) -> bool:
        """Check if a line is a comment (its first non-blank character is ``#``)."""
        return line.strip().startswith('#')

    def _detect_language(self, file_path: str) -> str:
        """Detect the programming language from file path.

        Returns the language mapped to the (case-insensitive) file extension,
        or ``"unknown"`` when the extension is not recognised.
        """
        ext = Path(file_path).suffix.lower()
        return self._LANGUAGE_BY_EXTENSION.get(ext, "unknown")

    def get_plugin_info(self) -> dict:
        """Get information about the Tree-sitter scanner."""
        return {
            "name": self.SCANNER_NAME,
            "version": "0.25.2",
            "description": "Multi-language pattern scanner for code analysis",
        }