"""String scanner for detecting hardcoded strings."""

import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional, Set

from i18n_guardian.parsers.base import StringLiteral
from i18n_guardian.parsers.simple import get_default_registry


@dataclass
class Violation:
    """Represents a hardcoded string violation."""

    # The literal that triggered the violation, as returned by a parser.
    literal: StringLiteral
    # Translation key proposed by ``StringScanner._generate_key``.
    suggested_key: str
    # Human-readable explanation; the scanner always uses the default.
    reason: str = "Hardcoded string detected"


@dataclass
class ScanResult:
    """Result of a scan operation."""

    # Every literal that ``StringScanner._is_violation`` accepted.
    violations: List[Violation] = field(default_factory=list)
    # Number of files handed to a parser (after exclude patterns).
    files_scanned: int = 0
    # Total number of literals returned by the parsers.
    strings_found: int = 0
    # Always ``len(violations)`` once ``StringScanner.scan`` returns.
    violations_count: int = 0


class StringScanner:
    """Scanner for hardcoded strings in code.

    Literals are extracted by the parsers of the default registry and then
    filtered: short strings, URLs, e-mail addresses, regex-looking strings,
    calls to known i18n functions and numbers are not reported.
    """

    # Strings starting with an http(s) scheme.
    URL_PATTERN = re.compile(
        r"^https?://",
        re.IGNORECASE,
    )

    # Strings that consist solely of one e-mail address.
    EMAIL_PATTERN = re.compile(
        r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
    )

    # Strings delimited like a regex: start with "/" or "^", end with "/" or "$".
    REGEX_PATTERN = re.compile(
        r"^[/^].*[/$]$",
    )

    # Captures the identifier of a leading ``name(`` call.
    I18N_CALL_PATTERN = re.compile(
        r"^(\w+)\s*\(",
    )

    def __init__(
        self,
        min_length: int = 3,
        exclude_patterns: Optional[List[str]] = None,
        i18n_functions: Optional[List[str]] = None,
    ) -> None:
        """Configure the scanner.

        Args:
            min_length: Literals shorter than this are never reported.
            exclude_patterns: Patterns passed to ``Path.match``; matching
                files are skipped.
            i18n_functions: Function names whose calls are treated as
                already-translated text.
        """
        self.min_length = min_length
        self.exclude_patterns = exclude_patterns or []
        self.i18n_functions = set(i18n_functions or [])
        self.registry = get_default_registry()

    def scan(self, path: Path, i18n_functions: Optional[Set[str]] = None) -> ScanResult:
        """Scan a file or a directory tree for hardcoded strings.

        Args:
            path: A single file, or a directory that is searched recursively.
            i18n_functions: Extra i18n function names for this scan only;
                merged with the names given to the constructor.

        Returns:
            A ``ScanResult`` with the violations and the scan counters.
        """
        result = ScanResult()
        # Build a new set so the per-call names never leak into the instance.
        functions_to_exclude = self.i18n_functions | set(i18n_functions or ())

        for file_path in self._iter_files(path):
            if not self._should_include(file_path):
                continue

            literals = self._extract_literals(file_path)
            result.files_scanned += 1

            for literal in literals:
                result.strings_found += 1
                if not self._is_violation(literal, functions_to_exclude):
                    continue
                result.violations.append(
                    Violation(
                        literal=literal,
                        suggested_key=self._generate_key(literal),
                    )
                )

        result.violations_count = len(result.violations)
        return result

    def _iter_files(self, path: Path) -> List[Path]:
        """Return the files to scan under ``path``.

        A file path is returned as-is. For a directory, every regular file
        whose name ends with one of the registry's extensions is returned,
        each exactly once and in sorted order.
        """
        if path.is_file():
            return [path]

        # A set removes duplicates when one extension is a suffix of another;
        # ``is_file`` drops directories whose name happens to match.
        found: Set[Path] = set()
        for ext in self.registry.list_extensions():
            found.update(p for p in path.rglob(f"*{ext}") if p.is_file())

        # Sorted so that results are reproducible across filesystems.
        return sorted(found)

    def _should_include(self, file_path: Path) -> bool:
        """Return False when ``file_path`` matches any exclude pattern."""
        return not any(file_path.match(pattern) for pattern in self.exclude_patterns)

    def _extract_literals(self, file_path: Path) -> List[StringLiteral]:
        """Extract string literals from a file.

        Returns an empty list when the registry has no parser for the file.
        """
        parser = self.registry.get(file_path)
        if not parser:
            return []
        return parser.parse(file_path)

    def _is_violation(self, literal: StringLiteral, exclude_functions: Set[str]) -> bool:
        """Check if a string literal is a violation.

        A literal is reported unless it is too short, blank, a URL, an
        e-mail address, regex-like, a call to an excluded i18n function,
        numeric, or a single character.
        """
        value = literal.value

        if len(value) < self.min_length:
            return False

        # Whitespace-only strings carry no translatable text.
        if not value.strip():
            return False

        is_exempt = (
            self._is_url(value)
            or self._is_email(value)
            or self._is_regex(value)
            or self._is_i18n_call(value, exclude_functions)
            or self._is_numeric(value)
            or self._is_single_char(value)
        )
        return not is_exempt

    def _is_url(self, value: str) -> bool:
        """Check if value is a URL."""
        return self.URL_PATTERN.match(value) is not None

    def _is_email(self, value: str) -> bool:
        """Check if value is an email."""
        return self.EMAIL_PATTERN.match(value) is not None

    def _is_regex(self, value: str) -> bool:
        """Check if value looks like a regex pattern."""
        return self.REGEX_PATTERN.match(value) is not None

    def _is_i18n_call(self, value: str, exclude_functions: Set[str]) -> bool:
        """Check if value looks like a call to one of ``exclude_functions``."""
        match = self.I18N_CALL_PATTERN.match(value)
        return match is not None and match.group(1) in exclude_functions

    def _is_numeric(self, value: str) -> bool:
        """Check if value is numeric."""
        # float() also accepts words such as "nan", "inf" and "Infinity";
        # those are text, so require at least one digit.
        if not any(ch.isdigit() for ch in value):
            return False
        try:
            float(value)
        except ValueError:
            return False
        return True

    def _is_single_char(self, value: str) -> bool:
        """Check if value is a single character."""
        return len(value) == 1

    def _generate_key(self, literal: StringLiteral) -> str:
        """Generate a suggested translation key.

        The key is ``<file stem>_<slug>``, where the slug is the lower-cased
        literal with every run of characters outside ``a-z0-9`` replaced by
        one underscore. When the slug is empty only the file stem is used.
        The result is capped at 100 characters.
        """
        slug = re.sub(r"[^a-z0-9]+", "_", literal.value.lower()).strip("_")
        path_prefix = Path(literal.file_path).stem

        # Skip empty parts so the key never starts or ends with "_" merely
        # because the literal contained no ASCII letters or digits.
        suggested = "_".join(part for part in (path_prefix, slug) if part)
        if len(suggested) > 100:
            suggested = suggested[:100].rstrip("_")

        return suggested