"""String scanner for detecting hardcoded strings.""" import re from dataclasses import dataclass, field from pathlib import Path from typing import List, Optional, Set from i18n_guardian.parsers.base import StringLiteral from i18n_guardian.parsers.simple import get_default_registry @dataclass class Violation: """Represents a hardcoded string violation.""" literal: StringLiteral suggested_key: str reason: str = "Hardcoded string detected" @dataclass class ScanResult: """Result of a scan operation.""" violations: List[Violation] = field(default_factory=list) files_scanned: int = 0 strings_found: int = 0 violations_count: int = 0 class StringScanner: """Scanner for hardcoded strings in code.""" URL_PATTERN = re.compile( r"^https?://", re.IGNORECASE, ) EMAIL_PATTERN = re.compile( r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", ) REGEX_PATTERN = re.compile( r"^[/^].*[/$]$", ) I18N_CALL_PATTERN = re.compile( r"^(\w+)\s*\(", ) def __init__( self, min_length: int = 3, exclude_patterns: Optional[List[str]] = None, i18n_functions: Optional[List[str]] = None, ) -> None: self.min_length = min_length self.exclude_patterns = exclude_patterns or [] self.i18n_functions = set(i18n_functions or []) self.registry = get_default_registry() def scan(self, path: Path, i18n_functions: Optional[Set[str]] = None) -> ScanResult: """Scan a directory for hardcoded strings.""" result = ScanResult() functions_to_exclude = self.i18n_functions.copy() if i18n_functions: functions_to_exclude.update(i18n_functions) for file_path in self._iter_files(path): if not self._should_include(file_path): continue literals = self._extract_literals(file_path) result.files_scanned += 1 for literal in literals: result.strings_found += 1 if self._is_violation(literal, functions_to_exclude): suggested_key = self._generate_key(literal) result.violations.append( Violation( literal=literal, suggested_key=suggested_key, ) ) result.violations_count = len(result.violations) return result def _iter_files(self, path: Path) -> List[Path]: """Iterate over files in path.""" files = [] if path.is_file(): return [path] for ext in self.registry.list_extensions(): files.extend(path.rglob(f"*{ext}")) return files def _should_include(self, file_path: Path) -> bool: """Check if file should be included in scan.""" for pattern in self.exclude_patterns: if file_path.match(pattern): return False return True def _extract_literals(self, file_path: Path) -> List[StringLiteral]: """Extract string literals from a file.""" parser = self.registry.get(file_path) if parser: return parser.parse(file_path) return [] def _is_violation(self, literal: StringLiteral, exclude_functions: Set[str]) -> bool: """Check if a string literal is a violation.""" value = literal.value if len(value) < self.min_length: return False if self._is_url(value): return False if self._is_email(value): return False if self._is_regex(value): return False if self._is_i18n_call(value, exclude_functions): return False if self._is_numeric(value): return False if self._is_single_char(value): return False return True def _is_url(self, value: str) -> bool: """Check if value is a URL.""" return bool(self.URL_PATTERN.match(value)) def _is_email(self, value: str) -> bool: """Check if value is an email.""" return bool(self.EMAIL_PATTERN.match(value)) def _is_regex(self, value: str) -> bool: """Check if value looks like a regex pattern.""" return bool(self.REGEX_PATTERN.match(value)) def _is_i18n_call(self, value: str, exclude_functions: Set[str]) -> bool: """Check if value looks like an i18n function call.""" match = self.I18N_CALL_PATTERN.match(value) if match: func_name = match.group(1) if func_name in exclude_functions: return True return False def _is_numeric(self, value: str) -> bool: """Check if value is numeric.""" try: float(value) return True except ValueError: return False def _is_single_char(self, value: str) -> bool: """Check if value is a single character.""" return len(value) == 1 def _generate_key(self, literal: StringLiteral) -> str: """Generate a suggested translation key.""" value = literal.value.lower() key = re.sub(r"[^a-z0-9]+", "_", value) key = key.strip("_") file_path = literal.file_path relative_path = file_path.relative_to(file_path.parent) parts = relative_path.stem.split("/") path_prefix = "_".join(parts) suggested = f"{path_prefix}_{key}" if len(suggested) > 100: suggested = suggested[:100].rstrip("_") return suggested