# src/code_privacy_shield/patterns.py
"""Regular-expression catalogue used to spot secrets and personal data in text."""

import logging
import re
from typing import Dict, List, Pattern, Tuple

logger = logging.getLogger(__name__)


class PatternLibrary:
    """Catalogue of detection regexes grouped by category.

    Each ``*_PATTERNS`` class attribute is a list of ``(regex_source, label)``
    tuples. Categories are compiled on first access through the matching
    ``get_*_patterns`` method and cached on the instance afterwards.

    NOTE: every pattern is compiled with ``re.IGNORECASE`` (in addition to any
    inline ``(?i)``), so upper/lower-case distinctions written in the pattern
    text, such as ``[A-Z]``, are not enforced at match time.
    """

    # Vendor-specific and generic "key = value" style credentials.
    API_KEY_PATTERNS = [
        (r"(?i)(api[_-]?key['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "API Key"),
        (r"(?i)(secret['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "Secret"),
        (r"(?i)(token['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "Token"),
        (r"(?i)(auth[_-]?token['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "Auth Token"),
        (r"sk-[a-zA-Z0-9]{20,}", "OpenAI API Key"),
        (r"sk-proj-[a-zA-Z0-9_-]{20,}", "OpenAI Project Key"),
        (r"(?i)(ghp_|gho_|ghu_|ghs_|ghr_)[a-zA-Z0-9]{36,}", "GitHub Token"),
        (r"(?i)(github[_-]?token['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{30,})", "GitHub Token"),
        # The hyphen sits last in the class so it is a literal. The previous
        # form ``[0-9A-Za-z\\-_]`` parsed as the range ``\``..``_`` and never
        # matched "-", so keys containing a hyphen were missed.
        (r"(?i)AIza[0-9A-Za-z_-]{35}", "Google API Key"),
        (r"(?i)(firebase['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "Firebase Key"),
        (r"(?i)(aws[_-]?access[_-]?key[_-]?id['\"]?\s*[:=]\s*['\"]?)([A-Z0-9]{20})", "AWS Access Key ID"),
        (r"(?i)(aws[_-]?secret[_-]?access[_-]?key['\"]?\s*[:=]\s*['\"]?)([A-Za-z0-9/+=]{40})", "AWS Secret Key"),
        (r"(?i)(slack[_-]?token['\"]?\s*[:=]\s*['\"]?)(xox[baprs]-([0-9a-zA-Z]{10,48})?)", "Slack Token"),
        (r"(?i)(stripe[_-]?key['\"]?\s*[:=]\s*['\"]?)(sk_live_[0-9a-zA-Z]{24,})", "Stripe Secret Key"),
        (r"(?i)(stripe[_-]?pub[_-]?key['\"]?\s*[:=]\s*['\"]?)(pk_live_[0-9a-zA-Z]{24,})", "Stripe Public Key"),
        (r"(?i)(sendgrid['\"]?\s*[:=]\s*['\"]?)(SG\.[a-zA-Z0-9_-]{22}\.[a-zA-Z0-9_-]{43})", "SendGrid API Key"),
        (r"(?i)(twilio['\"]?\s*[:=]\s*['\"]?)(SK[0-9a-f]{32})", "Twilio API Key"),
        (r"(?i)(twilio[_-]?auth[_-]?token['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9]{32})", "Twilio Auth Token"),
        (r"(?i)(heroku[_-]?api[_-]?key['\"]?\s*[:=]\s*['\"]?)([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})", "Heroku API Key"),
        (r"(?i)(new[_-]?relic['\"]?\s*[:=]\s*['\"]?)(NRRA-[a-zA-Z0-9]{32})", "New Relic API Key"),
        (r"(?i)(private[_-]?key['\"]?\s*[:=]\s*['\"]?)(-----BEGIN RSA PRIVATE KEY-----)", "RSA Private Key Header"),
        (r"(?i)(private[_-]?key['\"]?\s*[:=]\s*['\"]?)(-----BEGIN EC PRIVATE KEY-----)", "EC Private Key Header"),
        (r"(?i)(private[_-]?key['\"]?\s*[:=]\s*['\"]?)(-----BEGIN PRIVATE KEY-----)", "Private Key Header"),
    ]

    # Personally identifiable information and account credentials.
    PII_PATTERNS = [
        # TLD class is ``[A-Za-z]``; the former ``[A-Z|a-z]`` also accepted a
        # literal "|" as part of the top-level domain.
        (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "Email Address"),
        (r"\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b", "Phone Number"),
        (r"\b\d{3}-\d{2}-\d{4}\b", "SSN"),
        (r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b", "Credit Card"),
        (r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14})\b", "Credit Card (Visa/Mastercard)"),
        (r"\b(?:3[47][0-9]{13})\b", "Credit Card (Amex)"),
        (r"(?i)(name['\"]?\s*[:=]\s*['\"]?)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)", "Full Name"),
        (r"(?i)(first[_-]?name['\"]?\s*[:=]\s*['\"]?)([A-Z][a-z]+)", "First Name"),
        (r"(?i)(last[_-]?name['\"]?\s*[:=]\s*['\"]?)([A-Z][a-z]+)", "Last Name"),
        (r"(?i)(address['\"]?\s*[:=]\s*['\"]?)([0-9]{1,5}\s+[A-Za-z0-9\s,]+(?:\s+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Way|Place|Pl))?)", "Street Address"),
        (r"\b[0-9]{5}(?:-[0-9]{4})?\b", "ZIP Code"),
        (r"(?i)(password['\"]?\s*[:=]\s*['\"]?)([^\s'\"]{8,})", "Password"),
        (r"(?i)(username['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_.-]{3,})", "Username"),
        (r"(?i)(user[_-]?id['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{8,})", "User ID"),
    ]

    # Database connection URLs and generic connection strings.
    DATABASE_PATTERNS = [
        (r"(?i)(postgresql://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "PostgreSQL Connection"),
        (r"(?i)(postgres://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "PostgreSQL Connection (alt)"),
        (r"(?i)(mysql://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "MySQL Connection"),
        (r"(?i)(mysql://[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "MySQL Connection (no password)"),
        (r"(?i)(mongodb(\+srv)?://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+(/\w+)?)", "MongoDB Connection"),
        (r"(?i)(redis://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+(/\d+)?)", "Redis Connection"),
        (r"(?i)(rediss://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+(/\d+)?)", "Redis Connection (SSL)"),
        (r"(?i)(sqlite:///[\w/.-]+\.db)", "SQLite Connection"),
        (r"(?i)(sqlserver://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "SQL Server Connection"),
        (r"(?i)(oracle://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "Oracle Connection"),
        (r"(?i)(db2://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "DB2 Connection"),
        (r"(?i)(cockroachdb://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "CockroachDB Connection"),
        (r"(?i)(cassandra://[a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+@[a-zA-Z0-9.-]+:\d+/\w+)", "Cassandra Connection"),
        (r"(?i)(connection[_-]?string['\"]?\s*[:=]\s*['\"]?)([^'\"]+)", "Generic Connection String"),
    ]

    # Environment-variable reads, dotenv usage and shell-style assignments.
    ENV_VAR_PATTERNS = [
        (r"os\.environ(?:\[|\.get\()\s*['\"]([A-Z0-9_]+)['\"]", "Environment Variable Access"),
        (r"os\.getenv\s*\(\s*['\"]([A-Z0-9_]+)['\"]\s*\)", "Environment Variable Access (getenv)"),
        (r"dotenv\.load\(['\"]([^'\")]+)['\"]\)", "Dotenv File Load"),
        (r"from dotenv import.*['\"]([^'\")]+)['\"]", "Dotenv Import"),
        (r"(?i)(export\s+[A-Z0-9_]+=)", "Shell Export Statement"),
        (r"(?i)([A-Z0-9_]+=)", "Environment Variable Assignment"),
    ]

    # Dotted-quad IPv4 addresses, bare or assigned to a "host" key.
    IP_ADDRESS_PATTERNS = [
        (r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b", "IPv4 Address"),
        (r"(?i)(host['\"]?\s*[:=]\s*['\"]?)((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))", "Host IP Address"),
    ]

    # HTTP authorization headers and header-like token assignments.
    AUTHORIZATION_PATTERNS = [
        (r"(?i)(Bearer\s+)([a-zA-Z0-9_-]{20,})", "Bearer Token"),
        (r"(?i)(Basic\s+)([a-zA-Z0-9+/=]{20,})", "Basic Auth Header"),
        (r"(?i)(Authorization:\s*)(Bearer\s+[a-zA-Z0-9_-]{20,})", "Authorization Header"),
        (r"(?i)(auth[_-]?header['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "Auth Header Value"),
        (r"(?i)(x-api-key['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "X-API-Key Header"),
        (r"(?i)(api[_-]?token['\"]?\s*[:=]\s*['\"]?)([a-zA-Z0-9_-]{20,})", "API Token"),
    ]

    def __init__(self):
        """Create a library with an empty per-instance cache of compiled patterns."""
        # Maps a category key (e.g. "pii") to its compiled (regex, label) list.
        self._compiled_patterns: Dict[str, List[Tuple[Pattern, str]]] = {}

    def _compile_patterns(self, patterns: List[Tuple[str, str]]) -> List[Tuple[Pattern, str]]:
        """Compile ``(regex_source, label)`` pairs into ``(compiled_regex, label)`` pairs.

        Every pattern is compiled with ``re.IGNORECASE``. A pattern that fails
        to compile is skipped, so one bad entry does not disable the rest, but
        a warning is logged so the detection gap is visible.

        Args:
            patterns: Regex source strings paired with a human-readable label.

        Returns:
            The successfully compiled patterns, in their original order.
        """
        compiled = []
        for pattern, name in patterns:
            try:
                regex = re.compile(pattern, re.IGNORECASE)
            except re.error as exc:
                logger.warning("Skipping invalid %r pattern %r: %s", name, pattern, exc)
                continue
            compiled.append((regex, name))
        return compiled

    def _get_cached(self, category: str, patterns: List[Tuple[str, str]]) -> List[Tuple[Pattern, str]]:
        """Return the compiled list for *category*, compiling *patterns* on first use."""
        if category not in self._compiled_patterns:
            self._compiled_patterns[category] = self._compile_patterns(patterns)
        return self._compiled_patterns[category]

    def get_api_key_patterns(self) -> List[Tuple[Pattern, str]]:
        """Return the compiled ``API_KEY_PATTERNS`` (cached under ``"api_keys"``)."""
        return self._get_cached("api_keys", self.API_KEY_PATTERNS)

    def get_pii_patterns(self) -> List[Tuple[Pattern, str]]:
        """Return the compiled ``PII_PATTERNS`` (cached under ``"pii"``)."""
        return self._get_cached("pii", self.PII_PATTERNS)

    def get_database_patterns(self) -> List[Tuple[Pattern, str]]:
        """Return the compiled ``DATABASE_PATTERNS`` (cached under ``"database"``)."""
        return self._get_cached("database", self.DATABASE_PATTERNS)

    def get_env_var_patterns(self) -> List[Tuple[Pattern, str]]:
        """Return the compiled ``ENV_VAR_PATTERNS`` (cached under ``"env_var"``)."""
        return self._get_cached("env_var", self.ENV_VAR_PATTERNS)

    def get_ip_patterns(self) -> List[Tuple[Pattern, str]]:
        """Return the compiled ``IP_ADDRESS_PATTERNS`` (cached under ``"ip"``)."""
        return self._get_cached("ip", self.IP_ADDRESS_PATTERNS)

    def get_authorization_patterns(self) -> List[Tuple[Pattern, str]]:
        """Return the compiled ``AUTHORIZATION_PATTERNS`` (cached under ``"authorization"``)."""
        return self._get_cached("authorization", self.AUTHORIZATION_PATTERNS)

    def get_all_patterns(self) -> Dict[str, List[Tuple[Pattern, str]]]:
        """Return every category's compiled patterns, keyed by category name."""
        return {
            "api_keys": self.get_api_key_patterns(),
            "pii": self.get_pii_patterns(),
            "database": self.get_database_patterns(),
            "env_var": self.get_env_var_patterns(),
            "ip": self.get_ip_patterns(),
            "authorization": self.get_authorization_patterns(),
        }