"""Bidirectional conversion from English descriptions to regex patterns.""" import re from typing import Any, Dict, List, Optional, Tuple from ..parser import parse_regex PATTERN_TEMPLATES = { "literal": { "patterns": [ r"the letter\s+'([^']+)'", r"the word\s+'([^']+)'", r"the text\s+'([^']+)'", r"'([^']+)'", r"the string\s+'([^']+)'", ], "builder": lambda m: re.escape(m.group(1)), }, "digit": { "patterns": [ r"a\s+digit", r"any\s+digit", r"digits?", ], "builder": lambda m: r"\d", }, "non_digit": { "patterns": [ r"a\s+non-?digit", r"any\s+non-?digit", r"non-?digits?", ], "builder": lambda m: r"\D", }, "word_char": { "patterns": [ r"a\s+word\s+character", r"any\s+word\s+character", r"word\s+characters?", ], "builder": lambda m: r"\w", }, "non_word_char": { "patterns": [ r"a\s+non-?word\s+character", r"any\s+non-?word\s+character", r"non-?word\s+characters?", ], "builder": lambda m: r"\W", }, "whitespace": { "patterns": [ r"a\s+whitespace", r"any\s+whitespace", r"whitespace", r"spaces?", ], "builder": lambda m: r"\s", }, "non_whitespace": { "patterns": [ r"a\s+non-?whitespace", r"any\s+non-?whitespace", r"non-?whitespace", ], "builder": lambda m: r"\S", }, "any_char": { "patterns": [ r"any\s+character", r"any\s+single\s+character", r"\.|\.any", ], "builder": lambda m: ".", }, "start": { "patterns": [ r"the\s+start\s+of\s+the\s+string", r"beginning", ], "builder": lambda m: "^", }, "end": { "patterns": [ r"the\s+end\s+of\s+the\s+string", r"end\s+of\s+string", ], "builder": lambda m: "$", }, "word_boundary": { "patterns": [ r"a\s+word\s+boundary", r"word\s+boundary", ], "builder": lambda m: r"\b", }, "non_word_boundary": { "patterns": [ r"a\s+non-?word\s+boundary", r"non-?word\s+boundary", ], "builder": lambda m: r"\B", }, "character_class_any": { "patterns": [ r"any\s+(?:of\s+)?(character|in)\s+([a-zA-Z])[-–—]([a-zA-Z])", r"(?:characters?|in)\s+range\s+([a-zA-Z])[-–—]([a-zA-Z])", ], "builder": lambda m: f"[{m.group(1)}-{m.group(2)}]", }, "character_class_specific": { "patterns": [ r"any\s+(?:of\s+)?['\"]?([a-zA-Z0-9])['\"]?", ], "builder": lambda m: f"[{m.group(1)}]", }, "optional": { "patterns": [ r"(?:optionally|optional|zero\s+or\s+one)\s+(.*)", ], "builder": lambda m: f"(?:{m.group(1)})?", }, "zero_or_more": { "patterns": [ r"(?:zero\s+or\s+more|star|asterisk)\s+(.*)", ], "builder": lambda m: f"(?:{m.group(1)})*", }, "one_or_more": { "patterns": [ r"(?:one\s+or\s+more|plus)\s+(.*)", ], "builder": lambda m: f"(?:{m.group(1)})+", }, "exactly": { "patterns": [ r"exactly\s+(\d+)\s+(?:times?)?\s+(.*)", ], "builder": lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}", }, "between": { "patterns": [ r"between\s+(\d+)\s+and\s+(\d+)\s+(?:times?)?\s+(.*)", ], "builder": lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}", }, "at_least": { "patterns": [ r"at\s+least\s+(\d+)\s+(?:times?)?\s+(.*)", ], "builder": lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}", }, "group": { "patterns": [ r"(?:a\s+)?(?:capturing\s+)?group\s+(?:containing|with)\s+(.*)", ], "builder": lambda m: f"({m.group(1)})", }, "non_capturing_group": { "patterns": [ r"(?:a\s+)?non-?capturing\s+group\s+(?:containing|with)\s+(.*)", ], "builder": lambda m: f"(?:{m.group(1)})", }, "named_group": { "patterns": [ r"(?:a\s+)?(?:named\s+)?group\s+(?:named|called)\s+'([^']+)'\s+(?:containing|with)\s+(.*)", ], "builder": lambda m: f"(?P<{m.group(1)}>{m.group(2)})", }, "or": { "patterns": [ r"(.*?)\s+or\s+(.*)", ], "builder": lambda m: f"{m.group(1)}|{m.group(2)}", }, "alternation": { "patterns": [ r"(?:either\s+)?(.+?)\s+(?:or|\/\/)\s+(.+)", ], "builder": lambda m: f"{m.group(1)}|{m.group(2)}", }, } def parse_english(description: str) -> str: """Convert an English description to a regex pattern. Args: description: The English description of the pattern. Returns: The corresponding regex pattern. """ result = description result = re.sub(r"\s+", " ", result).strip() return result def english_to_regex(description: str, flavor: str = "pcre") -> Tuple[str, List[str]]: """Convert an English description to a regex pattern. Args: description: The English description of the pattern. flavor: The target regex flavor. Returns: A tuple of (regex_pattern, warnings). """ pattern = description.lower() warnings: List[str] = [] replacements = [] patterns = [ (r"the letter\s+'([^']+)'", lambda m: re.escape(m.group(1))), (r"the word\s+'([^']+)'", lambda m: re.escape(m.group(1))), (r"'([^']+)'", lambda m: re.escape(m.group(1))), (r"the string\s+'([^']+)'", lambda m: re.escape(m.group(1))), (r"the text\s+'([^']+)'", lambda m: re.escape(m.group(1))), (r"\bexactly\s+(\d+)\s+times?\s+(.+)", lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}"), (r"\bzero\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})*"), (r"\bone\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})+"), (r"\boptionally\s+(.+)", lambda m: f"(?:{m.group(1)})?"), (r"\bat\s+least\s+(\d+)\s+(.+)", lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}"), (r"\bbetween\s+(\d+)\s+and\s+(\d+)\s+(.+)", lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}"), (r"\bany\s+character\b", lambda m: "."), (r"\ba\s+digit\b", lambda m: r"\d"), (r"\bdigits\b", lambda m: r"\d"), (r"\bany\s+digit\b", lambda m: r"\d"), (r"\bnon-?digit\b", lambda m: r"\D"), (r"\bword\s+character\b", lambda m: r"\w"), (r"\bany\s+word\s+character\b", lambda m: r"\w"), (r"\bnon-?word\s+character\b", lambda m: r"\W"), (r"\bwhitespace\b", lambda m: r"\s"), (r"\bspaces?\b", lambda m: r"\s"), (r"\bany\s+whitespace\b", lambda m: r"\s"), (r"\bnon-?whitespace\b", lambda m: r"\S"), (r"\bstart\s+of\s+string\b", lambda m: "^"), (r"\bbeginning\b", lambda m: "^"), (r"\bend\s+of\s+string\b", lambda m: "$"), (r"\bword\s+boundary\b", lambda m: r"\b"), (r"\bnon-?word\s+boundary\b", lambda m: r"\B"), (r"\bgroup\s+containing\s+(.+?)(?:\s+(?:or|and)|$)", lambda m: f"({m.group(1)})"), (r"\bnon-?capturing\s+group\s+containing\s+(.+?)(?:\s+(?:or|and)|$)", lambda m: f"(?:{m.group(1)})"), (r"\b(?P\w+)\s+group\b", lambda m: f"(?P<{m.group('name')}>"), ] for pattern_regex, replacement in patterns: regex = re.compile(pattern_regex, re.IGNORECASE) def make_replacer(r): return lambda m: r(m) replacements.append((regex, make_replacer(replacement))) result = pattern for regex, replacer in replacements: result = regex.sub(replacer, result) result = re.sub(r"\s+", "", result) result = re.sub(r"\[^?([a-z])-([a-z])\]", lambda m: f"[{m.group(1)}-{m.group(2)}]", result, flags=re.IGNORECASE) return result, warnings def validate_roundtrip(original: str, converted: str) -> Tuple[bool, Optional[str]]: """Validate that converting from regex to English and back produces a valid pattern. Args: original: The original regex pattern. converted: The pattern converted from English. Returns: A tuple of (is_valid, error_message). """ try: parse_regex(converted) return True, None except Exception as e: return False, str(e) def convert_english_to_regex(description: str, flavor: str = "pcre", validate: bool = True) -> Dict[str, Any]: """Convert English description to regex with full context. Args: description: The English description of the pattern. flavor: The target regex flavor. validate: Whether to validate the result. Returns: A dictionary with conversion results. """ pattern, warnings = english_to_regex(description, flavor) result: Dict[str, Any] = { "input": description, "output": pattern, "flavor": flavor, "warnings": warnings, } if validate: is_valid, error = validate_roundtrip(pattern, pattern) result["valid"] = is_valid if not is_valid: result["error"] = error return result