"""Translate simple English descriptions of patterns into regex source."""

import re
from typing import Dict, List, Optional, Tuple

from ..parser import parse_regex

# Phrase templates keyed by construct name.  Each entry lists regex sources
# that recognise an English phrase and a ``builder`` that maps the resulting
# match object to the regex fragment for that construct.  The functions
# visible in this module do not read this table.
PATTERN_TEMPLATES = {
    "literal": {
        "patterns": [
            r"the letter\s+'([^']+)'",
            r"the word\s+'([^']+)'",
            r"the text\s+'([^']+)'",
            r"'([^']+)'",
            r"the string\s+'([^']+)'",
        ],
        "builder": lambda m: re.escape(m.group(1)),
    },
    "digit": {
        "patterns": [
            r"a\s+digit",
            r"any\s+digit",
            r"digits?",
        ],
        "builder": lambda m: r"\d",
    },
    "non_digit": {
        "patterns": [
            r"a\s+non-?digit",
            r"any\s+non-?digit",
            r"non-?digits?",
        ],
        "builder": lambda m: r"\D",
    },
    "word_char": {
        "patterns": [
            r"a\s+word\s+character",
            r"any\s+word\s+character",
            r"word\s+characters?",
        ],
        "builder": lambda m: r"\w",
    },
    "non_word_char": {
        "patterns": [
            r"a\s+non-?word\s+character",
            r"any\s+non-?word\s+character",
            r"non-?word\s+characters?",
        ],
        "builder": lambda m: r"\W",
    },
    "whitespace": {
        "patterns": [
            r"a\s+whitespace",
            r"any\s+whitespace",
            r"whitespace",
            r"spaces?",
        ],
        "builder": lambda m: r"\s",
    },
    "non_whitespace": {
        "patterns": [
            r"a\s+non-?whitespace",
            r"any\s+non-?whitespace",
            r"non-?whitespace",
        ],
        "builder": lambda m: r"\S",
    },
    "any_char": {
        "patterns": [
            r"any\s+character",
            r"any\s+single\s+character",
            r"\.|\.any",
        ],
        "builder": lambda m: ".",
    },
    "start": {
        "patterns": [
            r"the\s+start\s+of\s+the\s+string",
            r"beginning",
        ],
        "builder": lambda m: "^",
    },
    "end": {
        "patterns": [
            r"the\s+end\s+of\s+the\s+string",
            r"end\s+of\s+string",
        ],
        "builder": lambda m: "$",
    },
    "word_boundary": {
        "patterns": [
            r"a\s+word\s+boundary",
            r"word\s+boundary",
        ],
        "builder": lambda m: r"\b",
    },
    "non_word_boundary": {
        "patterns": [
            r"a\s+non-?word\s+boundary",
            r"non-?word\s+boundary",
        ],
        "builder": lambda m: r"\B",
    },
}


def parse_english(description: str) -> str:
    """Collapse every whitespace run in *description* to one space and trim."""
    return re.sub(r"\s+", " ", description).strip()


def _emit(token: str):
    """Return a ``re.sub`` callback that always yields *token* verbatim.

    A callable is required because a plain replacement string such as
    ``r"\\d"`` would be parsed by ``re.sub`` as a replacement template.
    """
    return lambda _match: token


# Quoted literal, optionally introduced by "the letter/word/string/text".
_LITERAL_RE = re.compile(
    r"(?:\bthe\s+(?:letter|word|string|text)\s+)?'([^']+)'",
    re.IGNORECASE,
)

# Stand-in for a stashed literal: NUL, index into the literal list, NUL.
_PLACEHOLDER_RE = re.compile(r"\x00(\d+)\x00")

# Optional leading "a"/"any", mirroring the phrasings in PATTERN_TEMPLATES.
_ARTICLE = r"(?:(?:a|any)\s+)?"

# Ordered rewrite rules.  Each rule is applied to the whole working string in
# turn, so a phrase that contains another phrase ("non-word character" vs
# "word character") must come BEFORE the phrase it contains.
_REWRITE_RULES = [
    (re.compile(source, re.IGNORECASE), build)
    for source, build in (
        # Quantifiers: wrap the rest of the text; later rules translate it.
        (r"\bexactly\s+(\d+)\s+times?\s+(.+)",
         lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}"),
        (r"\bzero\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})*"),
        (r"\bone\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})+"),
        (r"\boptionally\s+(.+)", lambda m: f"(?:{m.group(1)})?"),
        (r"\bat\s+least\s+(\d+)\s+(.+)",
         lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}"),
        (r"\bbetween\s+(\d+)\s+and\s+(\d+)\s+(.+)",
         lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}"),
        # Character classes: negated form first, then the positive form.
        (r"\bany\s+(?:single\s+)?character\b", _emit(".")),
        (r"\b" + _ARTICLE + r"non-?digits?\b", _emit(r"\D")),
        (r"\b" + _ARTICLE + r"digits?\b", _emit(r"\d")),
        (r"\b" + _ARTICLE + r"non-?word\s+characters?\b", _emit(r"\W")),
        (r"\b" + _ARTICLE + r"word\s+characters?\b", _emit(r"\w")),
        (r"\b" + _ARTICLE + r"non-?whitespace\b", _emit(r"\S")),
        (r"\b" + _ARTICLE + r"whitespace\b", _emit(r"\s")),
        (r"\b" + _ARTICLE + r"spaces?\b", _emit(r"\s")),
        # Anchors and boundaries.
        (r"\b(?:the\s+)?start\s+of\s+(?:the\s+)?string\b", _emit("^")),
        (r"\bbeginning\b", _emit("^")),
        (r"\b(?:the\s+)?end\s+of\s+(?:the\s+)?string\b", _emit("$")),
        (r"\b(?:a\s+)?non-?word\s+boundary\b", _emit(r"\B")),
        (r"\b(?:a\s+)?word\s+boundary\b", _emit(r"\b")),
        # Groups: the non-capturing phrase contains the capturing phrase, so
        # it goes first.  A trailing "or"/"and" is consumed by the match.
        (r"\b(?:a\s+)?non-?capturing\s+group\s+containing\s+(.+?)(?:\s+(?:or|and)|$)",
         lambda m: f"(?:{m.group(1)})"),
        (r"\b(?:a\s+)?group\s+containing\s+(.+?)(?:\s+(?:or|and)|$)",
         lambda m: f"({m.group(1)})"),
        # "<word> group" opens a named group called <word>.  Only the opening
        # "(?P<word>" is emitted; nothing in this module closes it.
        (r"\b(\w+)\s+group\b", lambda m: f"(?P<{m.group(1)}>"),
    )
]


def english_to_regex(description: str, flavor: str = "pcre") -> Tuple[str, List[str]]:
    """Translate an English pattern description into regex source.

    Steps, in order:

    1. Quoted literals (``'abc'``, optionally introduced by "the letter",
       "the word", "the string" or "the text") are escaped with
       ``re.escape`` and replaced by placeholders, so their case and spacing
       survive and their content is never mistaken for a phrase.
    2. The remaining text is lower-cased.
    3. Each rule in ``_REWRITE_RULES`` is substituted over the whole string.
    4. All whitespace is removed and the stashed literals are restored.

    Args:
        description: English description of the pattern.
        flavor: Target regex flavor.  Accepted for interface compatibility;
            this function does not consult it.

    Returns:
        ``(pattern, warnings)``.  ``warnings`` is currently always an empty
        list.  Text that no rule recognises is passed through (lower-cased,
        whitespace removed) rather than rejected.
    """
    warnings: List[str] = []
    literals: List[str] = []

    def stash(match):
        # Park the escaped literal and leave an index placeholder behind.
        literals.append(re.escape(match.group(1)))
        return f"\x00{len(literals) - 1}\x00"

    # Drop stray NULs first so input can never forge a placeholder.
    text = _LITERAL_RE.sub(stash, description.replace("\x00", "")).lower()

    for regex, build in _REWRITE_RULES:
        text = regex.sub(build, text)

    text = re.sub(r"\s+", "", text)
    # NOTE(review): this rewrites "[^a-z]" to "[a-z]", i.e. it drops the
    # negation from single-range classes.  Kept as found; confirm intent.
    text = re.sub(
        r"\[\^?([a-z])-([a-z])\]",
        lambda m: f"[{m.group(1)}-{m.group(2)}]",
        text,
        flags=re.IGNORECASE,
    )
    text = _PLACEHOLDER_RE.sub(lambda m: literals[int(m.group(1))], text)
    return text, warnings


def validate_roundtrip(original: str, converted: str) -> Tuple[bool, Optional[str]]:
    """Check that *converted* is accepted by ``parse_regex``.

    Args:
        original: Not used by the check; kept for the existing signature.
        converted: Regex source to hand to ``parse_regex``.

    Returns:
        ``(True, None)`` when parsing succeeds, otherwise ``(False, message)``
        where ``message`` is ``str()`` of the raised exception.
    """
    try:
        parse_regex(converted)
    except Exception as e:
        # The parser's exception types are not visible from this module, so
        # any failure is reported to the caller as "invalid".
        return False, str(e)
    return True, None


def convert_english_to_regex(description: str, flavor: str = "pcre", validate: bool = True) -> Dict:
    """Convert *description* and package the outcome as a dictionary.

    Args:
        description: English description of the pattern.
        flavor: Forwarded to ``english_to_regex`` and echoed in the result.
        validate: When true, run ``validate_roundtrip`` on the output.

    Returns:
        A dict with ``input``, ``output``, ``flavor`` and ``warnings``.  With
        ``validate`` it also has ``valid``, plus ``error`` when the output
        failed validation.
    """
    pattern, warnings = english_to_regex(description, flavor)
    result = {
        "input": description,
        "output": pattern,
        "flavor": flavor,
        "warnings": warnings,
    }
    if validate:
        is_valid, error = validate_roundtrip(description, pattern)
        result["valid"] = is_valid
        if not is_valid:
            result["error"] = error
    return result