"""Translate simple English descriptions into regular expression patterns.

The conversion is a fixed sequence of textual rewrite rules; it is not a
grammar.  Quoted literals are protected first, the remaining text is
lower-cased and rewritten rule by rule, whitespace is removed, and finally
the protected literals are put back.
"""

import re
from typing import Dict, List, Optional, Tuple


# Phrase templates grouped by the regex construct they describe.  Each entry
# maps a construct name to the English phrasings ("patterns", themselves
# regexes) and a "builder" that turns a match into the regex fragment.
PATTERN_TEMPLATES = {
    "literal": {
        "patterns": [
            r"the letter\s+'([^']+)'",
            r"the word\s+'([^']+)'",
            r"the text\s+'([^']+)'",
            r"'([^']+)'",
            r"the string\s+'([^']+)'",
        ],
        "builder": lambda m: re.escape(m.group(1)),
    },
    "digit": {
        "patterns": [
            r"a\s+digit",
            r"any\s+digit",
            r"digits?",
        ],
        "builder": lambda m: r"\d",
    },
    "non_digit": {
        "patterns": [
            r"a\s+non-?digit",
            r"any\s+non-?digit",
            r"non-?digits?",
        ],
        "builder": lambda m: r"\D",
    },
    "word_char": {
        "patterns": [
            r"a\s+word\s+character",
            r"any\s+word\s+character",
            r"word\s+characters?",
        ],
        "builder": lambda m: r"\w",
    },
    "non_word_char": {
        "patterns": [
            r"a\s+non-?word\s+character",
            r"any\s+non-?word\s+character",
            r"non-?word\s+characters?",
        ],
        "builder": lambda m: r"\W",
    },
    "whitespace": {
        "patterns": [
            r"a\s+whitespace",
            r"any\s+whitespace",
            r"whitespace",
            r"spaces?",
        ],
        "builder": lambda m: r"\s",
    },
    "non_whitespace": {
        "patterns": [
            r"a\s+non-?whitespace",
            r"any\s+non-?whitespace",
            r"non-?whitespace",
        ],
        "builder": lambda m: r"\S",
    },
    "any_char": {
        "patterns": [
            r"any\s+character",
            r"any\s+single\s+character",
            r"\.|\.any",
        ],
        "builder": lambda m: ".",
    },
    "start": {
        "patterns": [
            r"the\s+start\s+of\s+the\s+string",
            r"beginning",
        ],
        "builder": lambda m: "^",
    },
    "end": {
        "patterns": [
            r"the\s+end\s+of\s+the\s+string",
            r"end\s+of\s+string",
        ],
        "builder": lambda m: "$",
    },
    "word_boundary": {
        "patterns": [
            r"a\s+word\s+boundary",
            r"word\s+boundary",
        ],
        "builder": lambda m: r"\b",
    },
    "non_word_boundary": {
        "patterns": [
            r"a\s+non-?word\s+boundary",
            r"non-?word\s+boundary",
        ],
        "builder": lambda m: r"\B",
    },
}


def parse_english(description: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    return re.sub(r"\s+", " ", description).strip()


def _emit(token: str):
    """Return a replacer that ignores the match and always yields *token*.

    Using a function (not a replacement string) keeps ``re.sub`` from
    interpreting the backslash in tokens such as ``\\d``.
    """
    return lambda _match: token


# A quoted literal, optionally introduced by "the letter/word/string/text".
_QUOTED_LITERAL = re.compile(
    r"(?:\bthe\s+(?:letter|word|string|text)\s+)?'([^']+)'",
    re.IGNORECASE,
)

# Stand-in for a protected literal: NUL, index into the literal list, NUL.
# NUL is neither whitespace nor a word character, so no rule below and no
# whitespace stripping can touch it.
_PLACEHOLDER = re.compile(r"\x00(\d+)\x00")

# Optional leading article accepted in front of a character-class phrase.
_OPTIONAL_ARTICLE = r"(?:(?:a|any)\s+)?"

# Ordered rewrite rules.  Order matters: every rule sees the output of the
# rules before it, so negated/specific phrasings must precede the general
# phrasing they contain ("non-word boundary" before "word boundary").
_RULE_SPECS = [
    # Quantifiers wrap everything that follows them on the line.
    (r"\bexactly\s+(\d+)\s+times?\s+(.+)",
     lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}"),
    (r"\bzero\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})*"),
    (r"\bone\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})+"),
    (r"\boptionally\s+(.+)", lambda m: f"(?:{m.group(1)})?"),
    (r"\bat\s+least\s+(\d+)\s+(.+)",
     lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}"),
    (r"\bbetween\s+(\d+)\s+and\s+(\d+)\s+(.+)",
     lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}"),
    # Boundaries and character classes, negated form first.
    (rf"\b{_OPTIONAL_ARTICLE}non-?word\s+boundary\b", _emit(r"\B")),
    (rf"\b{_OPTIONAL_ARTICLE}word\s+boundary\b", _emit(r"\b")),
    (rf"\b{_OPTIONAL_ARTICLE}non-?word\s+characters?\b", _emit(r"\W")),
    (rf"\b{_OPTIONAL_ARTICLE}word\s+characters?\b", _emit(r"\w")),
    (rf"\b{_OPTIONAL_ARTICLE}non-?digits?\b", _emit(r"\D")),
    (rf"\b{_OPTIONAL_ARTICLE}digits?\b", _emit(r"\d")),
    (rf"\b{_OPTIONAL_ARTICLE}non-?whitespace\b", _emit(r"\S")),
    (rf"\b{_OPTIONAL_ARTICLE}(?:whitespace|spaces?)\b", _emit(r"\s")),
    (r"\bany\s+(?:single\s+)?character\b", _emit(".")),
    # Anchors.
    (r"\b(?:the\s+)?start\s+of\s+(?:the\s+)?string\b", _emit("^")),
    (r"\b(?:the\s+)?beginning\b", _emit("^")),
    (r"\b(?:the\s+)?end\s+of\s+(?:the\s+)?string\b", _emit("$")),
    # Groups.  The non-capturing rule must run first, otherwise the plain
    # "group containing" rule eats its tail and leaves "non-capturing" behind.
    (r"\bnon-?capturing\s+group\s+containing\s+(.+?)(?:\s+(?:or|and)|$)",
     lambda m: f"(?:{m.group(1)})"),
    (r"\bgroup\s+containing\s+(.+?)(?:\s+(?:or|and)|$)",
     lambda m: f"({m.group(1)})"),
    # "<name> group" opens a named group; nothing here closes it.
    (r"\b(?P<name>\w+)\s+group\b", lambda m: f"(?P<{m.group('name')}>"),
]

# Compiled once at import time instead of on every call.
_REWRITE_RULES = [
    (re.compile(rule, re.IGNORECASE), replacer) for rule, replacer in _RULE_SPECS
]


def english_to_regex(description: str, flavor: str = "pcre") -> Tuple[str, List[str]]:
    """Rewrite an English description into a regex pattern.

    Single-quoted text is treated as a literal: it is escaped with
    ``re.escape`` and keeps its case and inner whitespace.  All other text
    is lower-cased, passed through the ordered rewrite rules, and has its
    whitespace removed.  Text that no rule recognises (including character
    classes such as ``[^a-z]`` typed directly) passes through unchanged.

    Args:
        description: The English description to convert.
        flavor: Accepted for interface compatibility; the rewrite rules do
            not consult it.

    Returns:
        A ``(pattern, warnings)`` tuple.  No rule currently produces a
        warning, so the list is always empty.
    """
    warnings: List[str] = []
    literals: List[str] = []

    def _stash(match):
        # Park the escaped literal so later rules, lower-casing and
        # whitespace stripping cannot alter it.
        literals.append(re.escape(match.group(1)))
        return f"\x00{len(literals) - 1}\x00"

    text = _QUOTED_LITERAL.sub(_stash, description).lower()

    for regex, replacer in _REWRITE_RULES:
        text = regex.sub(replacer, text)

    text = re.sub(r"\s+", "", text)

    def _restore(match):
        index = int(match.group(1))
        # A NUL-digit-NUL run that was already in the input is left alone.
        return literals[index] if index < len(literals) else match.group(0)

    return _PLACEHOLDER.sub(_restore, text), warnings


def validate_roundtrip(original: str, converted: str) -> Tuple[bool, Optional[str]]:
    """Check that *converted* is accepted by the package's regex parser.

    Args:
        original: Not used by the check; kept for interface compatibility.
        converted: The regex pattern to hand to ``parse_regex``.

    Returns:
        ``(True, None)`` when parsing succeeds, otherwise ``(False, message)``
        with the string form of the exception the parser raised.
    """
    # Imported here so the text-rewriting helpers above can be imported
    # without the parser; only validation needs it.
    from ..parser import parse_regex

    try:
        parse_regex(converted)
    except Exception as exc:  # parser error types are not visible from here
        return False, str(exc)
    return True, None


def convert_english_to_regex(description: str, flavor: str = "pcre", validate: bool = True) -> Dict:
    """Convert *description* and bundle the outcome into a result dict.

    Args:
        description: The English description to convert.
        flavor: Recorded in the result and forwarded to ``english_to_regex``.
        validate: When true, run ``validate_roundtrip`` on the output.

    Returns:
        A dict with keys ``input``, ``output``, ``flavor`` and ``warnings``;
        with validation enabled it also has ``valid`` and, on failure,
        ``error``.
    """
    pattern, warnings = english_to_regex(description, flavor)

    result = {
        "input": description,
        "output": pattern,
        "flavor": flavor,
        "warnings": warnings,
    }

    if validate:
        is_valid, error = validate_roundtrip(description, pattern)
        result["valid"] = is_valid
        if not is_valid:
            result["error"] = error

    return result