Files
regex-humanizer/regex_humanizer/converter/english_to_regex.py
7000pctAUTO c483efde18
Some checks failed
CI / build (push) Has been cancelled
CI / test (push) Has been cancelled
Add converter, examples, and flavors modules
2026-02-02 06:26:48 +00:00

195 lines
5.9 KiB
Python

import re
from typing import Dict, List, Optional, Tuple
from ..parser import parse_regex
def _template(builder, *phrasings):
    """Bundle a builder callable with the English phrasings that trigger it."""
    return {"patterns": list(phrasings), "builder": builder}


def _fixed_builder(token):
    """Return a builder that ignores the match and always yields *token*."""
    return lambda m: token


# Catalogue of recognised English phrasings, keyed by construct name.
# Each entry holds:
#   "patterns" - regex sources (as strings) describing the English wording
#   "builder"  - callable taking the match object and returning the regex
#                fragment for that construct
PATTERN_TEMPLATES = {
    # Quoted text is emitted escaped so it is matched literally.
    "literal": _template(
        lambda m: re.escape(m.group(1)),
        r"the letter\s+'([^']+)'",
        r"the word\s+'([^']+)'",
        r"the text\s+'([^']+)'",
        r"'([^']+)'",
        r"the string\s+'([^']+)'",
    ),
    # Character classes and their negations.
    "digit": _template(
        _fixed_builder(r"\d"),
        r"a\s+digit",
        r"any\s+digit",
        r"digits?",
    ),
    "non_digit": _template(
        _fixed_builder(r"\D"),
        r"a\s+non-?digit",
        r"any\s+non-?digit",
        r"non-?digits?",
    ),
    "word_char": _template(
        _fixed_builder(r"\w"),
        r"a\s+word\s+character",
        r"any\s+word\s+character",
        r"word\s+characters?",
    ),
    "non_word_char": _template(
        _fixed_builder(r"\W"),
        r"a\s+non-?word\s+character",
        r"any\s+non-?word\s+character",
        r"non-?word\s+characters?",
    ),
    "whitespace": _template(
        _fixed_builder(r"\s"),
        r"a\s+whitespace",
        r"any\s+whitespace",
        r"whitespace",
        r"spaces?",
    ),
    "non_whitespace": _template(
        _fixed_builder(r"\S"),
        r"a\s+non-?whitespace",
        r"any\s+non-?whitespace",
        r"non-?whitespace",
    ),
    "any_char": _template(
        _fixed_builder("."),
        r"any\s+character",
        r"any\s+single\s+character",
        r"\.|\.any",
    ),
    # Anchors and boundaries.
    "start": _template(
        _fixed_builder("^"),
        r"the\s+start\s+of\s+the\s+string",
        r"beginning",
    ),
    "end": _template(
        _fixed_builder("$"),
        r"the\s+end\s+of\s+the\s+string",
        r"end\s+of\s+string",
    ),
    "word_boundary": _template(
        _fixed_builder(r"\b"),
        r"a\s+word\s+boundary",
        r"word\s+boundary",
    ),
    "non_word_boundary": _template(
        _fixed_builder(r"\B"),
        r"a\s+non-?word\s+boundary",
        r"non-?word\s+boundary",
    ),
}
def parse_english(description: str) -> str:
    """Normalise the whitespace of an English description.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.

    Args:
        description: The raw description text.

    Returns:
        The description with normalised whitespace.
    """
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", description)
    return collapsed.strip()
def _constant(token):
    """Return a replacement callable that always yields *token*.

    A callable is used rather than a replacement string so that ``re.sub``
    inserts the token verbatim instead of interpreting its backslashes.
    """
    return lambda m: token


# Optional leading article shared by the character-class phrasings.
_ARTICLE = r"(?:(?:a|any)\s+)?"

# A quoted literal, optionally introduced by "the letter/word/string/text".
# The look-behind stops "... of the string 'x'" from swallowing the words
# "the string", which belong to the anchor phrase and not to the literal.
_QUOTED_LITERAL = re.compile(
    r"(?:(?<!of\s)\bthe\s+(?:letter|word|string|text)\s+)?'([^']+)'",
    re.IGNORECASE,
)

# Stashed literals are represented as "\x00<index>\x01" while the keyword
# rules run; neither control character is matched by \s or \w.
_LITERAL_PLACEHOLDER = re.compile(r"\x00(\d+)\x01")

# Keyword rules, applied in order to the lower-cased description. Order
# matters: more specific phrasings ("non-word character", "non-capturing
# group containing") must run before the generic ones they contain.
_ENGLISH_RULES = [
    (re.compile(source), build)
    for source, build in (
        # Quantifiers: each wraps everything that follows it on the line.
        (
            r"\bexactly\s+(\d+)\s+times?\s+(.+)",
            lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}",
        ),
        (r"\bzero\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})*"),
        (r"\bone\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})+"),
        (r"\boptionally\s+(.+)", lambda m: f"(?:{m.group(1)})?"),
        (
            r"\bat\s+least\s+(\d+)\s+(.+)",
            lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}",
        ),
        (
            r"\bbetween\s+(\d+)\s+and\s+(\d+)\s+(.+)",
            lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}",
        ),
        # Any single character.
        (r"\bany\s+(?:single\s+)?character\b", _constant(".")),
        # Negated classes first, so "non-digit" is not read as "digit".
        (r"\b" + _ARTICLE + r"non-?digits?\b", _constant(r"\D")),
        (r"\b" + _ARTICLE + r"non-?word\s+characters?\b", _constant(r"\W")),
        (r"\b" + _ARTICLE + r"non-?word\s+boundary\b", _constant(r"\B")),
        (r"\b" + _ARTICLE + r"non-?whitespace\b", _constant(r"\S")),
        # Positive classes, with the optional article consumed.
        (r"\b(?:(?:a|any)\s+digits?|digits)\b", _constant(r"\d")),
        (r"\b" + _ARTICLE + r"word\s+characters?\b", _constant(r"\w")),
        (r"\b" + _ARTICLE + r"word\s+boundary\b", _constant(r"\b")),
        (r"\b" + _ARTICLE + r"(?:whitespace|spaces?)\b", _constant(r"\s")),
        # Anchors.
        (r"\b(?:the\s+)?start\s+of\s+(?:the\s+)?string\b", _constant("^")),
        (r"\bbeginning\b", _constant("^")),
        (r"\b(?:the\s+)?end\s+of\s+(?:the\s+)?string\b", _constant("$")),
        # Groups: the non-capturing form must be tried before the plain one.
        (
            r"\bnon-?capturing\s+group\s+containing\s+(.+?)(?:\s+(?:or|and)|$)",
            lambda m: f"(?:{m.group(1)})",
        ),
        (
            r"\bgroup\s+containing\s+(.+?)(?:\s+(?:or|and)|$)",
            lambda m: f"({m.group(1)})",
        ),
        # NOTE(review): this emits an opening named group with no closing
        # parenthesis; kept as-is because the intended closing rule is not
        # visible here - confirm the desired behaviour.
        (r"\b(?P<name>\w+)\s+group\b", lambda m: f"(?P<{m.group('name')}>"),
    )
]


def english_to_regex(description: str, flavor: str = "pcre") -> Tuple[str, List[str]]:
    """Translate an English description into a regular-expression string.

    Quoted literals (``'text'``, optionally introduced by "the letter",
    "the word", "the string" or "the text") are escaped and set aside first,
    so their case and inner whitespace are preserved and they are never
    re-interpreted as keywords. The remaining text is lower-cased, the
    keyword rules are applied in order, all remaining whitespace is removed,
    and the literals are put back.

    Args:
        description: The English description to convert.
        flavor: Target regex flavor. Accepted for interface compatibility;
            it does not currently influence the output.

    Returns:
        A ``(pattern, warnings)`` tuple. ``warnings`` is currently always an
        empty list.
    """
    warnings: List[str] = []
    literals: List[str] = []

    def stash_literal(match):
        # Swap the literal for an index placeholder the keyword rules ignore.
        literals.append(re.escape(match.group(1)))
        return f"\x00{len(literals) - 1}\x01"

    # Drop the placeholder delimiters from the input so user text can never
    # be mistaken for a stashed literal.
    text = re.sub(r"[\x00\x01]", "", description)
    text = _QUOTED_LITERAL.sub(stash_literal, text)

    # Keywords are matched case-insensitively by lower-casing everything that
    # is not a quoted literal.
    text = text.lower()
    for regex, build in _ENGLISH_RULES:
        text = regex.sub(build, text)

    # Whitespace between recognised pieces carries no meaning in the output.
    text = re.sub(r"\s+", "", text)

    # Restore the escaped literals verbatim.
    text = _LITERAL_PLACEHOLDER.sub(lambda m: literals[int(m.group(1))], text)
    return text, warnings
def validate_roundtrip(original: str, converted: str) -> Tuple[bool, Optional[str]]:
    """Check that a generated pattern is accepted by ``parse_regex``.

    Args:
        original: Not used by the check; only ``converted`` is parsed.
        converted: The regex string to parse.

    Returns:
        ``(True, None)`` when ``parse_regex`` accepts ``converted``, otherwise
        ``(False, message)`` where ``message`` is the text of the exception
        that was raised.
    """
    try:
        parse_regex(converted)
    except Exception as exc:
        # Any parser failure is reported to the caller rather than raised.
        return False, str(exc)
    return True, None
def convert_english_to_regex(description: str, flavor: str = "pcre", validate: bool = True) -> Dict:
    """Convert an English description to a regex and package the outcome.

    Args:
        description: The English description to convert.
        flavor: Regex flavor passed through to ``english_to_regex`` and
            echoed in the result.
        validate: When true, the generated pattern is checked with
            ``validate_roundtrip`` and the outcome is added to the result.

    Returns:
        A dict with the keys ``"input"``, ``"output"``, ``"flavor"`` and
        ``"warnings"``. With ``validate`` enabled it also has ``"valid"``,
        plus ``"error"`` when validation failed.
    """
    pattern, warnings = english_to_regex(description, flavor)
    result = dict(
        input=description,
        output=pattern,
        flavor=flavor,
        warnings=warnings,
    )
    if not validate:
        return result

    # NOTE(review): the generated pattern is passed for both arguments, not
    # the original description as the first one.
    is_valid, error = validate_roundtrip(pattern, pattern)
    result["valid"] = is_valid
    if not is_valid:
        result["error"] = error
    return result