315 lines
9.3 KiB
Python
315 lines
9.3 KiB
Python
"""Bidirectional conversion from English descriptions to regex patterns."""
|
|
|
|
import re
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from ..parser import parse_regex
|
|
|
|
|
|
# Lookup table of English phrasings and the regex fragments they stand for.
#
# Each entry maps a template name to a dict with two keys:
#   "patterns": regex source strings (not compiled), one per English phrasing.
#   "builder":  callable taking the match object produced by one of those
#               patterns and returning the regex fragment as a string.
#
# Builders for the quantifier, group and alternation entries interpolate the
# captured operand text verbatim; only the "literal" builder escapes it.
#
# NOTE(review): how this table is consumed (search vs. match, iteration
# order) is decided by the caller, and several general entries can match
# inside more specific phrasings (e.g. "digit" inside "non-digit") — confirm
# the consumer's strategy before relying on entry order.
PATTERN_TEMPLATES = {
    "literal": {
        "patterns": [
            r"the letter\s+'([^']+)'",
            r"the word\s+'([^']+)'",
            r"the text\s+'([^']+)'",
            r"the string\s+'([^']+)'",
            # The bare quoted form goes last so that it cannot pre-empt the
            # prefixed phrasings above when patterns are tried in order.
            r"'([^']+)'",
        ],
        "builder": lambda m: re.escape(m.group(1)),
    },
    "digit": {
        "patterns": [
            r"a\s+digit",
            r"any\s+digit",
            r"digits?",
        ],
        "builder": lambda m: r"\d",
    },
    "non_digit": {
        "patterns": [
            r"a\s+non-?digit",
            r"any\s+non-?digit",
            r"non-?digits?",
        ],
        "builder": lambda m: r"\D",
    },
    "word_char": {
        "patterns": [
            r"a\s+word\s+character",
            r"any\s+word\s+character",
            r"word\s+characters?",
        ],
        "builder": lambda m: r"\w",
    },
    "non_word_char": {
        "patterns": [
            r"a\s+non-?word\s+character",
            r"any\s+non-?word\s+character",
            r"non-?word\s+characters?",
        ],
        "builder": lambda m: r"\W",
    },
    "whitespace": {
        "patterns": [
            r"a\s+whitespace",
            r"any\s+whitespace",
            r"whitespace",
            r"spaces?",
        ],
        "builder": lambda m: r"\s",
    },
    "non_whitespace": {
        "patterns": [
            r"a\s+non-?whitespace",
            r"any\s+non-?whitespace",
            r"non-?whitespace",
        ],
        "builder": lambda m: r"\S",
    },
    "any_char": {
        "patterns": [
            r"any\s+character",
            r"any\s+single\s+character",
            r"\.|\.any",
        ],
        "builder": lambda m: ".",
    },
    "start": {
        "patterns": [
            r"the\s+start\s+of\s+the\s+string",
            r"beginning",
        ],
        "builder": lambda m: "^",
    },
    "end": {
        "patterns": [
            r"the\s+end\s+of\s+the\s+string",
            r"end\s+of\s+string",
        ],
        "builder": lambda m: "$",
    },
    "word_boundary": {
        "patterns": [
            r"a\s+word\s+boundary",
            r"word\s+boundary",
        ],
        "builder": lambda m: r"\b",
    },
    "non_word_boundary": {
        "patterns": [
            r"a\s+non-?word\s+boundary",
            r"non-?word\s+boundary",
        ],
        "builder": lambda m: r"\B",
    },
    "character_class_any": {
        "patterns": [
            # "character"/"in" is a non-capturing alternative: the builder
            # reads groups 1 and 2 as the two range endpoints, and both
            # patterns must number their groups the same way.
            r"any\s+(?:of\s+)?(?:character|in)\s+([a-zA-Z])[-–—]([a-zA-Z])",
            r"(?:characters?|in)\s+range\s+([a-zA-Z])[-–—]([a-zA-Z])",
        ],
        "builder": lambda m: f"[{m.group(1)}-{m.group(2)}]",
    },
    "character_class_specific": {
        "patterns": [
            r"any\s+(?:of\s+)?['\"]?([a-zA-Z0-9])['\"]?",
        ],
        "builder": lambda m: f"[{m.group(1)}]",
    },
    "optional": {
        "patterns": [
            r"(?:optionally|optional|zero\s+or\s+one)\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(1)})?",
    },
    "zero_or_more": {
        "patterns": [
            r"(?:zero\s+or\s+more|star|asterisk)\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(1)})*",
    },
    "one_or_more": {
        "patterns": [
            r"(?:one\s+or\s+more|plus)\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(1)})+",
    },
    # In the three counted quantifiers below, the optional "times" carries
    # its own leading whitespace, so "exactly 3 digits" and
    # "exactly 3 times digits" both match with a single space between words.
    "exactly": {
        "patterns": [
            r"exactly\s+(\d+)(?:\s+times?)?\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}",
    },
    "between": {
        "patterns": [
            r"between\s+(\d+)\s+and\s+(\d+)(?:\s+times?)?\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}",
    },
    "at_least": {
        "patterns": [
            r"at\s+least\s+(\d+)(?:\s+times?)?\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}",
    },
    "group": {
        "patterns": [
            r"(?:a\s+)?(?:capturing\s+)?group\s+(?:containing|with)\s+(.*)",
        ],
        "builder": lambda m: f"({m.group(1)})",
    },
    "non_capturing_group": {
        "patterns": [
            r"(?:a\s+)?non-?capturing\s+group\s+(?:containing|with)\s+(.*)",
        ],
        "builder": lambda m: f"(?:{m.group(1)})",
    },
    "named_group": {
        "patterns": [
            r"(?:a\s+)?(?:named\s+)?group\s+(?:named|called)\s+'([^']+)'\s+(?:containing|with)\s+(.*)",
        ],
        "builder": lambda m: f"(?P<{m.group(1)}>{m.group(2)})",
    },
    "or": {
        "patterns": [
            r"(.*?)\s+or\s+(.*)",
        ],
        "builder": lambda m: f"{m.group(1)}|{m.group(2)}",
    },
    "alternation": {
        "patterns": [
            r"(?:either\s+)?(.+?)\s+(?:or|\/\/)\s+(.+)",
        ],
        "builder": lambda m: f"{m.group(1)}|{m.group(2)}",
    },
}
|
|
|
|
|
|
def parse_english(description: str) -> str:
    """Normalise the whitespace of an English pattern description.

    Every run of whitespace is collapsed to a single space and the ends are
    trimmed. No regex conversion is performed here; the text is otherwise
    returned unchanged.

    Args:
        description: The English description of the pattern.

    Returns:
        The description with whitespace normalised.
    """
    collapsed = re.sub(r"\s+", " ", description)
    return collapsed.strip()
|
|
|
|
|
|
# A quoted literal, optionally introduced by "the letter/word/string/text".
_LITERAL_RE = re.compile(
    r"(?:\bthe\s+(?:letter|word|string|text)\s+)?'([^']+)'",
    re.IGNORECASE,
)

# Shared fragments of the keyword rules below.
_ARTICLE = r"(?:(?:a|any)\s+)?"
_TIMES = r"(?:\s+times?)?"
# A group's operand runs lazily up to the next "and"/"or" or the end of text.
_GROUP_TAIL = r"\s+containing\s+(.+?)(?:\s+(and|or)\b|$)"

_UNCLOSED_GROUP_WARNING = (
    "'<name> group' only opens a named group; "
    "the output contains no matching ')' for it"
)


def _close_group(opener: str):
    """Return a replacer that wraps a group rule's operand in *opener*...')'."""

    def replace(match):
        # A trailing "and" means plain concatenation and is dropped; a
        # trailing "or" has to survive as an alternation bar.
        connective = (match.group(2) or "").lower()
        bar = "|" if connective == "or" else ""
        return f"{opener}{match.group(1)}){bar}"

    return replace


# Ordered rewrite rules: (compiled phrase, replacer, optional warning).
# Order matters: quantifiers wrap the rest of the text first, and every
# negated phrase is listed before its positive counterpart so that
# "non-word character" is not rewritten as "non-" + "\w".
_KEYWORD_RULES = tuple(
    (re.compile(phrase, re.IGNORECASE), replacer, warning)
    for phrase, replacer, warning in (
        # Quantifiers (operand = remainder of the description).
        (rf"\bexactly\s+(\d+){_TIMES}\s+(.+)",
         lambda m: f"(?:{m.group(2)}){{{m.group(1)}}}", None),
        (r"\bzero\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})*", None),
        (r"\bone\s+or\s+more\s+(.+)", lambda m: f"(?:{m.group(1)})+", None),
        (r"\boptionally\s+(.+)", lambda m: f"(?:{m.group(1)})?", None),
        (rf"\bat\s+least\s+(\d+){_TIMES}\s+(.+)",
         lambda m: f"(?:{m.group(2)}){{{m.group(1)},}}", None),
        (rf"\bbetween\s+(\d+)\s+and\s+(\d+){_TIMES}\s+(.+)",
         lambda m: f"(?:{m.group(3)}){{{m.group(1)},{m.group(2)}}}", None),
        # Atoms, negated form first.
        (r"\b(?:a\s+)?non-?word\s+boundary\b", lambda m: r"\B", None),
        (r"\b(?:a\s+)?word\s+boundary\b", lambda m: r"\b", None),
        (rf"\b{_ARTICLE}non-?word\s+characters?\b", lambda m: r"\W", None),
        (rf"\b{_ARTICLE}word\s+characters?\b", lambda m: r"\w", None),
        (rf"\b{_ARTICLE}non-?whitespace\b", lambda m: r"\S", None),
        (rf"\b{_ARTICLE}(?:whitespace|spaces?)\b", lambda m: r"\s", None),
        (rf"\b{_ARTICLE}non-?digits?\b", lambda m: r"\D", None),
        (rf"\b{_ARTICLE}digits?\b", lambda m: r"\d", None),
        (r"\bany\s+(?:single\s+)?character\b", lambda m: ".", None),
        # Anchors.
        (r"\b(?:the\s+)?(?:start\s+of\s+(?:the\s+)?string|beginning)\b",
         lambda m: "^", None),
        (r"\b(?:the\s+)?end\s+of\s+(?:the\s+)?string\b", lambda m: "$", None),
        # Groups, non-capturing before capturing for the same reason as above.
        (rf"\b(?:a\s+)?non-?capturing\s+group{_GROUP_TAIL}",
         _close_group("(?:"), None),
        (rf"\b(?:a\s+)?(?:capturing\s+)?group{_GROUP_TAIL}",
         _close_group("("), None),
        # "<name> group" emits only the opening of a named group; callers are
        # told through the warnings list.
        (r"\b(?P<name>\w+)\s+group\b",
         lambda m: f"(?P<{m.group('name')}>", _UNCLOSED_GROUP_WARNING),
    )
)


def english_to_regex(description: str, flavor: str = "pcre") -> Tuple[str, List[str]]:
    """Convert an English description to a regex pattern.

    The conversion is a sequence of textual rewrites, not a grammar:

    1. Quoted literals (``'...'``, optionally preceded by "the letter",
       "the word", "the string" or "the text") are escaped with
       ``re.escape`` and set aside, so their case and spacing survive the
       later steps and keywords inside them are not rewritten.
    2. The remaining text is lower-cased and each keyword rule is applied in
       order (quantifiers, character classes, anchors, groups).
    3. All remaining whitespace is removed and the literals are put back.

    Text that no rule recognises is passed through (lower-cased, without
    whitespace). Because rules are applied one after another, a group phrase
    nested inside a quantifier phrase may not nest correctly in the output.

    Args:
        description: The English description of the pattern.
        flavor: The target regex flavor. Accepted for interface
            compatibility; the rewrite rules do not consult it.

    Returns:
        A tuple of (regex_pattern, warnings). ``warnings`` holds one message
        per rule that produced output known to be incomplete.
    """
    warnings: List[str] = []
    literals: List[str] = []

    # Placeholder delimiter: the first private-use code point that does not
    # occur in the input, so a placeholder can never collide with user text.
    # It is neither whitespace nor a word character, so no rule touches it.
    marker = next(
        ch for ch in map(chr, range(0xE000, 0xF900)) if ch not in description
    )

    def stash(match):
        literals.append(re.escape(match.group(1)))
        return f"{marker}{len(literals) - 1}{marker}"

    text = _LITERAL_RE.sub(stash, description).lower()

    for regex, replacer, warning in _KEYWORD_RULES:
        text, hits = regex.subn(replacer, text)
        if hits and warning:
            warnings.append(warning)

    # Words were only separators; escaped literals are restored afterwards so
    # their "\ " sequences are not damaged by this step.
    text = re.sub(r"\s+", "", text)

    if literals:
        text = re.sub(
            f"{marker}(\\d+){marker}",
            lambda m: literals[int(m.group(1))],
            text,
        )

    return text, warnings
|
|
|
|
|
|
def validate_roundtrip(original: str, converted: str) -> Tuple[bool, Optional[str]]:
    """Check that a converted pattern is accepted by ``parse_regex``.

    Only ``converted`` is examined; ``original`` is accepted but not read.

    Args:
        original: The original regex pattern.
        converted: The pattern converted from English.

    Returns:
        A tuple of (is_valid, error_message). ``error_message`` is ``None``
        when parsing succeeds, otherwise the text of the exception raised by
        the parser.
    """
    failure: Optional[str] = None
    try:
        parse_regex(converted)
    except Exception as exc:
        # Any parser failure is reported to the caller rather than raised.
        failure = str(exc)
    return failure is None, failure
|
|
|
|
|
|
def convert_english_to_regex(description: str, flavor: str = "pcre", validate: bool = True) -> Dict[str, Any]:
    """Convert an English description to a regex and report the outcome.

    Args:
        description: The English description of the pattern.
        flavor: The target regex flavor, forwarded to ``english_to_regex``.
        validate: Whether to run the generated pattern through
            ``validate_roundtrip``.

    Returns:
        A dictionary with the keys ``"input"``, ``"output"``, ``"flavor"``
        and ``"warnings"``. When ``validate`` is true it also carries
        ``"valid"`` and, if validation failed, ``"error"``.
    """
    regex_text, notes = english_to_regex(description, flavor)

    report: Dict[str, Any] = {
        "input": description,
        "output": regex_text,
        "flavor": flavor,
        "warnings": notes,
    }

    if not validate:
        return report

    # The generated pattern is passed for both arguments: there is no
    # separate original regex in this direction of the conversion.
    ok, problem = validate_roundtrip(regex_text, regex_text)
    report["valid"] = ok
    if not ok:
        report["error"] = problem

    return report
|