# Listing metadata: 289 lines, 11 KiB, Python
from .parser import (
|
|
RegexNode, NodeType, LiteralNode, CharacterClassNode,
|
|
QuantifierNode, GroupNode, RegexParser
|
|
)
|
|
|
|
|
|
class RegexTranslator:
    """Translate a parsed regex AST into a human-readable English description.

    A pattern is parsed with ``RegexParser`` and every resulting node is
    rendered by the ``_translate_*`` method registered for its
    ``node.node_type`` in a dispatch table.

    Attributes:
        flavor: Regex flavor name handed through to ``RegexParser``.
    """

    # Single-pass translation table for literal text. Using ``str.translate``
    # (instead of chained ``str.replace`` calls) guarantees that the spaces
    # inside already-inserted names such as "period " are never themselves
    # rewritten to "space ".
    _LITERAL_NAMES = str.maketrans(
        {
            "\\": "backslash ",
            ".": "period ",
            "*": "asterisk ",
            "+": "plus ",
            "?": "question mark ",
            "$": "dollar sign ",
            "^": "caret ",
            "|": "pipe ",
            "(": "left parenthesis ",
            ")": "right parenthesis ",
            "[": "left bracket ",
            "]": "right bracket ",
            "{": "left brace ",
            "}": "right brace ",
            "\t": "tab ",
            "\n": "newline ",
            "\r": "carriage return ",
            " ": "space ",
        }
    )

    def __init__(self, flavor: str = "pcre"):
        """Store the regex flavor that will be passed to the parser."""
        self.flavor = flavor

    def translate(self, pattern: str) -> str:
        """Parse ``pattern`` and return its English description."""
        ast = RegexParser(pattern, self.flavor).parse()
        return self._translate_node(ast)

    def _handler_table(self):
        """Return the node-type -> handler mapping, building it on first use.

        The table is cached on the instance so the recursive walk over the
        AST does not rebuild the same dictionary for every node.
        """
        table = getattr(self, "_handlers", None)
        if table is None:
            table = {
                NodeType.SEQUENCE: self._translate_sequence,
                NodeType.LITERAL: self._translate_literal,
                NodeType.ESCAPED_CHAR: self._translate_escaped_char,
                NodeType.DOT: self._translate_dot,
                NodeType.POSITIVE_SET: self._translate_positive_set,
                NodeType.NEGATIVE_SET: self._translate_negative_set,
                NodeType.CAPTURING_GROUP: self._translate_capturing_group,
                NodeType.NON_CAPTURING_GROUP: self._translate_non_capturing_group,
                NodeType.NAMED_GROUP: self._translate_named_group,
                NodeType.LOOKAHEAD: self._translate_lookahead,
                NodeType.NEGATIVE_LOOKAHEAD: self._translate_negative_lookahead,
                NodeType.LOOKBEHIND: self._translate_lookbehind,
                NodeType.NEGATIVE_LOOKBEHIND: self._translate_negative_lookbehind,
                NodeType.QUANTIFIER: self._translate_quantifier,
                NodeType.ANCHOR_START: self._translate_anchor_start,
                NodeType.ANCHOR_END: self._translate_anchor_end,
                NodeType.WORD_BOUNDARY: self._translate_word_boundary,
                NodeType.NON_WORD_BOUNDARY: self._translate_non_word_boundary,
                NodeType.BRANCH: self._translate_branch,
                NodeType.START_OF_STRING: self._translate_start_of_string,
                NodeType.END_OF_STRING: self._translate_end_of_string,
                NodeType.DIGIT: self._translate_digit,
                NodeType.NON_DIGIT: self._translate_non_digit,
                NodeType.WORD_CHAR: self._translate_word_char,
                NodeType.NON_WORD_CHAR: self._translate_non_word_char,
                NodeType.WHITESPACE: self._translate_whitespace,
                NodeType.NON_WHITESPACE: self._translate_non_whitespace,
                NodeType.BACKREFERENCE: self._translate_backreference,
            }
            self._handlers = table
        return table

    def _translate_node(self, node: RegexNode) -> str:
        """Translate a single node by dispatching on ``node.node_type``.

        Returns an empty string for ``None`` and ``"[<type value>]"`` for a
        node type that has no registered handler.
        """
        if node is None:
            return ""

        handler = self._handler_table().get(node.node_type)
        if handler is None:
            return f"[{node.node_type.value}]"
        return handler(node)

    def _translate_sequence(self, node: RegexNode) -> str:
        """Translate a sequence by concatenating its translated children.

        A ``BRANCH`` child with several alternatives is wrapped in
        parentheses and joined with " OR "; a branch with one alternative is
        emitted bare. The pieces are joined with no separator.
        """
        if not node.children:
            return "empty string"

        parts = []
        for child in node.children:
            if child.node_type == NodeType.BRANCH:
                options = [self._translate_node(option) for option in child.children]
                if len(options) == 1:
                    parts.append(options[0])
                else:
                    parts.append("(" + " OR ".join(options) + ")")
            else:
                parts.append(self._translate_node(child))

        return "".join(parts)

    def _translate_branch(self, node: RegexNode) -> str:
        """Translate an alternation as its options joined with " OR "."""
        if not node.children:
            return ""
        return " OR ".join(self._translate_node(child) for child in node.children)

    def _translate_literal(self, node: LiteralNode) -> str:
        """Spell out regex metacharacters and whitespace in ``node.value``.

        Every special character is replaced by its English name followed by
        a space (e.g. "." -> "period "); all other characters pass through
        unchanged. The substitution is done in one pass so inserted names
        are never re-processed.
        """
        return node.value.translate(self._LITERAL_NAMES)

    def _translate_escaped_char(self, node: LiteralNode) -> str:
        """Translate an escaped character held in ``node.value``.

        Space, tab, newline and carriage return get descriptive names; any
        other value is returned wrapped in single quotes.
        """
        named = {
            " ": "space",
            "\t": "tab character (escape sequence \\t)",
            "\n": "newline character (escape sequence \\n)",
            "\r": "carriage return (escape sequence \\r)",
        }
        value = node.value
        if value in named:
            return named[value]
        return f"'{value}'"

    def _translate_dot(self, node: RegexNode) -> str:
        """Translate a dot (any character)."""
        return "any single character"

    @staticmethod
    def _describe_set_members(node: CharacterClassNode, range_prefix: str = "") -> list:
        """List the members of a character set: ranges first, then characters.

        Each ``(start, end)`` pair in ``node.ranges`` becomes
        ``"<range_prefix><start> through <end>"``; each entry of
        ``node.characters`` becomes ``'<char>'``, except "-" which is
        spelled "hyphen".
        """
        members = [
            f"{range_prefix}{start} through {end}" for start, end in node.ranges
        ]
        members.extend(
            "hyphen" if char == "-" else f"'{char}'" for char in node.characters
        )
        return members

    def _translate_positive_set(self, node: CharacterClassNode) -> str:
        """Translate a positive character set like [a-z]."""
        parts = self._describe_set_members(node, "any character from ")

        if not parts:
            return "any character in empty set"
        if len(parts) == 1:
            return parts[0]
        return "any of: " + ", ".join(parts)

    def _translate_negative_set(self, node: CharacterClassNode) -> str:
        """Translate a negative character set like [^a-z].

        The excluded members are listed directly after "any character
        EXCEPT ". The description is built from the node itself rather than
        by slicing the positive-set text, which previously dropped
        characters (e.g. ``[^a]`` lost its only member).
        """
        parts = self._describe_set_members(node)
        if not parts:
            # Nothing is excluded, so every character matches.
            return "any character"
        return "any character EXCEPT " + ", ".join(parts)

    def _describe_group(self, node: GroupNode, label: str) -> str:
        """Return ``label`` followed by the parenthesised first child.

        Only ``node.children[0]`` is translated; a group without children
        yields ``"<label>()"``.
        """
        content = self._translate_node(node.children[0]) if node.children else ""
        return f"{label}({content})"

    def _translate_capturing_group(self, node: GroupNode) -> str:
        """Translate a capturing group."""
        return self._describe_group(node, "capturing group: ")

    def _translate_non_capturing_group(self, node: GroupNode) -> str:
        """Translate a non-capturing group."""
        return self._describe_group(node, "non-capturing group: ")

    def _translate_named_group(self, node: GroupNode) -> str:
        """Translate a named group; a falsy ``node.name`` is shown as 'unnamed'."""
        name = node.name or "unnamed"
        return self._describe_group(node, f"named group '{name}': ")

    def _translate_lookahead(self, node: GroupNode) -> str:
        """Translate a positive lookahead."""
        return self._describe_group(node, "followed by ")

    def _translate_negative_lookahead(self, node: GroupNode) -> str:
        """Translate a negative lookahead."""
        return self._describe_group(node, "NOT followed by ")

    def _translate_lookbehind(self, node: GroupNode) -> str:
        """Translate a positive lookbehind."""
        return self._describe_group(node, "preceded by ")

    def _translate_negative_lookbehind(self, node: GroupNode) -> str:
        """Translate a negative lookbehind."""
        return self._describe_group(node, "NOT preceded by ")

    def _translate_quantifier(self, node: QuantifierNode) -> str:
        """Translate a quantifier applied to ``node.children[0]``.

        The wording is chosen from ``node.min_count`` / ``node.max_count``
        (an unbounded maximum is compared against ``float('inf')``), and
        " (lazy)" / " (possessive)" markers are appended after the operand.
        A quantifier of exactly one repetition returns the operand alone.
        """
        if not node.children:
            return "[empty quantifier]"

        base = self._translate_node(node.children[0])
        suffix = (" (lazy)" if node.is_lazy else "") + (
            " (possessive)" if node.is_possessive else ""
        )

        low, high = node.min_count, node.max_count
        unbounded = float("inf")

        if low == 0 and high == 1:
            label = "optional"
        elif low == 0 and high == unbounded:
            label = "zero or more of"
        elif low == 1 and high == unbounded:
            label = "one or more of"
        elif low == high:
            if low == 1:
                # {1} is a no-op: describe the operand without any wrapper.
                return base
            label = f"exactly {low} of"
        elif high == unbounded:
            label = f"at least {low} of"
        else:
            label = f"between {low} and {high} of"

        return f"{label}: {base}{suffix}"

    def _translate_anchor_start(self, node: RegexNode) -> str:
        """Translate start anchor."""
        return "at the start of line or string"

    def _translate_anchor_end(self, node: RegexNode) -> str:
        """Translate end anchor."""
        return "at the end of line or string"

    def _translate_word_boundary(self, node: RegexNode) -> str:
        """Translate word boundary."""
        return "at a word boundary"

    def _translate_non_word_boundary(self, node: RegexNode) -> str:
        """Translate non-word boundary."""
        return "not at a word boundary"

    def _translate_start_of_string(self, node: RegexNode) -> str:
        """Translate start of string anchor."""
        return "at the start of the string"

    def _translate_end_of_string(self, node: RegexNode) -> str:
        """Translate end of string anchor."""
        return "at the end of the string"

    def _translate_digit(self, node: RegexNode) -> str:
        """Translate digit character class."""
        return "any digit (0-9)"

    def _translate_non_digit(self, node: RegexNode) -> str:
        """Translate non-digit character class."""
        return "any non-digit character"

    def _translate_word_char(self, node: RegexNode) -> str:
        """Translate word character class."""
        return "any word character (a-z, A-Z, 0-9, underscore)"

    def _translate_non_word_char(self, node: RegexNode) -> str:
        """Translate non-word character class."""
        return "any non-word character"

    def _translate_whitespace(self, node: RegexNode) -> str:
        """Translate whitespace character class."""
        return "any whitespace character (space, tab, newline, etc.)"

    def _translate_non_whitespace(self, node: RegexNode) -> str:
        """Translate non-whitespace character class."""
        return "any non-whitespace character"

    def _translate_backreference(self, node: RegexNode) -> str:
        """Translate a backreference as a backslash followed by ``node.raw``."""
        return f"same as capture group \\{node.raw}"
|
|
|
|
|
|
def translate_regex(pattern: str, flavor: str = "pcre") -> str:
    """Describe ``pattern`` in English using a one-off ``RegexTranslator``.

    Args:
        pattern: The regular expression source text.
        flavor: Regex flavor name forwarded to ``RegexTranslator``.

    Returns:
        The string produced by ``RegexTranslator.translate``.
    """
    return RegexTranslator(flavor).translate(pattern)
|