"""Translator for converting regex AST to human-readable English.""" from typing import Optional from .parser import ( RegexNode, NodeType, LiteralNode, CharacterClassNode, QuantifierNode, GroupNode, RegexParser ) class RegexTranslator: """Translates regex AST nodes to human-readable English.""" def __init__(self, flavor: str = "pcre"): self.flavor = flavor def translate(self, pattern: str) -> str: """Translate a regex pattern to human-readable English.""" parser = RegexParser(pattern, self.flavor) ast = parser.parse() return self._translate_node(ast) def _translate_node(self, node: RegexNode) -> str: """Translate a single node.""" if node is None: return "" handlers = { NodeType.SEQUENCE: self._translate_sequence, NodeType.LITERAL: self._translate_literal, NodeType.ESCAPED_CHAR: self._translate_escaped_char, NodeType.DOT: self._translate_dot, NodeType.POSITIVE_SET: self._translate_positive_set, NodeType.NEGATIVE_SET: self._translate_negative_set, NodeType.CAPTURING_GROUP: self._translate_capturing_group, NodeType.NON_CAPTURING_GROUP: self._translate_non_capturing_group, NodeType.NAMED_GROUP: self._translate_named_group, NodeType.LOOKAHEAD: self._translate_lookahead, NodeType.NEGATIVE_LOOKAHEAD: self._translate_negative_lookahead, NodeType.LOOKBEHIND: self._translate_lookbehind, NodeType.NEGATIVE_LOOKBEHIND: self._translate_negative_lookbehind, NodeType.QUANTIFIER: self._translate_quantifier, NodeType.ANCHOR_START: self._translate_anchor_start, NodeType.ANCHOR_END: self._translate_anchor_end, NodeType.WORD_BOUNDARY: self._translate_word_boundary, NodeType.NON_WORD_BOUNDARY: self._translate_non_word_boundary, NodeType.BRANCH: self._translate_branch, NodeType.START_OF_STRING: self._translate_start_of_string, NodeType.END_OF_STRING: self._translate_end_of_string, NodeType.DIGIT: self._translate_digit, NodeType.NON_DIGIT: self._translate_non_digit, NodeType.WORD_CHAR: self._translate_word_char, NodeType.NON_WORD_CHAR: self._translate_non_word_char, NodeType.WHITESPACE: self._translate_whitespace, NodeType.NON_WHITESPACE: self._translate_non_whitespace, NodeType.BACKREFERENCE: self._translate_backreference, } handler = handlers.get(node.node_type) if handler: return handler(node) return f"[{node.node_type.value}]" def _translate_sequence(self, node: RegexNode) -> str: """Translate a sequence of nodes.""" if not node.children: return "empty string" parts = [] for child in node.children: if child.node_type == NodeType.BRANCH: branch_parts = [self._translate_node(c) for c in child.children] if len(branch_parts) == 1: parts.append(branch_parts[0]) else: parts.append("(" + " OR ".join(branch_parts) + ")") else: parts.append(self._translate_node(child)) return "".join(parts) def _translate_branch(self, node: RegexNode) -> str: """Translate a branch (alternation).""" if not node.children: return "" parts = [self._translate_node(child) for child in node.children] return " OR ".join(parts) def _translate_literal(self, node: LiteralNode) -> str: """Translate a literal node.""" value = node.value value = value.replace("\\", "backslash ") value = value.replace(".", "period ") value = value.replace("*", "asterisk ") value = value.replace("+", "plus ") value = value.replace("?", "question mark ") value = value.replace("$", "dollar sign ") value = value.replace("^", "caret ") value = value.replace("|", "pipe ") value = value.replace("(", "left parenthesis ") value = value.replace(")", "right parenthesis ") value = value.replace("[", "left bracket ") value = value.replace("]", "right bracket ") value = value.replace("{", "left brace ") value = value.replace("}", "right brace ") value = value.replace("\t", "tab ") value = value.replace("\n", "newline ") value = value.replace("\r", "carriage return ") value = value.replace(" ", "space ") return value def _translate_escaped_char(self, node: LiteralNode) -> str: """Translate an escaped character.""" value = node.value if value == " ": return "space" elif value == "\t": return "tab character (escape sequence \\t)" elif value == "\n": return "newline character (escape sequence \\n)" elif value == "\r": return "carriage return (escape sequence \\r)" return f"'{value}'" def _translate_dot(self, node: RegexNode) -> str: """Translate a dot (any character).""" return "any single character" def _translate_positive_set(self, node: CharacterClassNode) -> str: """Translate a positive character set like [a-z].""" parts = [] for start, end in node.ranges: parts.append(f"any character from {start} through {end}") for char in node.characters: if char == '-': parts.append("hyphen") else: parts.append(f"'{char}'") if not parts: return "any character in empty set" if len(parts) == 1: return parts[0] return "any of: " + ", ".join(parts) def _translate_negative_set(self, node: CharacterClassNode) -> str: """Translate a negative character set like [^a-z].""" positive = self._translate_positive_set(node) if positive.startswith("any character from"): return "any character EXCEPT " + positive[20:] return f"any character EXCEPT {positive[7:]}" def _translate_capturing_group(self, node: GroupNode) -> str: """Translate a capturing group.""" if node.children: content = self._translate_node(node.children[0]) return f"capturing group: ({content})" return "capturing group: ()" def _translate_non_capturing_group(self, node: GroupNode) -> str: """Translate a non-capturing group.""" if node.children: content = self._translate_node(node.children[0]) return f"non-capturing group: ({content})" return "non-capturing group: ()" def _translate_named_group(self, node: GroupNode) -> str: """Translate a named group.""" name = node.name or "unnamed" if node.children: content = self._translate_node(node.children[0]) return f"named group '{name}': ({content})" return f"named group '{name}': ()" def _translate_lookahead(self, node: GroupNode) -> str: """Translate a positive lookahead.""" if node.children: content = self._translate_node(node.children[0]) return f"followed by ({content})" return "followed by ()" def _translate_negative_lookahead(self, node: GroupNode) -> str: """Translate a negative lookahead.""" if node.children: content = self._translate_node(node.children[0]) return f"NOT followed by ({content})" return "NOT followed by ()" def _translate_lookbehind(self, node: GroupNode) -> str: """Translate a lookbehind.""" if node.children: content = self._translate_node(node.children[0]) return f"preceded by ({content})" return "preceded by ()" def _translate_negative_lookbehind(self, node: GroupNode) -> str: """Translate a negative lookbehind.""" if node.children: content = self._translate_node(node.children[0]) return f"NOT preceded by ({content})" return "NOT preceded by ()" def _translate_quantifier(self, node: QuantifierNode) -> str: """Translate a quantifier.""" if not node.children: return "[empty quantifier]" child = node.children[0] base = self._translate_node(child) lazy_str = " (lazy)" if node.is_lazy else "" possessive_str = " (possessive)" if node.is_possessive else "" if node.min_count == 0 and node.max_count == 1: return f"optional: {base}{lazy_str}{possessive_str}" elif node.min_count == 0 and node.max_count == float('inf'): return f"zero or more of: {base}{lazy_str}{possessive_str}" elif node.min_count == 1 and node.max_count == float('inf'): return f"one or more of: {base}{lazy_str}{possessive_str}" elif node.min_count == node.max_count: count = node.min_count if count == 1: return base else: return f"exactly {count} of: {base}{lazy_str}{possessive_str}" elif node.max_count == float('inf'): return f"at least {node.min_count} of: {base}{lazy_str}{possessive_str}" else: return f"between {node.min_count} and {node.max_count} of: {base}{lazy_str}{possessive_str}" def _translate_anchor_start(self, node: RegexNode) -> str: """Translate start anchor.""" return "at the start of line or string" def _translate_anchor_end(self, node: RegexNode) -> str: """Translate end anchor.""" return "at the end of line or string" def _translate_word_boundary(self, node: RegexNode) -> str: """Translate word boundary.""" return "at a word boundary" def _translate_non_word_boundary(self, node: RegexNode) -> str: """Translate non-word boundary.""" return "not at a word boundary" def _translate_start_of_string(self, node: RegexNode) -> str: """Translate start of string anchor.""" return "at the start of the string" def _translate_end_of_string(self, node: RegexNode) -> str: """Translate end of string anchor.""" return "at the end of the string" def _translate_digit(self, node: RegexNode) -> str: """Translate digit character class.""" return "any digit (0-9)" def _translate_non_digit(self, node: RegexNode) -> str: """Translate non-digit character class.""" return "any non-digit character" def _translate_word_char(self, node: RegexNode) -> str: """Translate word character class.""" return "any word character (a-z, A-Z, 0-9, underscore)" def _translate_non_word_char(self, node: RegexNode) -> str: """Translate non-word character class.""" return "any non-word character" def _translate_whitespace(self, node: RegexNode) -> str: """Translate whitespace character class.""" return "any whitespace character (space, tab, newline, etc.)" def _translate_non_whitespace(self, node: RegexNode) -> str: """Translate non-whitespace character class.""" return "any non-whitespace character" def _translate_backreference(self, node: RegexNode) -> str: """Translate a backreference.""" return f"same as capture group \\\{node.raw}" def translate_regex(pattern: str, flavor: str = "pcre") -> str: """Translate a regex pattern to human-readable English.""" translator = RegexTranslator(flavor) return translator.translate(pattern)