Files
api-testgen-cli/regex_humanizer/translator.py
CI Bot 52e792305b feat: initial commit for regex-humanizer-cli
- Add regex parser, translator, and test generator
- Add CLI with explain, test, interactive commands
- Add multi-flavor support (PCRE, JavaScript, Python)
- Add Gitea Actions CI workflow
- Add comprehensive README documentation
2026-02-06 03:02:57 +00:00

292 lines
12 KiB
Python

"""Translator for converting regex AST to human-readable English."""
from .parser import (
RegexNode, NodeType, LiteralNode, CharacterClassNode,
QuantifierNode, GroupNode, RegexParser
)
class RegexTranslator:
    """Translates regex AST nodes to human-readable English.

    The pattern is parsed with ``RegexParser`` and the resulting tree is
    walked recursively; every node type has a dedicated ``_translate_*``
    method that returns an English fragment.
    """

    # Single-pass replacement table for literal text. Using one translate()
    # call (instead of chained str.replace calls) guarantees that the spaces
    # contained in the inserted names are never themselves rewritten.
    _LITERAL_NAMES = str.maketrans({
        "\\": "backslash ",
        ".": "period ",
        "*": "asterisk ",
        "+": "plus ",
        "?": "question mark ",
        "$": "dollar sign ",
        "^": "caret ",
        "|": "pipe ",
        "(": "left parenthesis ",
        ")": "right parenthesis ",
        "[": "left bracket ",
        "]": "right bracket ",
        "{": "left brace ",
        "}": "right brace ",
        "\t": "tab ",
        "\n": "newline ",
        "\r": "carriage return ",
        " ": "space ",
    })

    def __init__(self, flavor: str = "pcre"):
        """Create a translator.

        Args:
            flavor: Regex flavor name handed to ``RegexParser`` unchanged.
        """
        self.flavor = flavor

    def translate(self, pattern: str) -> str:
        """Translate a regex pattern to human-readable English.

        Args:
            pattern: The regular expression source text.

        Returns:
            The English description of the parsed pattern.
        """
        parser = RegexParser(pattern, self.flavor)
        ast = parser.parse()
        return self._translate_node(ast)

    def _translate_node(self, node: RegexNode) -> str:
        """Translate a single node by dispatching on its ``node_type``.

        Returns:
            ``""`` when ``node`` is ``None``; the handler's output when the
            node type is known; otherwise ``"[<node_type value>]"`` as a
            visible placeholder.
        """
        if node is None:
            return ""
        handlers = {
            NodeType.SEQUENCE: self._translate_sequence,
            NodeType.LITERAL: self._translate_literal,
            NodeType.ESCAPED_CHAR: self._translate_escaped_char,
            NodeType.DOT: self._translate_dot,
            NodeType.POSITIVE_SET: self._translate_positive_set,
            NodeType.NEGATIVE_SET: self._translate_negative_set,
            NodeType.CAPTURING_GROUP: self._translate_capturing_group,
            NodeType.NON_CAPTURING_GROUP: self._translate_non_capturing_group,
            NodeType.NAMED_GROUP: self._translate_named_group,
            NodeType.LOOKAHEAD: self._translate_lookahead,
            NodeType.NEGATIVE_LOOKAHEAD: self._translate_negative_lookahead,
            NodeType.LOOKBEHIND: self._translate_lookbehind,
            NodeType.NEGATIVE_LOOKBEHIND: self._translate_negative_lookbehind,
            NodeType.QUANTIFIER: self._translate_quantifier,
            NodeType.ANCHOR_START: self._translate_anchor_start,
            NodeType.ANCHOR_END: self._translate_anchor_end,
            NodeType.WORD_BOUNDARY: self._translate_word_boundary,
            NodeType.NON_WORD_BOUNDARY: self._translate_non_word_boundary,
            NodeType.BRANCH: self._translate_branch,
            NodeType.START_OF_STRING: self._translate_start_of_string,
            NodeType.END_OF_STRING: self._translate_end_of_string,
            NodeType.DIGIT: self._translate_digit,
            NodeType.NON_DIGIT: self._translate_non_digit,
            NodeType.WORD_CHAR: self._translate_word_char,
            NodeType.NON_WORD_CHAR: self._translate_non_word_char,
            NodeType.WHITESPACE: self._translate_whitespace,
            NodeType.NON_WHITESPACE: self._translate_non_whitespace,
            NodeType.BACKREFERENCE: self._translate_backreference,
        }
        handler = handlers.get(node.node_type)
        if handler:
            return handler(node)
        return f"[{node.node_type.value}]"

    def _translate_sequence(self, node: RegexNode) -> str:
        """Translate a sequence of nodes.

        Child translations are concatenated without a separator. A BRANCH
        child with several alternatives is wrapped in parentheses so the
        alternation stays visually grouped inside the sequence.

        Returns:
            ``"empty string"`` when the sequence has no children.
        """
        if not node.children:
            return "empty string"
        parts = []
        for child in node.children:
            if child.node_type == NodeType.BRANCH:
                branch_parts = [self._translate_node(c) for c in child.children]
                if len(branch_parts) == 1:
                    parts.append(branch_parts[0])
                else:
                    parts.append("(" + " OR ".join(branch_parts) + ")")
            else:
                parts.append(self._translate_node(child))
        return "".join(parts)

    def _translate_branch(self, node: RegexNode) -> str:
        """Translate a branch (alternation) as its children joined by " OR "."""
        if not node.children:
            return ""
        return " OR ".join(self._translate_node(child) for child in node.children)

    def _translate_literal(self, node: LiteralNode) -> str:
        """Translate a literal node.

        Regex metacharacters and whitespace in ``node.value`` are spelled
        out by name (each name carries a trailing space); every other
        character is kept as-is. The substitution happens in one pass, so
        a space inside an inserted name such as "question mark " is not
        expanded again.
        """
        return node.value.translate(self._LITERAL_NAMES)

    def _translate_escaped_char(self, node: LiteralNode) -> str:
        """Translate an escaped character.

        Space, tab, newline and carriage return get a descriptive name;
        any other value is returned wrapped in single quotes.
        """
        value = node.value
        if value == " ":
            return "space"
        elif value == "\t":
            return "tab character (escape sequence \\t)"
        elif value == "\n":
            return "newline character (escape sequence \\n)"
        elif value == "\r":
            return "carriage return (escape sequence \\r)"
        return f"'{value}'"

    def _translate_dot(self, node: RegexNode) -> str:
        """Translate a dot (any character)."""
        return "any single character"

    @staticmethod
    def _describe_set_members(node: CharacterClassNode) -> tuple:
        """Describe the members of a character set without any lead-in text.

        Returns:
            A ``(ranges, characters)`` pair of string lists: ranges read
            "<start> through <end>", characters are quoted, and a literal
            hyphen is spelled out as "hyphen".
        """
        ranges = [f"{start} through {end}" for start, end in node.ranges]
        characters = [
            "hyphen" if char == "-" else f"'{char}'"
            for char in node.characters
        ]
        return ranges, characters

    def _translate_positive_set(self, node: CharacterClassNode) -> str:
        """Translate a positive character set like [a-z].

        Ranges are listed before individual characters. A single member is
        returned on its own; several members are prefixed with "any of: ".
        """
        ranges, characters = self._describe_set_members(node)
        parts = [f"any character from {span}" for span in ranges] + characters
        if not parts:
            return "any character in empty set"
        if len(parts) == 1:
            return parts[0]
        return "any of: " + ", ".join(parts)

    def _translate_negative_set(self, node: CharacterClassNode) -> str:
        """Translate a negative character set like [^a-z].

        The excluded members are described directly rather than by slicing
        the positive-set text at fixed offsets, which dropped characters
        (e.g. the start of a range) whenever the prefix length differed.

        Returns:
            "any character EXCEPT <members>", or plain "any character" when
            the set lists nothing to exclude.
        """
        ranges, characters = self._describe_set_members(node)
        members = ranges + characters
        if not members:
            # Nothing is excluded, so every character matches.
            return "any character"
        return "any character EXCEPT " + ", ".join(members)

    def _translate_group(self, node: GroupNode, label: str) -> str:
        """Render ``label`` followed by the parenthesized group content.

        Only the first child is translated; a group without children is
        rendered with empty parentheses.
        """
        content = self._translate_node(node.children[0]) if node.children else ""
        return f"{label} ({content})"

    def _translate_capturing_group(self, node: GroupNode) -> str:
        """Translate a capturing group."""
        return self._translate_group(node, "capturing group:")

    def _translate_non_capturing_group(self, node: GroupNode) -> str:
        """Translate a non-capturing group."""
        return self._translate_group(node, "non-capturing group:")

    def _translate_named_group(self, node: GroupNode) -> str:
        """Translate a named group; a missing name is shown as 'unnamed'."""
        name = node.name or "unnamed"
        return self._translate_group(node, f"named group '{name}':")

    def _translate_lookahead(self, node: GroupNode) -> str:
        """Translate a positive lookahead."""
        return self._translate_group(node, "followed by")

    def _translate_negative_lookahead(self, node: GroupNode) -> str:
        """Translate a negative lookahead."""
        return self._translate_group(node, "NOT followed by")

    def _translate_lookbehind(self, node: GroupNode) -> str:
        """Translate a positive lookbehind."""
        return self._translate_group(node, "preceded by")

    def _translate_negative_lookbehind(self, node: GroupNode) -> str:
        """Translate a negative lookbehind."""
        return self._translate_group(node, "NOT preceded by")

    def _translate_quantifier(self, node: QuantifierNode) -> str:
        """Translate a quantifier applied to its first child.

        The wording depends on ``min_count``/``max_count`` (an unbounded
        maximum is represented by ``float('inf')``). " (lazy)" and
        " (possessive)" suffixes are appended when the node sets those
        flags. A quantifier of exactly one is redundant, so the bare child
        translation is returned for it.

        Returns:
            ``"[empty quantifier]"`` when the node has no children.
        """
        if not node.children:
            return "[empty quantifier]"
        base = self._translate_node(node.children[0])
        lazy_str = " (lazy)" if node.is_lazy else ""
        possessive_str = " (possessive)" if node.is_possessive else ""
        suffix = f"{base}{lazy_str}{possessive_str}"
        unbounded = float('inf')

        if node.min_count == 0 and node.max_count == 1:
            return f"optional: {suffix}"
        elif node.min_count == 0 and node.max_count == unbounded:
            return f"zero or more of: {suffix}"
        elif node.min_count == 1 and node.max_count == unbounded:
            return f"one or more of: {suffix}"
        elif node.min_count == node.max_count:
            count = node.min_count
            if count == 1:
                return base
            return f"exactly {count} of: {suffix}"
        elif node.max_count == unbounded:
            return f"at least {node.min_count} of: {suffix}"
        else:
            return f"between {node.min_count} and {node.max_count} of: {suffix}"

    def _translate_anchor_start(self, node: RegexNode) -> str:
        """Translate start anchor."""
        return "at the start of line or string"

    def _translate_anchor_end(self, node: RegexNode) -> str:
        """Translate end anchor."""
        return "at the end of line or string"

    def _translate_word_boundary(self, node: RegexNode) -> str:
        """Translate word boundary."""
        return "at a word boundary"

    def _translate_non_word_boundary(self, node: RegexNode) -> str:
        """Translate non-word boundary."""
        return "not at a word boundary"

    def _translate_start_of_string(self, node: RegexNode) -> str:
        """Translate start of string anchor."""
        return "at the start of the string"

    def _translate_end_of_string(self, node: RegexNode) -> str:
        """Translate end of string anchor."""
        return "at the end of the string"

    def _translate_digit(self, node: RegexNode) -> str:
        """Translate digit character class."""
        return "any digit (0-9)"

    def _translate_non_digit(self, node: RegexNode) -> str:
        """Translate non-digit character class."""
        return "any non-digit character"

    def _translate_word_char(self, node: RegexNode) -> str:
        """Translate word character class."""
        return "any word character (a-z, A-Z, 0-9, underscore)"

    def _translate_non_word_char(self, node: RegexNode) -> str:
        """Translate non-word character class."""
        return "any non-word character"

    def _translate_whitespace(self, node: RegexNode) -> str:
        """Translate whitespace character class."""
        return "any whitespace character (space, tab, newline, etc.)"

    def _translate_non_whitespace(self, node: RegexNode) -> str:
        """Translate non-whitespace character class."""
        return "any non-whitespace character"

    def _translate_backreference(self, node: RegexNode) -> str:
        """Translate a backreference using the node's ``raw`` text."""
        return f"same as capture group \\{node.raw}"
def translate_regex(pattern: str, flavor: str = "pcre") -> str:
    """Describe ``pattern`` in plain English.

    Convenience wrapper that builds a ``RegexTranslator`` for ``flavor``
    and returns the result of its ``translate`` method.
    """
    return RegexTranslator(flavor).translate(pattern)