Initial upload: regex-humanizer-cli with CI/CD workflow
This commit is contained in:
291
regex_humanizer/translator.py
Normal file
291
regex_humanizer/translator.py
Normal file
@@ -0,0 +1,291 @@
|
||||
"""Translator for converting regex AST to human-readable English."""
|
||||
|
||||
from typing import Optional
|
||||
from .parser import (
|
||||
RegexNode, NodeType, LiteralNode, CharacterClassNode,
|
||||
QuantifierNode, GroupNode, RegexParser
|
||||
)
|
||||
|
||||
|
||||
class RegexTranslator:
    """Translate a parsed regex AST into a human-readable English description.

    A pattern is parsed with ``RegexParser`` and the resulting tree is walked
    recursively: every node is dispatched on ``node.node_type`` to one of the
    ``_translate_*`` methods below, each of which returns a plain string.
    """

    # Spelled-out names for characters that appear inside a literal.
    # Applied with ``str.translate`` so each input character is mapped exactly
    # once; chained ``str.replace`` calls would re-process the spaces inside
    # names that were already substituted ("period " -> "periodspace ").
    _LITERAL_CHAR_NAMES = str.maketrans({
        "\\": "backslash ",
        ".": "period ",
        "*": "asterisk ",
        "+": "plus ",
        "?": "question mark ",
        "$": "dollar sign ",
        "^": "caret ",
        "|": "pipe ",
        "(": "left parenthesis ",
        ")": "right parenthesis ",
        "[": "left bracket ",
        "]": "right bracket ",
        "{": "left brace ",
        "}": "right brace ",
        "\t": "tab ",
        "\n": "newline ",
        "\r": "carriage return ",
        " ": "space ",
    })

    # Descriptions for escaped characters that have a dedicated wording.
    _ESCAPED_CHAR_NAMES = {
        " ": "space",
        "\t": "tab character (escape sequence \\t)",
        "\n": "newline character (escape sequence \\n)",
        "\r": "carriage return (escape sequence \\r)",
    }

    def __init__(self, flavor: str = "pcre"):
        """Store the regex flavor that is forwarded to ``RegexParser``."""
        self.flavor = flavor

    def translate(self, pattern: str) -> str:
        """Translate a regex pattern to human-readable English.

        Args:
            pattern: The regular expression source text.

        Returns:
            The English description of the parsed pattern.
        """
        parser = RegexParser(pattern, self.flavor)
        ast = parser.parse()
        return self._translate_node(ast)

    def _translate_node(self, node: Optional[RegexNode]) -> str:
        """Translate a single node by dispatching on its ``node_type``.

        Returns an empty string for ``None`` and ``"[<type value>]"`` for a
        node type that has no dedicated handler.
        """
        if node is None:
            return ""

        handlers = {
            NodeType.SEQUENCE: self._translate_sequence,
            NodeType.LITERAL: self._translate_literal,
            NodeType.ESCAPED_CHAR: self._translate_escaped_char,
            NodeType.DOT: self._translate_dot,
            NodeType.POSITIVE_SET: self._translate_positive_set,
            NodeType.NEGATIVE_SET: self._translate_negative_set,
            NodeType.CAPTURING_GROUP: self._translate_capturing_group,
            NodeType.NON_CAPTURING_GROUP: self._translate_non_capturing_group,
            NodeType.NAMED_GROUP: self._translate_named_group,
            NodeType.LOOKAHEAD: self._translate_lookahead,
            NodeType.NEGATIVE_LOOKAHEAD: self._translate_negative_lookahead,
            NodeType.LOOKBEHIND: self._translate_lookbehind,
            NodeType.NEGATIVE_LOOKBEHIND: self._translate_negative_lookbehind,
            NodeType.QUANTIFIER: self._translate_quantifier,
            NodeType.ANCHOR_START: self._translate_anchor_start,
            NodeType.ANCHOR_END: self._translate_anchor_end,
            NodeType.WORD_BOUNDARY: self._translate_word_boundary,
            NodeType.NON_WORD_BOUNDARY: self._translate_non_word_boundary,
            NodeType.BRANCH: self._translate_branch,
            NodeType.START_OF_STRING: self._translate_start_of_string,
            NodeType.END_OF_STRING: self._translate_end_of_string,
            NodeType.DIGIT: self._translate_digit,
            NodeType.NON_DIGIT: self._translate_non_digit,
            NodeType.WORD_CHAR: self._translate_word_char,
            NodeType.NON_WORD_CHAR: self._translate_non_word_char,
            NodeType.WHITESPACE: self._translate_whitespace,
            NodeType.NON_WHITESPACE: self._translate_non_whitespace,
            NodeType.BACKREFERENCE: self._translate_backreference,
        }

        handler = handlers.get(node.node_type)
        if handler is not None:
            return handler(node)
        # Unknown node type: surface it visibly instead of dropping it.
        return f"[{node.node_type.value}]"

    def _translate_sequence(self, node: RegexNode) -> str:
        """Translate a sequence by concatenating its children's translations.

        A ``BRANCH`` child is expanded inline: a single alternative is used
        as-is, several alternatives are rendered as ``(a OR b)``. A sequence
        without children is described as ``"empty string"``.
        """
        if not node.children:
            return "empty string"

        parts = []
        for child in node.children:
            if child.node_type != NodeType.BRANCH:
                parts.append(self._translate_node(child))
                continue
            alternatives = [self._translate_node(alt) for alt in child.children]
            if len(alternatives) == 1:
                parts.append(alternatives[0])
            else:
                parts.append("(" + " OR ".join(alternatives) + ")")

        return "".join(parts)

    def _translate_branch(self, node: RegexNode) -> str:
        """Translate a branch (alternation) as its children joined by `` OR ``."""
        if not node.children:
            return ""
        return " OR ".join(self._translate_node(child) for child in node.children)

    def _translate_literal(self, node: LiteralNode) -> str:
        """Translate a literal node, spelling out special characters.

        Regex metacharacters, whitespace and the space character in
        ``node.value`` are replaced by their English name followed by a
        space; all other characters are kept unchanged.
        """
        return node.value.translate(self._LITERAL_CHAR_NAMES)

    def _translate_escaped_char(self, node: LiteralNode) -> str:
        """Translate an escaped character.

        Space, tab, newline and carriage return get a descriptive name; any
        other value is returned wrapped in single quotes.
        """
        value = node.value
        return self._ESCAPED_CHAR_NAMES.get(value, f"'{value}'")

    def _translate_dot(self, node: RegexNode) -> str:
        """Translate a dot (any character)."""
        return "any single character"

    def _describe_set_members(self, node: CharacterClassNode, range_prefix: str) -> list:
        """Return one description per member of a character set.

        Ranges from ``node.ranges`` come first, each rendered as
        ``"<range_prefix><start> through <end>"``, followed by the single
        characters from ``node.characters`` (``-`` is named ``hyphen``, every
        other character is quoted).
        """
        members = [
            f"{range_prefix}{start} through {end}" for start, end in node.ranges
        ]
        members.extend(
            "hyphen" if char == "-" else f"'{char}'" for char in node.characters
        )
        return members

    def _translate_positive_set(self, node: CharacterClassNode) -> str:
        """Translate a positive character set like [a-z]."""
        members = self._describe_set_members(node, "any character from ")

        if not members:
            return "any character in empty set"
        if len(members) == 1:
            return members[0]
        return "any of: " + ", ".join(members)

    def _translate_negative_set(self, node: CharacterClassNode) -> str:
        """Translate a negative character set like [^a-z].

        The excluded members are listed after ``"any character EXCEPT "``.
        The description is built from the member list itself rather than by
        slicing the positive-set wording, which only worked for one specific
        shape of that text.
        """
        members = self._describe_set_members(node, "")
        if not members:
            # Nothing is excluded, so every character qualifies.
            return "any character"
        return "any character EXCEPT " + ", ".join(members)

    def _first_child_text(self, node: GroupNode) -> str:
        """Return the translation of the node's first child, or "" if it has none."""
        if node.children:
            return self._translate_node(node.children[0])
        return ""

    def _translate_capturing_group(self, node: GroupNode) -> str:
        """Translate a capturing group."""
        return f"capturing group: ({self._first_child_text(node)})"

    def _translate_non_capturing_group(self, node: GroupNode) -> str:
        """Translate a non-capturing group."""
        return f"non-capturing group: ({self._first_child_text(node)})"

    def _translate_named_group(self, node: GroupNode) -> str:
        """Translate a named group; a missing name is shown as 'unnamed'."""
        name = node.name or "unnamed"
        return f"named group '{name}': ({self._first_child_text(node)})"

    def _translate_lookahead(self, node: GroupNode) -> str:
        """Translate a positive lookahead."""
        return f"followed by ({self._first_child_text(node)})"

    def _translate_negative_lookahead(self, node: GroupNode) -> str:
        """Translate a negative lookahead."""
        return f"NOT followed by ({self._first_child_text(node)})"

    def _translate_lookbehind(self, node: GroupNode) -> str:
        """Translate a lookbehind."""
        return f"preceded by ({self._first_child_text(node)})"

    def _translate_negative_lookbehind(self, node: GroupNode) -> str:
        """Translate a negative lookbehind."""
        return f"NOT preceded by ({self._first_child_text(node)})"

    def _translate_quantifier(self, node: QuantifierNode) -> str:
        """Translate a quantifier applied to its first child.

        The wording is chosen from ``node.min_count`` / ``node.max_count``
        (an unbounded maximum is ``float('inf')``); ``" (lazy)"`` and
        ``" (possessive)"`` are appended when the matching flags are set.
        A quantifier of exactly one returns the child's translation alone.
        """
        if not node.children:
            return "[empty quantifier]"

        base = self._translate_node(node.children[0])

        suffix = ""
        if node.is_lazy:
            suffix += " (lazy)"
        if node.is_possessive:
            suffix += " (possessive)"

        low, high = node.min_count, node.max_count
        unbounded = float("inf")

        if low == 0 and high == 1:
            return f"optional: {base}{suffix}"
        if low == 0 and high == unbounded:
            return f"zero or more of: {base}{suffix}"
        if low == 1 and high == unbounded:
            return f"one or more of: {base}{suffix}"
        if low == high:
            if low == 1:
                # {1} is a no-op, so describe the element itself.
                return base
            return f"exactly {low} of: {base}{suffix}"
        if high == unbounded:
            return f"at least {low} of: {base}{suffix}"
        return f"between {low} and {high} of: {base}{suffix}"

    def _translate_anchor_start(self, node: RegexNode) -> str:
        """Translate start anchor."""
        return "at the start of line or string"

    def _translate_anchor_end(self, node: RegexNode) -> str:
        """Translate end anchor."""
        return "at the end of line or string"

    def _translate_word_boundary(self, node: RegexNode) -> str:
        """Translate word boundary."""
        return "at a word boundary"

    def _translate_non_word_boundary(self, node: RegexNode) -> str:
        """Translate non-word boundary."""
        return "not at a word boundary"

    def _translate_start_of_string(self, node: RegexNode) -> str:
        """Translate start of string anchor."""
        return "at the start of the string"

    def _translate_end_of_string(self, node: RegexNode) -> str:
        """Translate end of string anchor."""
        return "at the end of the string"

    def _translate_digit(self, node: RegexNode) -> str:
        """Translate digit character class."""
        return "any digit (0-9)"

    def _translate_non_digit(self, node: RegexNode) -> str:
        """Translate non-digit character class."""
        return "any non-digit character"

    def _translate_word_char(self, node: RegexNode) -> str:
        """Translate word character class."""
        return "any word character (a-z, A-Z, 0-9, underscore)"

    def _translate_non_word_char(self, node: RegexNode) -> str:
        """Translate non-word character class."""
        return "any non-word character"

    def _translate_whitespace(self, node: RegexNode) -> str:
        """Translate whitespace character class."""
        return "any whitespace character (space, tab, newline, etc.)"

    def _translate_non_whitespace(self, node: RegexNode) -> str:
        """Translate non-whitespace character class."""
        return "any non-whitespace character"

    def _translate_backreference(self, node: RegexNode) -> str:
        """Translate a backreference using the node's ``raw`` text.

        The result is ``"same as capture group "`` followed by two backslash
        characters and ``node.raw``.
        """
        # Written with valid escapes only; "\{" inside an f-string is an
        # invalid escape sequence and triggers a warning on newer Pythons.
        # NOTE(review): two backslashes are kept to preserve the existing
        # output - confirm whether a single one was intended.
        return f"same as capture group \\\\{node.raw}"
|
||||
|
||||
|
||||
def translate_regex(pattern: str, flavor: str = "pcre") -> str:
    """Describe a regex pattern in human-readable English.

    Convenience wrapper that builds a one-off ``RegexTranslator`` for the
    given flavor and returns its translation of ``pattern``.

    Args:
        pattern: The regular expression source text.
        flavor: Regex flavor passed to ``RegexTranslator``.

    Returns:
        The English description produced by ``RegexTranslator.translate``.
    """
    return RegexTranslator(flavor).translate(pattern)
|
||||
Reference in New Issue
Block a user