# Reconstructed from git patch 2a00980cf865045d487af2f6dd926aa6ce77ccf3
# (author 7000pctAUTO, Fri, 6 Feb 2026 01:09:45 +0000):
# "Initial upload: regex-humanizer-cli with CI/CD workflow".
# Target file: regex_humanizer/translator.py
"""Translator for converting regex AST to human-readable English."""

from typing import Optional
from .parser import (
    RegexNode, NodeType, LiteralNode, CharacterClassNode,
    QuantifierNode, GroupNode, RegexParser
)


# Single-pass character -> spoken-name table used by ``_translate_literal``.
# A translate table (rather than chained ``str.replace`` calls) guarantees
# that text inserted for one character is never rewritten by a later rule,
# e.g. the trailing space of "period " must not turn into "space ".
_LITERAL_NAMES = str.maketrans({
    "\\": "backslash ",
    ".": "period ",
    "*": "asterisk ",
    "+": "plus ",
    "?": "question mark ",
    "$": "dollar sign ",
    "^": "caret ",
    "|": "pipe ",
    "(": "left parenthesis ",
    ")": "right parenthesis ",
    "[": "left bracket ",
    "]": "right bracket ",
    "{": "left brace ",
    "}": "right brace ",
    "\t": "tab ",
    "\n": "newline ",
    "\r": "carriage return ",
    " ": "space ",
})


class RegexTranslator:
    """Translates regex AST nodes to human-readable English.

    Each ``NodeType`` is mapped to a ``_translate_*`` method; the public
    entry point is :meth:`translate`.
    """

    def __init__(self, flavor: str = "pcre"):
        """Store the regex flavor that is handed to ``RegexParser``."""
        self.flavor = flavor

    def translate(self, pattern: str) -> str:
        """Translate a regex pattern to human-readable English.

        Args:
            pattern: The regular expression source text.

        Returns:
            The English description of the AST root returned by
            ``RegexParser(pattern, self.flavor).parse()``.
        """
        parser = RegexParser(pattern, self.flavor)
        ast = parser.parse()
        return self._translate_node(ast)

    def _handler_table(self) -> dict:
        """Return the ``NodeType`` -> bound-method dispatch table.

        The table is built on first use and cached on the instance so that
        the recursive walk does not rebuild it for every node visited.
        """
        table = getattr(self, "_handlers", None)
        if table is None:
            table = {
                NodeType.SEQUENCE: self._translate_sequence,
                NodeType.LITERAL: self._translate_literal,
                NodeType.ESCAPED_CHAR: self._translate_escaped_char,
                NodeType.DOT: self._translate_dot,
                NodeType.POSITIVE_SET: self._translate_positive_set,
                NodeType.NEGATIVE_SET: self._translate_negative_set,
                NodeType.CAPTURING_GROUP: self._translate_capturing_group,
                NodeType.NON_CAPTURING_GROUP: self._translate_non_capturing_group,
                NodeType.NAMED_GROUP: self._translate_named_group,
                NodeType.LOOKAHEAD: self._translate_lookahead,
                NodeType.NEGATIVE_LOOKAHEAD: self._translate_negative_lookahead,
                NodeType.LOOKBEHIND: self._translate_lookbehind,
                NodeType.NEGATIVE_LOOKBEHIND: self._translate_negative_lookbehind,
                NodeType.QUANTIFIER: self._translate_quantifier,
                NodeType.ANCHOR_START: self._translate_anchor_start,
                NodeType.ANCHOR_END: self._translate_anchor_end,
                NodeType.WORD_BOUNDARY: self._translate_word_boundary,
                NodeType.NON_WORD_BOUNDARY: self._translate_non_word_boundary,
                NodeType.BRANCH: self._translate_branch,
                NodeType.START_OF_STRING: self._translate_start_of_string,
                NodeType.END_OF_STRING: self._translate_end_of_string,
                NodeType.DIGIT: self._translate_digit,
                NodeType.NON_DIGIT: self._translate_non_digit,
                NodeType.WORD_CHAR: self._translate_word_char,
                NodeType.NON_WORD_CHAR: self._translate_non_word_char,
                NodeType.WHITESPACE: self._translate_whitespace,
                NodeType.NON_WHITESPACE: self._translate_non_whitespace,
                NodeType.BACKREFERENCE: self._translate_backreference,
            }
            self._handlers = table
        return table

    def _translate_node(self, node: Optional[RegexNode]) -> str:
        """Translate a single node by dispatching on ``node.node_type``.

        Returns an empty string for ``None`` and ``"[<type value>]"`` for a
        node type that has no registered handler.
        """
        if node is None:
            return ""

        handler = self._handler_table().get(node.node_type)
        if handler:
            return handler(node)
        return f"[{node.node_type.value}]"

    def _translate_sequence(self, node: RegexNode) -> str:
        """Translate a sequence of nodes.

        An empty sequence yields ``"empty string"``. A direct ``BRANCH``
        child is rendered inline: a single alternative as-is, several as
        ``"(a OR b)"``.
        """
        if not node.children:
            return "empty string"

        parts = []
        for child in node.children:
            if child.node_type == NodeType.BRANCH:
                alternatives = [self._translate_node(c) for c in child.children]
                if len(alternatives) == 1:
                    parts.append(alternatives[0])
                else:
                    parts.append("(" + " OR ".join(alternatives) + ")")
            else:
                parts.append(self._translate_node(child))

        # NOTE(review): parts are concatenated with no separator, so two
        # adjacent non-literal descriptions run together. Kept as-is because
        # changing it would alter every multi-node translation.
        return "".join(parts)

    def _translate_branch(self, node: RegexNode) -> str:
        """Translate a branch (alternation) as its children joined by " OR "."""
        if not node.children:
            return ""

        return " OR ".join(self._translate_node(child) for child in node.children)

    def _translate_literal(self, node: LiteralNode) -> str:
        """Translate a literal node.

        Regex metacharacters, tab/newline/carriage return and the space
        character in ``node.value`` are spelled out (each followed by one
        space); every other character is passed through unchanged.
        """
        # One pass over the text: replacement text is never re-scanned, so
        # "." becomes "period " rather than "periodspace ".
        return node.value.translate(_LITERAL_NAMES)

    def _translate_escaped_char(self, node: LiteralNode) -> str:
        """Translate an escaped character.

        Space, tab, newline and carriage return get a descriptive name;
        anything else is returned wrapped in single quotes.
        """
        value = node.value
        if value == " ":
            return "space"
        elif value == "\t":
            return "tab character (escape sequence \\t)"
        elif value == "\n":
            return "newline character (escape sequence \\n)"
        elif value == "\r":
            return "carriage return (escape sequence \\r)"
        return f"'{value}'"

    def _translate_dot(self, node: RegexNode) -> str:
        """Translate a dot (any character)."""
        return "any single character"

    def _translate_positive_set(self, node: CharacterClassNode) -> str:
        """Translate a positive character set like [a-z].

        Ranges from ``node.ranges`` are listed first, then the individual
        ``node.characters`` (a literal hyphen is spelled "hyphen"). A single
        member is returned on its own; several are prefixed with "any of: ".
        """
        parts = [
            f"any character from {start} through {end}"
            for start, end in node.ranges
        ]
        parts.extend(
            "hyphen" if char == '-' else f"'{char}'"
            for char in node.characters
        )

        if not parts:
            return "any character in empty set"

        if len(parts) == 1:
            return parts[0]

        return "any of: " + ", ".join(parts)

    def _translate_negative_set(self, node: CharacterClassNode) -> str:
        """Translate a negative character set like [^a-z].

        The excluded members are described directly from ``node.ranges`` and
        ``node.characters`` instead of slicing a fixed number of characters
        off the positive-set text, which dropped the start of a range and
        produced an empty description for a single character.

        Examples of the output shape: ``"any character EXCEPT a through z"``,
        ``"any character EXCEPT 'a', 'b'"``. A negated set with no members
        excludes nothing and is described as ``"any character"``.
        """
        excluded = [f"{start} through {end}" for start, end in node.ranges]
        excluded.extend(
            "hyphen" if char == '-' else f"'{char}'"
            for char in node.characters
        )

        if not excluded:
            return "any character"
        return "any character EXCEPT " + ", ".join(excluded)

    def _describe_group(self, label: str, node: GroupNode) -> str:
        """Return ``label`` followed by the parenthesised first child.

        Only ``node.children[0]`` is translated; a group without children
        renders as ``label + "()"``.
        """
        content = self._translate_node(node.children[0]) if node.children else ""
        return f"{label}({content})"

    def _translate_capturing_group(self, node: GroupNode) -> str:
        """Translate a capturing group."""
        return self._describe_group("capturing group: ", node)

    def _translate_non_capturing_group(self, node: GroupNode) -> str:
        """Translate a non-capturing group."""
        return self._describe_group("non-capturing group: ", node)

    def _translate_named_group(self, node: GroupNode) -> str:
        """Translate a named group; a missing name is shown as 'unnamed'."""
        name = node.name or "unnamed"
        return self._describe_group(f"named group '{name}': ", node)

    def _translate_lookahead(self, node: GroupNode) -> str:
        """Translate a positive lookahead."""
        return self._describe_group("followed by ", node)

    def _translate_negative_lookahead(self, node: GroupNode) -> str:
        """Translate a negative lookahead."""
        return self._describe_group("NOT followed by ", node)

    def _translate_lookbehind(self, node: GroupNode) -> str:
        """Translate a lookbehind."""
        return self._describe_group("preceded by ", node)

    def _translate_negative_lookbehind(self, node: GroupNode) -> str:
        """Translate a negative lookbehind."""
        return self._describe_group("NOT preceded by ", node)

    def _translate_quantifier(self, node: QuantifierNode) -> str:
        """Translate a quantifier applied to its first child.

        The wording is chosen from ``node.min_count`` / ``node.max_count``
        (``float('inf')`` meaning unbounded); " (lazy)" and " (possessive)"
        are appended when the corresponding flags are set. A count of
        exactly 1 returns the child's description unchanged.
        """
        if not node.children:
            return "[empty quantifier]"

        base = self._translate_node(node.children[0])
        suffix = (" (lazy)" if node.is_lazy else "") + (
            " (possessive)" if node.is_possessive else ""
        )
        low, high = node.min_count, node.max_count
        unbounded = float('inf')

        if low == 0 and high == 1:
            return f"optional: {base}{suffix}"
        elif low == 0 and high == unbounded:
            return f"zero or more of: {base}{suffix}"
        elif low == 1 and high == unbounded:
            return f"one or more of: {base}{suffix}"
        elif low == high:
            if low == 1:
                # {1} is a no-op quantifier: describe the operand only.
                return base
            return f"exactly {low} of: {base}{suffix}"
        elif high == unbounded:
            return f"at least {low} of: {base}{suffix}"
        else:
            return f"between {low} and {high} of: {base}{suffix}"

    def _translate_anchor_start(self, node: RegexNode) -> str:
        """Translate start anchor."""
        return "at the start of line or string"

    def _translate_anchor_end(self, node: RegexNode) -> str:
        """Translate end anchor."""
        return "at the end of line or string"

    def _translate_word_boundary(self, node: RegexNode) -> str:
        """Translate word boundary."""
        return "at a word boundary"

    def _translate_non_word_boundary(self, node: RegexNode) -> str:
        """Translate non-word boundary."""
        return "not at a word boundary"

    def _translate_start_of_string(self, node: RegexNode) -> str:
        """Translate start of string anchor."""
        return "at the start of the string"

    def _translate_end_of_string(self, node: RegexNode) -> str:
        """Translate end of string anchor."""
        return "at the end of the string"

    def _translate_digit(self, node: RegexNode) -> str:
        """Translate digit character class."""
        return "any digit (0-9)"

    def _translate_non_digit(self, node: RegexNode) -> str:
        """Translate non-digit character class."""
        return "any non-digit character"

    def _translate_word_char(self, node: RegexNode) -> str:
        """Translate word character class."""
        return "any word character (a-z, A-Z, 0-9, underscore)"

    def _translate_non_word_char(self, node: RegexNode) -> str:
        """Translate non-word character class."""
        return "any non-word character"

    def _translate_whitespace(self, node: RegexNode) -> str:
        """Translate whitespace character class."""
        return "any whitespace character (space, tab, newline, etc.)"

    def _translate_non_whitespace(self, node: RegexNode) -> str:
        """Translate non-whitespace character class."""
        return "any non-whitespace character"

    def _translate_backreference(self, node: RegexNode) -> str:
        """Translate a backreference using the node's ``raw`` text."""
        # Emits two literal backslashes before ``node.raw``, exactly as the
        # previous ``\\\{...}`` spelling did, but without the invalid ``\{``
        # escape sequence that newer Python versions warn about.
        # NOTE(review): confirm whether two backslashes are really intended.
        return f"same as capture group \\\\{node.raw}"


def translate_regex(pattern: str, flavor: str = "pcre") -> str:
    """Translate a regex pattern to human-readable English.

    Convenience wrapper that builds a ``RegexTranslator`` for ``flavor``
    and returns its translation of ``pattern``.
    """
    translator = RegexTranslator(flavor)
    return translator.translate(pattern)