Add converter, examples, and flavors modules

2026-02-02 06:26:48 +00:00
parent 67cfe7f8bb
commit 6bd1c44fae
1 changed files with 270 additions and 0 deletions
--- a/regex_humanizer/converter/converter.py
+++ b/regex_humanizer/converter/converter.py
@@ -0,0 +1,270 @@
+from typing import List, Optional
+
+from ..parser import (
+    Alternation,
+    Anchor,
+    ASTNode,
+    Backreference,
+    CharacterClass,
+    Group,
+    Literal,
+    Quantifier,
+    SpecialSequence,
+    parse_regex,
+)
+
+
+def quantifier_description(quantifier: Quantifier, child_desc: str) -> str:
+    """Attach an English repetition phrase to an already-described element.
+
+    Args:
+        quantifier: Node whose ``min``, ``max``, ``lazy`` and ``possessive``
+            attributes are read.
+        child_desc: Description of the element being repeated.
+
+    Returns:
+        ``child_desc``, a comma, and the repetition phrase, followed by a
+        " (lazy)" or " (possessive)" marker when the matching flag is set.
+    """
+    lo, hi = quantifier.min, quantifier.max
+
+    if lo == 0 and hi == 1:
+        phrase = "optionally"
+    else:
+        # An upper bound equal to Quantifier.MAX_UNBOUNDED selects the
+        # open-ended ("or more" / "at least") wording.
+        open_ended = hi == Quantifier.MAX_UNBOUNDED
+        plural = "" if lo == 1 else "s"
+
+        # Order matters: the friendly special cases come before the generic
+        # "exactly" / "at least" / "between" wording.
+        if open_ended and lo == 0:
+            phrase = "zero or more times"
+        elif open_ended and lo == 1:
+            phrase = "one or more times"
+        elif lo == hi:
+            phrase = f"exactly {lo} time{plural}"
+        elif open_ended:
+            phrase = f"at least {lo} time{plural}"
+        else:
+            phrase = f"between {lo} and {hi} times"
+
+    # The lazy marker takes precedence when both flags are set.
+    marker = " (lazy)" if quantifier.lazy else (
+        " (possessive)" if quantifier.possessive else ""
+    )
+
+    return f"{child_desc}, {phrase}{marker}"
+
+
+def literal_description(node: Literal) -> str:
+    """Describe a single literal in plain English.
+
+    Common whitespace characters get a friendly name, regex metacharacters
+    are called out as literal characters, and any other value is labelled
+    as a letter, a digit, or a generic character according to its content.
+
+    Args:
+        node: Literal node; only its ``value`` attribute is read.
+
+    Returns:
+        A short English phrase for the literal.
+    """
+    value = node.value
+
+    whitespace_names = {
+        " ": "a space",
+        "\t": "a tab character",
+        "\n": "a newline character",
+        "\r": "a carriage return",
+    }
+    if value in whitespace_names:
+        return whitespace_names[value]
+
+    # Exact membership in a set of single characters: a substring test
+    # against the metacharacter string would also match "" and
+    # multi-character values such as "()".
+    metacharacters = frozenset(r".^$*+?{}[]\|()")
+    if value in metacharacters:
+        return f"the literal character '{value}'"
+
+    # Only call something a letter (or digit) when it really is one.
+    if value.isalpha():
+        return f"the letter '{value}'"
+    if value.isdigit():
+        return f"the digit '{value}'"
+    return f"the character '{value}'"
+
+
+def character_class_description(node: CharacterClass) -> str:
+    """Describe a bracketed character class in plain English.
+
+    Args:
+        node: Character-class node; its ``inverted``, ``characters`` and
+            ``ranges`` attributes are read. Each entry of ``ranges`` is
+            unpacked as a ``(start, end)`` pair.
+
+    Returns:
+        "an empty character class" when there are no members; otherwise
+        "any of ..." (or "any character except ..." when inverted) followed
+        by the members, with "or" before the last one.
+    """
+    friendly = {" ": "a space", "\t": "a tab", "\n": "a newline"}
+
+    # Individual characters first, then ranges, matching the original order.
+    members = [friendly.get(ch, f"'{ch}'") for ch in node.characters]
+    members.extend(
+        f"characters from '{low}' to '{high}'" for low, high in node.ranges
+    )
+
+    if not members:
+        return "an empty character class"
+
+    if len(members) == 1:
+        listing = members[0]
+    elif len(members) == 2:
+        listing = f"{members[0]} or {members[1]}"
+    else:
+        listing = ", ".join(members[:-1]) + f", or {members[-1]}"
+
+    lead = "any character except" if node.inverted else "any of"
+    return f"{lead} {listing}"
+
+
+def special_sequence_description(node: SpecialSequence) -> str:
+    """Translate a special sequence into an English phrase.
+
+    Args:
+        node: Node whose ``sequence`` attribute holds the sequence text.
+
+    Returns:
+        The phrase for a known sequence, or the sequence text itself when
+        it is not in the table.
+    """
+    phrases = {
+        ".": "any single character",
+        r"\d": "a digit (0-9)",
+        r"\D": "any non-digit character",
+        r"\w": "a word character (letter, digit, or underscore)",
+        r"\W": "any non-word character",
+        r"\s": "any whitespace character",
+        r"\S": "any non-whitespace character",
+        r"\b": "a word boundary",
+        r"\B": "a non-word boundary",
+        r"\A": "the start of the string",
+        r"\Z": "the end of the string",
+        r"\z": "the absolute end of the string",
+        r"^": "the start of the string",
+        r"$": "the end of the string",
+    }
+
+    token = node.sequence
+    # Unknown sequences are passed through unchanged.
+    return phrases[token] if token in phrases else token
+
+
+def anchor_description(node: Anchor) -> str:
+    """Translate an anchor into an English phrase.
+
+    Args:
+        node: Node whose ``kind`` attribute holds the anchor text.
+
+    Returns:
+        The phrase for a known anchor, or ``node.kind`` unchanged when the
+        anchor is not in the table.
+    """
+    kind = node.kind
+    phrases = {
+        "^": "the start of the string",
+        "$": "the end of the string",
+        r"\b": "a word boundary",
+        r"\B": "a position that is not a word boundary",
+    }
+    return phrases[kind] if kind in phrases else kind
+
+
+def group_description(node: Group) -> str:
+    """Describe a group and, when it has content, what it contains.
+
+    Args:
+        node: Group node; its ``name``, ``capturing`` and ``content``
+            attributes are read.
+
+    Returns:
+        "an empty <label> group" when ``content`` is falsy, otherwise
+        "a <label> group containing: <description of content>". The label
+        is "named '<name>'" if the group has a name, else "capturing" or
+        "non-capturing".
+    """
+    # A name takes priority over the capturing flag when choosing the label.
+    if node.name:
+        label = f"named '{node.name}'"
+    else:
+        label = "capturing" if node.capturing else "non-capturing"
+
+    if not node.content:
+        return f"an empty {label} group"
+
+    return f"a {label} group containing: {generate_description(node.content)}"
+
+
+def alternation_description(node: Alternation) -> str:
+    """Describe an alternation as its branches joined with " or ".
+
+    Args:
+        node: Alternation node; ``options`` is iterated and each option is
+            passed to ``generate_description``.
+
+    Returns:
+        The branch descriptions joined with " or ". A falsy (empty) branch
+        is rendered as "empty string"; a single branch is returned as is.
+    """
+    branches = [
+        generate_description(option) if option else "empty string"
+        for option in node.options
+    ]
+    # Joining a one-element list yields that element unchanged, so the
+    # single-branch case needs no special handling.
+    return " or ".join(branches)
+
+
+def backreference_description(node: Backreference) -> str:
+    """Describe a backreference to a numbered or named group.
+
+    Args:
+        node: Node whose ``reference`` attribute is an ``int`` for a
+            numbered group; any other value is treated as a group name.
+
+    Returns:
+        An English phrase naming the referenced group.
+    """
+    target = node.reference
+    if not isinstance(target, int):
+        return f"whatever was matched by the group named '{target}'"
+    return f"whatever was matched by capture group {target}"
+
+
+def generate_description(nodes: List[ASTNode]) -> str:
+    """Build one English description for a sequence of AST nodes.
+
+    Each node is described by the helper for its type; a node of no known
+    type falls back to ``str(node)``. Multiple descriptions are joined as
+    "a, b and c".
+
+    Args:
+        nodes: Nodes to describe, in pattern order.
+
+    Returns:
+        "an empty pattern" when ``nodes`` is falsy, the single description
+        when there is exactly one node, otherwise the joined descriptions.
+    """
+    if not nodes:
+        return "an empty pattern"
+
+    def describe_quantified(node):
+        # A quantifier without a (truthy) child gets a generic placeholder.
+        repeated = getattr(node, "child", None)
+        if not repeated:
+            return "an element with a quantifier"
+        return quantifier_description(node, generate_description([repeated]))
+
+    # Checked in this order; the first matching type wins.
+    dispatch = (
+        (Literal, literal_description),
+        (CharacterClass, character_class_description),
+        (Quantifier, describe_quantified),
+        (SpecialSequence, special_sequence_description),
+        (Anchor, anchor_description),
+        (Group, group_description),
+        (Alternation, alternation_description),
+        (Backreference, backreference_description),
+    )
+
+    described = []
+    for node in nodes:
+        for node_type, describe in dispatch:
+            if isinstance(node, node_type):
+                described.append(describe(node))
+                break
+        else:
+            described.append(str(node))
+
+    if not described:
+        # Only reachable for a truthy iterable that yields nothing.
+        return ""
+    if len(described) == 1:
+        return described[0]
+
+    # Commas between the leading items, "and" before the final one.
+    return ", ".join(described[:-1]) + f" and {described[-1]}"
+
+
+def convert_to_english(pattern: str, flavor: str = "pcre") -> str:
+    """Parse a regex pattern and return its English description.
+
+    Args:
+        pattern: Regular expression source text.
+        flavor: Accepted for interface compatibility; not used here.
+
+    Returns:
+        The description produced by ``generate_description``, or
+        "Error parsing pattern: <message>" if parsing or description
+        raises any ``Exception``.
+    """
+    try:
+        description = generate_description(parse_regex(pattern))
+    except Exception as exc:
+        # Best-effort API: failures are reported in the returned text
+        # instead of being raised.
+        return f"Error parsing pattern: {exc!s}"
+    return description
+
+
+def convert_to_english_verbose(pattern: str, flavor: str = "pcre") -> dict:
+    """Parse a regex pattern and return its description plus structure.
+
+    Args:
+        pattern: Regular expression source text.
+        flavor: Echoed back in the result; not otherwise used here.
+
+    Returns:
+        A dict with "pattern", "flavor", "description" and "structure"
+        (one ``node_to_dict`` entry per top-level node). If any
+        ``Exception`` is raised, "description" holds an error message,
+        "structure" is empty and an extra "error" key carries the
+        exception text.
+    """
+    try:
+        ast = parse_regex(pattern)
+        description = generate_description(ast)
+        structure = [node_to_dict(node) for node in ast]
+    except Exception as exc:
+        # Best-effort API: report the failure in the result dict.
+        message = str(exc)
+        return {
+            "pattern": pattern,
+            "flavor": flavor,
+            "description": f"Error parsing pattern: {message}",
+            "structure": [],
+            "error": message,
+        }
+
+    return {
+        "pattern": pattern,
+        "flavor": flavor,
+        "description": description,
+        "structure": structure,
+    }
+
+
+def node_to_dict(node: ASTNode) -> dict:
+    """Convert an AST node, recursively, into a plain dictionary.
+
+    Args:
+        node: The node to convert.
+
+    Returns:
+        A dict with the node's class name under "type", its "position" when
+        the node has that attribute, and the fields specific to its type.
+        Child nodes (quantifier child, group content, alternation options)
+        are converted recursively. A node of no known type yields only
+        "type" and, if present, "position".
+    """
+    info = {"type": type(node).__name__}
+
+    if hasattr(node, "position"):
+        info["position"] = node.position
+
+    if isinstance(node, Literal):
+        info.update(value=node.value, escaped=node.escaped)
+    elif isinstance(node, CharacterClass):
+        info.update(
+            inverted=node.inverted,
+            characters=node.characters,
+            ranges=node.ranges,
+        )
+    elif isinstance(node, Quantifier):
+        info.update(
+            min=node.min,
+            max=node.max,
+            lazy=node.lazy,
+            possessive=node.possessive,
+        )
+        # The child is optional; only a truthy child is serialized.
+        repeated = getattr(node, "child", None)
+        if repeated:
+            info["child"] = node_to_dict(repeated)
+    elif isinstance(node, Group):
+        info.update(capturing=node.capturing, name=node.name)
+        info["content"] = [node_to_dict(item) for item in node.content]
+    elif isinstance(node, Alternation):
+        # Each option is itself a sequence of nodes.
+        info["options"] = [
+            [node_to_dict(item) for item in branch] for branch in node.options
+        ]
+    elif isinstance(node, Anchor):
+        info["kind"] = node.kind
+    elif isinstance(node, SpecialSequence):
+        info["sequence"] = node.sequence
+    elif isinstance(node, Backreference):
+        info["reference"] = node.reference
+
+    return info