"""Turn a parsed regular-expression AST into a plain-English description."""

from typing import List, Optional

from ..parser import (
    Alternation,
    Anchor,
    ASTNode,
    Backreference,
    CharacterClass,
    Group,
    Literal,
    Quantifier,
    SpecialSequence,
    parse_regex,
)

# Single characters that literal_description reports as "the literal character".
# A set (not a string) so that only exact one-character values match; a string
# would also "contain" the empty string and multi-character substrings.
_REGEX_METACHARACTERS = frozenset(r".^$*+?{}[]\|()")

# Friendly names for whitespace when it appears as a stand-alone literal.
_LITERAL_WHITESPACE_NAMES = {
    " ": "a space",
    "\t": "a tab character",
    "\n": "a newline character",
    "\r": "a carriage return",
}

# Friendly names for whitespace when it appears inside a character class.
_CLASS_WHITESPACE_NAMES = {
    " ": "a space",
    "\t": "a tab",
    "\n": "a newline",
}


def _times(count) -> str:
    """Return ``"<count> time"`` or ``"<count> times"`` depending on *count*."""
    return f"{count} time{'s' if count != 1 else ''}"


def quantifier_description(quantifier: Quantifier, child_desc: str) -> str:
    """Describe how often the element described by *child_desc* repeats.

    Args:
        quantifier: Node supplying ``min``, ``max``, ``lazy`` and
            ``possessive``. ``max`` is compared against
            ``Quantifier.MAX_UNBOUNDED`` to detect open-ended repetition.
        child_desc: English description of the quantified element.

    Returns:
        ``"<child_desc>, <repetition phrase>"``, with ``" (lazy)"`` or
        ``" (possessive)"`` appended when the matching flag is set.
    """
    low, high = quantifier.min, quantifier.max
    unbounded = high == Quantifier.MAX_UNBOUNDED

    # Order matters: the common shorthand forms (?, *, +) are checked before
    # the generic "exactly" / "at least" / "between" phrasing.
    if low == 0 and high == 1:
        base = "optionally"
    elif low == 0 and unbounded:
        base = "zero or more times"
    elif low == 1 and unbounded:
        base = "one or more times"
    elif low == high:
        base = f"exactly {_times(low)}"
    elif unbounded:
        base = f"at least {_times(low)}"
    else:
        base = f"between {low} and {high} times"

    # Lazy takes precedence if both flags happen to be set.
    if quantifier.lazy:
        base += " (lazy)"
    elif quantifier.possessive:
        base += " (possessive)"

    return f"{child_desc}, {base}"


def literal_description(node: Literal) -> str:
    """Describe a literal node based on its ``value``.

    Whitespace gets a friendly name, a single regex metacharacter is called
    "the literal character", alphabetic values "the letter", decimal digits
    "the digit", and anything else "the character".
    """
    value = node.value

    whitespace_name = _LITERAL_WHITESPACE_NAMES.get(value)
    if whitespace_name is not None:
        return whitespace_name

    if value in _REGEX_METACHARACTERS:
        return f"the literal character '{value}'"
    if value.isalpha():
        return f"the letter '{value}'"
    if value.isdecimal():
        return f"the digit '{value}'"
    # Punctuation, symbols, and the empty string are not letters.
    return f"the character '{value}'"


def character_class_description(node: CharacterClass) -> str:
    """Describe a character class from its ``characters`` and ``ranges``.

    Returns ``"an empty character class"`` when the node lists neither
    characters nor ranges; otherwise ``"any of ..."`` (or
    ``"any character except ..."`` when ``node.inverted`` is set) followed
    by the members joined with commas and a final "or".
    """
    members = [
        _CLASS_WHITESPACE_NAMES.get(char, f"'{char}'")
        for char in node.characters
    ]
    members.extend(
        f"characters from '{start}' to '{end}'" for start, end in node.ranges
    )

    if not members:
        return "an empty character class"

    if len(members) == 1:
        content_str = members[0]
    elif len(members) == 2:
        content_str = f"{members[0]} or {members[1]}"
    else:
        content_str = ", ".join(members[:-1]) + f", or {members[-1]}"

    prefix = "any character except" if node.inverted else "any of"
    return f"{prefix} {content_str}"


def special_sequence_description(node: SpecialSequence) -> str:
    """Describe a special sequence such as ``\\d`` or ``.``.

    Sequences not in the lookup table are returned unchanged.
    """
    sequences = {
        ".": "any single character",
        r"\d": "a digit (0-9)",
        r"\D": "any non-digit character",
        r"\w": "a word character (letter, digit, or underscore)",
        r"\W": "any non-word character",
        r"\s": "any whitespace character",
        r"\S": "any non-whitespace character",
        r"\b": "a word boundary",
        r"\B": "a non-word boundary",
        r"\A": "the start of the string",
        r"\Z": "the end of the string",
        r"\z": "the absolute end of the string",
        r"^": "the start of the string",
        r"$": "the end of the string",
    }

    return sequences.get(node.sequence, node.sequence)


def anchor_description(node: Anchor) -> str:
    """Describe an anchor by its ``kind``; unknown kinds are returned as-is."""
    anchors = {
        "^": "the start of the string",
        "$": "the end of the string",
        r"\b": "a word boundary",
        r"\B": "a position that is not a word boundary",
    }
    return anchors.get(node.kind, node.kind)


def group_description(node: Group) -> str:
    """Describe a group and, recursively, its content.

    A group with a name is reported as named; otherwise it is reported as
    non-capturing or capturing according to ``node.capturing``.
    """
    if node.name:
        name_desc = f"named '{node.name}'"
    elif not node.capturing:
        name_desc = "non-capturing"
    else:
        name_desc = "capturing"

    if not node.content:
        return f"an empty {name_desc} group"

    inner_desc = generate_description(node.content)
    return f"a {name_desc} group containing: {inner_desc}"


def alternation_description(node: Alternation) -> str:
    """Describe an alternation as its options joined by " or ".

    An empty option is rendered as ``"empty string"``.
    """
    option_descs = [
        generate_description(option) if option else "empty string"
        for option in node.options
    ]
    # With exactly one option the join simply returns that option's text.
    return " or ".join(option_descs)


def backreference_description(node: Backreference) -> str:
    """Describe a backreference to a numbered or a named group."""
    if isinstance(node.reference, int):
        return f"whatever was matched by capture group {node.reference}"
    return f"whatever was matched by the group named '{node.reference}'"


def _quantified_node_description(node: Quantifier) -> str:
    """Describe a quantifier node together with the element it repeats."""
    child = getattr(node, "child", None)
    if not child:
        return "an element with a quantifier"
    return quantifier_description(node, generate_description([child]))


# Checked in order with isinstance, so the first matching entry wins; the
# order is significant if any node class derives from another.
_NODE_DESCRIBERS = (
    (Literal, literal_description),
    (CharacterClass, character_class_description),
    (Quantifier, _quantified_node_description),
    (SpecialSequence, special_sequence_description),
    (Anchor, anchor_description),
    (Group, group_description),
    (Alternation, alternation_description),
    (Backreference, backreference_description),
)


def _describe_node(node: ASTNode) -> str:
    """Describe one AST node, falling back to ``str(node)`` for unknown types."""
    for node_type, describe in _NODE_DESCRIBERS:
        if isinstance(node, node_type):
            return describe(node)
    return str(node)


def generate_description(nodes: List[ASTNode]) -> str:
    """Describe a sequence of AST nodes as a single English phrase.

    Returns ``"an empty pattern"`` for an empty sequence. Otherwise the
    per-node descriptions are joined with ", " and the last one is attached
    with " and " (for example ``"a, b and c"``).
    """
    if not nodes:
        return "an empty pattern"

    parts = [_describe_node(node) for node in nodes]
    if len(parts) == 1:
        return parts[0]

    return f"{', '.join(parts[:-1])} and {parts[-1]}"


def convert_to_english(pattern: str, flavor: str = "pcre") -> str:
    """Parse *pattern* and return its English description.

    Args:
        pattern: The regular expression source text.
        flavor: Accepted for interface compatibility; it is not forwarded to
            ``parse_regex`` here.

    Returns:
        The description, or ``"Error parsing pattern: <message>"`` if parsing
        or description raises. This function never propagates an exception.
    """
    try:
        ast = parse_regex(pattern)
        return generate_description(ast)
    except Exception as e:  # deliberate catch-all at the public boundary
        return f"Error parsing pattern: {str(e)}"


def convert_to_english_verbose(pattern: str, flavor: str = "pcre") -> dict:
    """Parse *pattern* and return its description plus a structural dump.

    Args:
        pattern: The regular expression source text.
        flavor: Echoed back in the result; it is not forwarded to
            ``parse_regex`` here.

    Returns:
        A dict with keys ``"pattern"``, ``"flavor"``, ``"description"`` and
        ``"structure"`` (one ``node_to_dict`` entry per top-level node). If
        anything raises, ``"description"`` carries the error text,
        ``"structure"`` is empty and an extra ``"error"`` key holds the
        message.
    """
    try:
        ast = parse_regex(pattern)
        return {
            "pattern": pattern,
            "flavor": flavor,
            "description": generate_description(ast),
            "structure": [node_to_dict(node) for node in ast],
        }
    except Exception as e:  # deliberate catch-all at the public boundary
        return {
            "pattern": pattern,
            "flavor": flavor,
            "description": f"Error parsing pattern: {str(e)}",
            "structure": [],
            "error": str(e),
        }


def node_to_dict(node: ASTNode) -> dict:
    """Convert an AST node (and its descendants) into a plain dict.

    The result always has a ``"type"`` key holding the node's class name and,
    when the node has a ``position`` attribute, a ``"position"`` key. The
    remaining keys depend on the node type.
    """
    result: dict = {"type": type(node).__name__}

    if hasattr(node, "position"):
        result["position"] = node.position

    if isinstance(node, Literal):
        result["value"] = node.value
        result["escaped"] = node.escaped
    elif isinstance(node, CharacterClass):
        result["inverted"] = node.inverted
        result["characters"] = node.characters
        result["ranges"] = node.ranges
    elif isinstance(node, Quantifier):
        result["min"] = node.min
        result["max"] = node.max
        result["lazy"] = node.lazy
        result["possessive"] = node.possessive
        child = getattr(node, "child", None)
        if child:
            result["child"] = node_to_dict(child)
    elif isinstance(node, Group):
        result["capturing"] = node.capturing
        result["name"] = node.name
        # The description code treats falsy content as an empty group, so
        # tolerate None here instead of failing on iteration.
        result["content"] = [node_to_dict(child) for child in node.content or []]
    elif isinstance(node, Alternation):
        # Same tolerance for a falsy (empty) option.
        result["options"] = [
            [node_to_dict(child) for child in option or []]
            for option in node.options
        ]
    elif isinstance(node, Anchor):
        result["kind"] = node.kind
    elif isinstance(node, SpecialSequence):
        result["sequence"] = node.sequence
    elif isinstance(node, Backreference):
        result["reference"] = node.reference

    return result