Add converter, examples, and flavors modules
This commit is contained in:
270
regex_humanizer/converter/converter.py
Normal file
270
regex_humanizer/converter/converter.py
Normal file
@@ -0,0 +1,270 @@
|
|||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from ..parser import (
|
||||||
|
Alternation,
|
||||||
|
Anchor,
|
||||||
|
ASTNode,
|
||||||
|
Backreference,
|
||||||
|
CharacterClass,
|
||||||
|
Group,
|
||||||
|
Literal,
|
||||||
|
Quantifier,
|
||||||
|
SpecialSequence,
|
||||||
|
parse_regex,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def quantifier_description(quantifier: Quantifier, child_desc: str) -> str:
    """Render a quantifier as an English phrase appended to its child's description.

    Returns a string of the form "<child description>, <repetition phrase>",
    e.g. "a digit (0-9), one or more times".
    """
    lo, hi = quantifier.min, quantifier.max
    unbounded = Quantifier.MAX_UNBOUNDED

    if (lo, hi) == (0, 1):
        phrase = "optionally"
    elif lo == 0 and hi == unbounded:
        phrase = "zero or more times"
    elif lo == 1 and hi == unbounded:
        phrase = "one or more times"
    elif lo == hi:
        phrase = f"exactly {lo} time{'s' if lo != 1 else ''}"
    elif hi == unbounded:
        phrase = f"at least {lo} time{'s' if lo != 1 else ''}"
    else:
        phrase = f"between {lo} and {hi} times"

    # elif: if both flags were somehow set, "lazy" wins, matching the
    # original precedence.
    if quantifier.lazy:
        phrase += " (lazy)"
    elif quantifier.possessive:
        phrase += " (possessive)"

    return f"{child_desc}, {phrase}"
|
||||||
|
|
||||||
|
|
||||||
|
def literal_description(node: Literal) -> str:
    """Describe a single literal character in plain English.

    Whitespace characters and regex metacharacters get dedicated wording;
    remaining characters are labelled as a letter, a digit, or a generic
    character.  (Previously every non-special character — including digits
    and punctuation — was mislabelled as "the letter 'X'".)
    """
    if node.value == " ":
        return "a space"
    elif node.value == "\t":
        return "a tab character"
    elif node.value == "\n":
        return "a newline character"
    elif node.value == "\r":
        return "a carriage return"
    elif node.value in r".^$*+?{}[]\|()":
        # Metacharacters that reached us as literals were escaped in the pattern.
        return f"the literal character '{node.value}'"
    elif node.value.isalpha():
        return f"the letter '{node.value}'"
    elif node.value.isdigit():
        return f"the digit '{node.value}'"
    else:
        return f"the character '{node.value}'"
|
||||||
|
|
||||||
|
|
||||||
|
def character_class_description(node: CharacterClass) -> str:
    """Describe a character class such as ``[a-z0-9_]`` in plain English.

    Handles inversion (``[^...]``), individual characters, and character
    ranges; the items are joined with commas and a final "or".  The
    original implementation had an ``elif char in "-]"`` branch that was
    byte-identical to the ``else`` branch — collapsed here.
    """
    prefix = "any character except" if node.inverted else "any of"

    # Whitespace members get a readable name; everything else is quoted.
    named = {" ": "a space", "\t": "a tab", "\n": "a newline"}
    content_parts = [named.get(char, f"'{char}'") for char in node.characters]

    for start, end in node.ranges:
        content_parts.append(f"characters from '{start}' to '{end}'")

    if not content_parts:
        return "an empty character class"

    if len(content_parts) == 1:
        content_str = content_parts[0]
    elif len(content_parts) == 2:
        content_str = f"{content_parts[0]} or {content_parts[1]}"
    else:
        # Oxford-comma list: "a, b, or c".
        content_str = ", ".join(content_parts[:-1]) + f", or {content_parts[-1]}"

    return f"{prefix} {content_str}"
|
||||||
|
|
||||||
|
|
||||||
|
def special_sequence_description(node: SpecialSequence) -> str:
    """Translate a special-sequence token (e.g. ``\\d``) into English.

    Unknown sequences fall back to the raw sequence text unchanged.
    """
    meanings = {
        ".": "any single character",
        r"\d": "a digit (0-9)",
        r"\D": "any non-digit character",
        r"\w": "a word character (letter, digit, or underscore)",
        r"\W": "any non-word character",
        r"\s": "any whitespace character",
        r"\S": "any non-whitespace character",
        r"\b": "a word boundary",
        r"\B": "a non-word boundary",
        r"\A": "the start of the string",
        r"\Z": "the end of the string",
        r"\z": "the absolute end of the string",
        "^": "the start of the string",
        "$": "the end of the string",
    }
    try:
        return meanings[node.sequence]
    except KeyError:
        return node.sequence
|
||||||
|
|
||||||
|
|
||||||
|
def anchor_description(node: Anchor) -> str:
    """Explain an anchor token; unknown kinds fall back to the raw text."""
    if node.kind == "^":
        return "the start of the string"
    if node.kind == "$":
        return "the end of the string"
    if node.kind == r"\b":
        return "a word boundary"
    if node.kind == r"\B":
        return "a position that is not a word boundary"
    return node.kind
|
||||||
|
|
||||||
|
|
||||||
|
def group_description(node: Group) -> str:
    """Describe a group node: its capturing behaviour and its contents."""
    if node.name:
        kind = f"named '{node.name}'"
    else:
        kind = "capturing" if node.capturing else "non-capturing"

    # An empty group has no inner description to recurse into.
    if not node.content:
        return f"an empty {kind} group"
    return f"a {kind} group containing: {generate_description(node.content)}"
|
||||||
|
|
||||||
|
|
||||||
|
def alternation_description(node: Alternation) -> str:
    """Join the descriptions of each alternative with "or".

    An empty alternative branch is rendered as "empty string".  A single
    alternative yields just its own description (joining a one-element
    list is that element).
    """
    option_descs = [
        generate_description(option) if option else "empty string"
        for option in node.options
    ]
    return " or ".join(option_descs)
|
||||||
|
|
||||||
|
|
||||||
|
def backreference_description(node: Backreference) -> str:
    """Describe a backreference, by group number or by group name."""
    target = (
        f"capture group {node.reference}"
        if isinstance(node.reference, int)
        else f"the group named '{node.reference}'"
    )
    return f"whatever was matched by {target}"
|
||||||
|
|
||||||
|
|
||||||
|
def generate_description(nodes: List[ASTNode]) -> str:
    """Turn a sequence of AST nodes into one English sentence.

    Each node is described individually, then the pieces are joined with
    commas, with "and" before the final piece.  An unrecognised node type
    falls back to its ``str()`` representation.
    """
    if not nodes:
        return "an empty pattern"

    parts = []
    for node in nodes:
        if isinstance(node, Literal):
            desc = literal_description(node)
        elif isinstance(node, CharacterClass):
            desc = character_class_description(node)
        elif isinstance(node, Quantifier):
            # A quantifier may or may not carry its quantified child.
            child = getattr(node, "child", None)
            if child:
                desc = quantifier_description(node, generate_description([child]))
            else:
                desc = "an element with a quantifier"
        elif isinstance(node, SpecialSequence):
            desc = special_sequence_description(node)
        elif isinstance(node, Anchor):
            desc = anchor_description(node)
        elif isinstance(node, Group):
            desc = group_description(node)
        elif isinstance(node, Alternation):
            desc = alternation_description(node)
        elif isinstance(node, Backreference):
            desc = backreference_description(node)
        else:
            desc = str(node)
        parts.append(desc)

    if len(parts) == 1:
        return parts[0]
    # "a, b and c": comma-separate all but the last, then attach it with "and".
    return ", ".join(parts[:-1]) + f" and {parts[-1]}"
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_english(pattern: str, flavor: str = "pcre") -> str:
    """Parse *pattern* and return a plain-English description of it.

    *flavor* is accepted for API symmetry but is not used by this
    implementation.  Any parsing failure is reported as an error string
    rather than raised to the caller.
    """
    try:
        return generate_description(parse_regex(pattern))
    except Exception as e:
        return f"Error parsing pattern: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
def convert_to_english_verbose(pattern: str, flavor: str = "pcre") -> dict:
    """Parse *pattern* and return a detailed result dictionary.

    On success the dict carries the pattern, flavor, the English
    description, and a per-node "structure" list.  On any failure it
    instead carries the error message (in both "description" and
    "error") with an empty "structure".
    """
    try:
        ast = parse_regex(pattern)
        return {
            "pattern": pattern,
            "flavor": flavor,
            "description": generate_description(ast),
            "structure": [node_to_dict(node) for node in ast],
        }
    except Exception as e:
        return {
            "pattern": pattern,
            "flavor": flavor,
            "description": f"Error parsing pattern: {str(e)}",
            "structure": [],
            "error": str(e),
        }
|
||||||
|
|
||||||
|
|
||||||
|
def node_to_dict(node: ASTNode) -> dict:
    """Serialize an AST node (and its children, recursively) to a plain dict.

    Always records the node's type name; records "position" only when the
    node exposes that attribute, then adds type-specific fields.
    """
    info = {"type": type(node).__name__}

    # Sentinel distinguishes "attribute absent" from "attribute is None".
    _missing = object()
    position = getattr(node, "position", _missing)
    if position is not _missing:
        info["position"] = position

    if isinstance(node, Literal):
        info.update(value=node.value, escaped=node.escaped)
    elif isinstance(node, CharacterClass):
        info.update(
            inverted=node.inverted,
            characters=node.characters,
            ranges=node.ranges,
        )
    elif isinstance(node, Quantifier):
        info.update(
            min=node.min,
            max=node.max,
            lazy=node.lazy,
            possessive=node.possessive,
        )
        child = getattr(node, "child", None)
        if child:
            info["child"] = node_to_dict(child)
    elif isinstance(node, Group):
        info.update(
            capturing=node.capturing,
            name=node.name,
            content=[node_to_dict(c) for c in node.content],
        )
    elif isinstance(node, Alternation):
        info["options"] = [
            [node_to_dict(c) for c in option] for option in node.options
        ]
    elif isinstance(node, Anchor):
        info["kind"] = node.kind
    elif isinstance(node, SpecialSequence):
        info["sequence"] = node.sequence
    elif isinstance(node, Backreference):
        info["reference"] = node.reference

    return info
|
||||||
Reference in New Issue
Block a user