"""Generate concrete match examples for regex patterns."""
|
|
|
|
import random
|
|
import re
|
|
import string
|
|
from typing import List, Set
|
|
|
|
from ..parser import (
|
|
Alternation,
|
|
Anchor,
|
|
ASTNode,
|
|
Backreference,
|
|
CharacterClass,
|
|
Group,
|
|
Literal,
|
|
Quantifier,
|
|
SpecialSequence,
|
|
parse_regex,
|
|
)
|
|
|
|
|
|
# Character pools that the example generators sample from.
DIGITS, UPPERCASE, LOWERCASE = (
    string.digits,
    string.ascii_uppercase,
    string.ascii_lowercase,
)
# Space, tab, newline and carriage return.
WHITESPACE = " \t\n\r"
# NOTE: this pool contains "_", which regex engines classify as a word
# character, so it is not a pure "non-word" pool.
PUNCTUATION = "!@#$%^&*()_+-=[]{}|;:,.<>?"
|
|
|
|
|
|
def generate_literal_example(node: Literal) -> str:
    """Return the example text for a literal node.

    Args:
        node: The literal node; only its ``value`` attribute is read.

    Returns:
        ``node.value`` unchanged.
    """
    literal_text = node.value
    return literal_text
|
|
|
|
|
|
def generate_character_class_example(node: CharacterClass) -> str:
    """Pick one candidate at random from a character class.

    Candidates come from ``node.characters`` and from the first ten code
    points of every ``(start, end)`` pair in ``node.ranges``.

    NOTE(review): backslash, ``-`` and ``]`` are returned with a leading
    backslash, and tab/newline/carriage return are returned as the two
    character sequences ``\\t``/``\\n``/``\\r``. That is a display form rather
    than the raw character -- confirm this is what callers expect.

    Args:
        node: The character class node (``characters`` and ``ranges`` are read).

    Returns:
        A randomly chosen candidate, or ``""`` when there are none.
    """
    visible_controls = {"\t": "\\t", "\n": "\\n", "\r": "\\r"}
    candidates = []

    for member in node.characters:
        if member in r"\-\]":
            # Characters that are special inside a class get a backslash prefix.
            candidates.append("\\" + member)
        else:
            candidates.append(visible_controls.get(member, member))

    for low, high in node.ranges:
        first = ord(low)
        # Cap each range at ten candidates so wide ranges stay cheap.
        stop = min(ord(high) + 1, first + 10)
        candidates.extend(chr(code) for code in range(first, stop))

    return random.choice(candidates) if candidates else ""
|
|
|
|
|
|
def generate_special_sequence_example(node: SpecialSequence) -> str:
    """Generate an example for a special sequence.

    Looks up the pool of candidate characters for ``node.sequence`` and draws
    a single character from it. Only the pool that is actually needed is
    sampled, so one call consumes at most one random draw.

    Args:
        node: The special sequence node; only ``node.sequence`` is read.

    Returns:
        One character from the matching pool, ``""`` for the zero-width
        sequences (``\\b``, ``\\B``, ``^``, ``$``), or ``node.sequence`` itself
        when the sequence is not recognised.
    """
    alphanumerics = string.ascii_letters + string.digits
    pools = {
        ".": alphanumerics + "!@#$",
        r"\d": DIGITS,
        r"\D": UPPERCASE + LOWERCASE + PUNCTUATION + WHITESPACE,
        r"\w": alphanumerics + "_",
        # "_" is a word character, so it must never be offered for \W.
        r"\W": PUNCTUATION.replace("_", "") + WHITESPACE,
        r"\s": WHITESPACE,
        r"\S": alphanumerics + PUNCTUATION,
        # These sequences contribute no text to the example.
        r"\b": "",
        r"\B": "",
        r"^": "",
        r"$": "",
    }

    pool = pools.get(node.sequence)
    if pool is None:
        # Unknown sequence: echo it back unchanged.
        return node.sequence
    return random.choice(pool) if pool else ""
|
|
|
|
|
|
def generate_anchor_example(node: Anchor) -> str:
    """Return the example text for an anchor node.

    Args:
        node: The anchor node. It is not inspected.

    Returns:
        Always the empty string.
    """
    return ""
|
|
|
|
|
|
def _pick_repeat_count(node: Quantifier) -> int:
    """Choose a repetition count that satisfies ``node.min`` / ``node.max``."""
    low, high = node.min, node.max
    unbounded = Quantifier.MAX_UNBOUNDED

    if low == 0 and high == 1:
        return random.randint(0, 1)
    if high == unbounded and low in (0, 1):
        # "*" and "+" style quantifiers: keep the examples short.
        return random.randint(low, 4)
    if low == high:
        return low
    if high == unbounded:
        # Open-ended "{n,}": stay within three repetitions of the minimum.
        return random.randint(low, low + 3)
    return random.randint(low, high)


def generate_quantifier_example(node: Quantifier) -> str:
    """Generate an example for a quantifier.

    When the quantified child is one of the character-class escapes
    (``\\d``, ``\\D``, ``\\w``, ``\\W``, ``\\s``, ``\\S``) every repetition is drawn
    independently from that class. For any other child a single example is
    generated and repeated.

    Args:
        node: The quantifier node (``child``, ``min`` and ``max`` are read).

    Returns:
        The example text, or ``"*"`` when the node has no (truthy) child.
    """
    child = getattr(node, "child", None)
    if not child:
        return "*"

    if isinstance(child, SpecialSequence):
        class_pools = {
            r"\d": string.digits,
            r"\D": string.ascii_letters + string.punctuation + string.whitespace,
            r"\w": string.ascii_letters + string.digits + "_",
            # string.punctuation contains "_", which is a word character and
            # therefore must not be produced for \W.
            r"\W": string.punctuation.replace("_", "") + string.whitespace,
            r"\s": string.whitespace,
            r"\S": string.ascii_letters + string.digits + string.punctuation,
        }
        pool = class_pools.get(child.sequence)
        if pool is not None:
            repeats = _pick_repeat_count(node)
            return "".join(random.choice(pool) for _ in range(repeats))

    # Generic child: build one example first, then decide how often to repeat it.
    child_example = generate_node_example(child)
    return child_example * _pick_repeat_count(node)
|
|
|
|
|
|
def generate_group_example(node: Group) -> str:
    """Build the example for a group by concatenating its members' examples.

    Args:
        node: The group node; ``node.content`` is iterated in order.

    Returns:
        The examples of all members joined without a separator.
    """
    pieces = [generate_node_example(member) for member in node.content]
    return "".join(pieces)
|
|
|
|
|
|
def generate_alternation_example(node: Alternation) -> str:
    """Build the example for an alternation from one randomly chosen branch.

    Empty branches are never chosen.

    Args:
        node: The alternation node; ``node.options`` holds the branches, each
            an iterable of child nodes.

    Returns:
        The concatenated examples of the chosen branch, or ``""`` when there
        are no branches or all of them are empty.
    """
    branches = node.options
    if not branches:
        return ""

    usable = list(filter(None, branches))
    if not usable:
        return ""

    parts = []
    for child in random.choice(usable):
        parts.append(generate_node_example(child))
    return "".join(parts)
|
|
|
|
|
|
def generate_backreference_example(node: Backreference) -> str:
    """Return the placeholder text used for a backreference.

    Args:
        node: The backreference node. It is not inspected.

    Returns:
        The fixed placeholder ``"[reference]"``; the referenced group's text
        is not resolved here.
    """
    placeholder = "[reference]"
    return placeholder
|
|
|
|
|
|
def generate_node_example(node: ASTNode) -> str:
    """Dispatch an AST node to the generator for its type.

    Args:
        node: Any AST node.

    Returns:
        The example produced by the first generator whose node type matches,
        or ``""`` when no type matches.
    """
    # Checked top to bottom; the order is kept stable so that a node matching
    # several types is always routed to the same generator.
    dispatch = (
        (Literal, generate_literal_example),
        (CharacterClass, generate_character_class_example),
        (SpecialSequence, generate_special_sequence_example),
        (Anchor, generate_anchor_example),
        (Quantifier, generate_quantifier_example),
        (Group, generate_group_example),
        (Alternation, generate_alternation_example),
        (Backreference, generate_backreference_example),
    )
    for node_type, generator in dispatch:
        if isinstance(node, node_type):
            return generator(node)
    return ""
|
|
|
|
|
|
# Sample inputs searched with the compiled pattern when generation alone does
# not yield enough distinct examples.
_FALLBACK_TEST_STRINGS = (
    "abc123",
    "test@example.com",
    "hello world",
    "123-456-7890",
    "https://example.com",
    "foo bar baz",
    "12345",
    "ABCdef",
    "word1 word2",
    "special!@#chars",
)


def generate_examples(pattern: str, count: int = 5, flavor: str = "pcre") -> List[str]:
    """Generate example strings for the given pattern.

    Examples are first synthesised from the parsed pattern (up to
    ``count * 3`` attempts, skipping empty results). If that yields fewer than
    ``count`` distinct strings, the pattern is compiled with :mod:`re` and
    matches found in a fixed list of sample strings are added.

    Synthesised examples are not checked against the pattern here.

    Args:
        pattern: The regex pattern.
        count: Number of examples to generate.
        flavor: The regex flavor. Currently not used by this function.

    Returns:
        Up to ``count`` distinct examples, in the order they were produced.
        An empty list is returned if anything fails along the way.
    """
    try:
        ast = parse_regex(pattern)
        # A dict keeps insertion order, so a seeded RNG gives a reproducible
        # result order (a set would not).
        examples = {}

        for _ in range(count * 3):
            if len(examples) >= count:
                break
            example = "".join(generate_node_example(node) for node in ast)
            if example:
                examples[example] = None

        if len(examples) < count:
            try:
                compiled = re.compile(pattern)
            except re.error:
                # The pattern may be valid in another flavor but not in
                # Python's ``re``; the fallback is simply skipped then.
                compiled = None

            if compiled is not None:
                for test_str in _FALLBACK_TEST_STRINGS:
                    if len(examples) >= count:
                        break
                    match = compiled.search(test_str)
                    if match:
                        examples[match.group(0)] = None

        return list(examples)[:count]
    except Exception:
        # Best-effort API: any parser or generator failure yields no examples.
        return []
|
|
|
|
|
|
def generate_match_examples(
    pattern: str, test_string: str, count: int = 5, flavor: str = "pcre"
) -> List[str]:
    """Generate examples from a test string that match the pattern.

    Matches are taken from ``finditer`` and reported as the full matched
    text, so patterns containing capture groups still yield real substrings
    of ``test_string`` (``findall`` would return group contents instead).

    Args:
        pattern: The regex pattern (compiled with Python's :mod:`re`).
        test_string: The string to search for matches.
        count: Maximum number of examples to return. Values ``<= 0`` yield
            an empty list.
        flavor: The regex flavor. Currently not used by this function.

    Returns:
        Up to ``count`` distinct matching substrings in order of first
        occurrence, or an empty list if the pattern does not compile.
    """
    if count <= 0:
        return []

    try:
        compiled = re.compile(pattern)
    except re.error:
        return []

    unique_matches: List[str] = []
    seen = set()

    # finditer is lazy, so scanning stops as soon as enough matches are found.
    for match in compiled.finditer(test_string):
        match_str = match.group(0)
        if match_str in seen:
            continue
        seen.add(match_str)
        unique_matches.append(match_str)
        if len(unique_matches) >= count:
            break

    return unique_matches
|