commit 52e792305bcf25c35445bde2580f5c9c5d136130 Author: CI Bot Date: Fri Feb 6 03:02:57 2026 +0000 feat: initial commit for regex-humanizer-cli - Add regex parser, translator, and test generator - Add CLI with explain, test, interactive commands - Add multi-flavor support (PCRE, JavaScript, Python) - Add Gitea Actions CI workflow - Add comprehensive README documentation diff --git a/.gitea/workflows/regex-humanizer-cli.yml b/.gitea/workflows/regex-humanizer-cli.yml new file mode 100644 index 0000000..18260c4 --- /dev/null +++ b/.gitea/workflows/regex-humanizer-cli.yml @@ -0,0 +1,38 @@ +name: CI + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install -e . + python -m pip install pytest pytest-cov ruff + + - name: Run tests + run: python -m pytest tests/ -v --tb=short + + - name: Run linting + run: python -m ruff check regex_humanizer/ + + - name: Run type checking + run: python -m pip install mypy && python -m mypy regex_humanizer/ --ignore-missing-imports diff --git a/README.md b/README.md new file mode 100644 index 0000000..7c3b249 --- /dev/null +++ b/README.md @@ -0,0 +1,171 @@ +# Regex Humanizer CLI + +A CLI tool that converts complex regex patterns to human-readable English descriptions and generates comprehensive test cases. 
+ +## Features + +- **Regex to English Translation**: Convert any regex pattern to plain English +- **Test Case Generation**: Auto-generate matching and non-matching test inputs +- **Multi-Flavor Support**: Supports PCRE, JavaScript, and Python regex flavors +- **Interactive Mode**: REPL-style interface for exploring regex patterns +- **Pattern Validation**: Validate regex patterns for different flavors +- **Flavor Conversion**: Convert patterns between different regex flavors + +## Installation + +```bash +pip install regex-humanizer-cli +``` + +Or from source: + +```bash +pip install -e . +``` + +## Quick Start + +### Explain a regex pattern + +```bash +regex-humanizer explain "^\d{3}-\d{4}$" +``` + +Output: +``` +Pattern: ^\d{3}-\d{4}$ +Flavor: pcre + +English Explanation: +-------------------------------------------------- +at the start of line or stringany digit (0-9)any digit (0-9)any digit (0-9)hyphenany digit (0-9)any digit (0-9)any digit (0-9)any digit (0-9)at the end of line or string +``` + +### Generate test cases + +```bash +regex-humanizer test "^[a-z]+$" +``` + +Output: +``` +Pattern: ^[a-z]+$ +Flavor: pcre + +Matching strings (should match the pattern): +-------------------------------------------------- + 1. abc + 2. hello + 3. world + +Non-matching strings (should NOT match the pattern): +-------------------------------------------------- + 1. 123 + 2. Hello + 3. 
test123 +``` + +### Interactive mode + +```bash +regex-humanizer interactive +``` + +## Commands + +### explain + +Explain a regex pattern in human-readable English: + +```bash +regex-humanizer explain "PATTERN" [OPTIONS] +``` + +Options: +- `--output, -o`: Output format (text/json, default: text) +- `--verbose, -v`: Show detailed breakdown +- `--flavor, -f`: Regex flavor (pcre/javascript/python) + +### test + +Generate test cases for a regex pattern: + +```bash +regex-humanizer test "PATTERN" [OPTIONS] +``` + +Options: +- `--output, -o`: Output format (text/json, default: text) +- `--count, -n`: Number of test cases (default: 5) + +### interactive + +Start an interactive REPL for exploring regex patterns: + +```bash +regex-humanizer interactive [OPTIONS] +``` + +Options: +- `--flavor, -f`: Default regex flavor + +### flavors + +List available regex flavors: + +```bash +regex-humanizer flavors +``` + +### validate + +Validate a regex pattern: + +```bash +regex-humanizer validate "PATTERN" [OPTIONS] +``` + +Options: +- `--flavor, -f`: Specific flavor to validate against + +### convert + +Convert a regex pattern between flavors: + +```bash +regex-humanizer convert "PATTERN" --from-flavor pcre --to-flavor javascript +``` + +## Flavor Support + +| Feature | PCRE | JavaScript | Python | +|---------|------|------------|--------| +| Lookahead | ✅ | ✅ | ✅ | +| Lookbehind | ✅ | ⚠️ Limited | ✅ | +| Named Groups | ✅ | ✅ | ✅ | +| Possessive Quantifiers | ✅ | ❌ | ❌ | +| Atomic Groups | ✅ | ❌ | ❌ | + +## Configuration + +No configuration file required. All options can be passed via command line. 
+ +## Development + +```bash +# Install development dependencies +pip install -e ".[dev]" + +# Run tests +pytest tests/ -v + +# Run linting +ruff check regex_humanizer/ + +# Run type checking +mypy regex_humanizer/ --ignore-missing-imports +``` + +## License + +MIT License diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f4bb5a6 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "regex-humanizer-cli" +version = "1.0.0" +description = "A CLI tool that converts complex regex patterns to human-readable English descriptions and generates comprehensive test cases" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.9" +authors = [ + {name = "Regex Humanizer Contributors"} +] +keywords = ["regex", "regular-expression", "cli", "humanizer", "testing"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "click>=8.0", + "regex>=2023.0", + "parsimonious>=0.10.0", + "pygments>=2.15", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0", + "pytest-cov>=4.0", + "black>=23.0", + "ruff>=0.1.0", +] + +[project.scripts] +regex-humanizer = "regex_humanizer.cli:main" + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = "-v --tb=short" + +[tool.black] +line-length = 100 +target-version = ['py39'] + +[tool.ruff] +line-length = 100 +target-version = "py39" + +[tool.setuptools.packages.find] +where = ["."] +include = ["regex_humanizer*"] diff --git a/regex_humanizer/__init__.py b/regex_humanizer/__init__.py new file mode 
100644 index 0000000..fd1c27a --- /dev/null +++ b/regex_humanizer/__init__.py @@ -0,0 +1,3 @@ +"""Regex Humanizer CLI - Convert regex patterns to human-readable English.""" + +__version__ = "1.0.0" diff --git a/regex_humanizer/cli.py b/regex_humanizer/cli.py new file mode 100644 index 0000000..54a054f --- /dev/null +++ b/regex_humanizer/cli.py @@ -0,0 +1,280 @@ +"""Command-line interface for Regex Humanizer.""" + +import json +import sys + +import click +from .parser import parse_regex +from .translator import translate_regex +from .test_generator import generate_test_cases +from .flavors import get_flavor_manager +from .interactive import start_interactive_mode + + +@click.group() +@click.option( + "--flavor", + type=click.Choice(["pcre", "javascript", "python"]), + default="pcre", + help="Regex flavor to use", +) +@click.pass_context +def main(ctx: click.Context, flavor: str): + """Regex Humanizer CLI - Convert regex patterns to human-readable English and generate test cases.""" + ctx.ensure_object(dict) + ctx.obj["flavor"] = flavor + + +@main.command("explain") +@click.argument("pattern", type=str) +@click.option( + "--output", + "-o", + type=click.Choice(["text", "json"]), + default="text", + help="Output format", +) +@click.option( + "--verbose", + "-v", + is_flag=True, + help="Show detailed breakdown", +) +@click.option( + "--flavor", + "-f", + type=click.Choice(["pcre", "javascript", "python"]), + default=None, + help="Regex flavor to use", +) +@click.pass_context +def explain(ctx: click.Context, pattern: str, output: str, verbose: bool, flavor: str): + """Explain a regex pattern in human-readable English.""" + if ctx.obj is None: + ctx.obj = {} + flavor = flavor or ctx.obj.get("flavor", "pcre") + + try: + ast = parse_regex(pattern, flavor) + translation = translate_regex(pattern, flavor) + + if output == "json": + result = { + "pattern": pattern, + "flavor": flavor, + "explanation": translation, + "verbose": { + "node_count": len(get_all_nodes(ast)), + 
"features": identify_features(ast), + } if verbose else None, + } + click.echo(json.dumps(result, indent=2)) + else: + click.echo(f"\nPattern: {pattern}") + click.echo(f"Flavor: {flavor}") + click.echo("\nEnglish Explanation:") + click.echo("-" * 50) + click.echo(translation) + click.echo() + + if verbose: + features = identify_features(ast) + click.echo("\nFeatures detected:") + for feature in features: + click.echo(f" - {feature}") + + except Exception as e: + click.echo(f"Error: {e}", err=True) + sys.exit(1) + + +@main.command("test") +@click.argument("pattern", type=str) +@click.option( + "--output", + "-o", + type=click.Choice(["text", "json"]), + default="text", + help="Output format", +) +@click.option( + "--count", + "-n", + type=int, + default=5, + help="Number of test cases to generate", +) +@click.pass_context +def test(ctx: click.Context, pattern: str, output: str, count: int): + """Generate test cases (matching and non-matching) for a regex pattern.""" + if ctx.obj is None: + ctx.obj = {} + flavor = ctx.obj.get("flavor", "pcre") + + try: + result = generate_test_cases( + pattern, + flavor, + matching_count=count, + non_matching_count=count + ) + + if output == "json": + click.echo(json.dumps(result, indent=2)) + else: + click.echo(f"\nPattern: {pattern}") + click.echo(f"Flavor: {flavor}") + click.echo("\nMatching strings (should match the pattern):") + click.echo("-" * 50) + for i, s in enumerate(result["matching"], 1): + click.echo(f" {i}. {s}") + + click.echo("\nNon-matching strings (should NOT match the pattern):") + click.echo("-" * 50) + for i, s in enumerate(result["non_matching"], 1): + click.echo(f" {i}. 
{s}") + + click.echo() + + except Exception as e: + click.echo(f"Error: {e}", err=True) + sys.exit(1) + + +@main.command("interactive") +@click.option( + "--flavor", + "-f", + type=click.Choice(["pcre", "javascript", "python"]), + default="pcre", + help="Regex flavor to use", +) +@click.pass_context +def interactive(ctx: click.Context, flavor: str): + """Start an interactive REPL for exploring regex patterns.""" + start_interactive_mode(flavor=flavor) + + +@main.command("flavors") +@click.pass_context +def flavors(ctx: click.Context): + """List available regex flavors.""" + manager = get_flavor_manager() + flavor_list = manager.list_flavors() + + click.echo("\nAvailable Regex Flavors:") + click.echo("-" * 50) + for name, desc in flavor_list: + click.echo(f"\n {name}:") + click.echo(f" {desc}") + click.echo() + + +@main.command("validate") +@click.argument("pattern", type=str) +@click.option( + "--flavor", + "-f", + type=click.Choice(["pcre", "javascript", "python"]), + default=None, + help="Specific flavor to validate against", +) +@click.pass_context +def validate(ctx: click.Context, pattern: str, flavor: str): + """Validate a regex pattern.""" + if ctx.obj is None: + ctx.obj = {} + check_flavor = flavor or ctx.obj.get("flavor", "pcre") + + try: + ast = parse_regex(pattern, check_flavor) + click.echo(f"\nPattern: {pattern}") + click.echo(f"Flavor: {check_flavor}") + click.echo("\nValidation: PASSED") + click.echo(f"AST node count: {len(get_all_nodes(ast))}") + except Exception as e: + click.echo(f"\nPattern: {pattern}") + click.echo("Validation: FAILED") + click.echo(f"Error: {e}") + sys.exit(1) + + +@main.command("convert") +@click.argument("pattern", type=str) +@click.option( + "--from-flavor", + "-s", + type=click.Choice(["pcre", "javascript", "python"]), + default="pcre", + help="Source flavor", +) +@click.option( + "--to-flavor", + "-t", + type=click.Choice(["pcre", "javascript", "python"]), + default="javascript", + help="Target flavor", +) 
+@click.pass_context +def convert(ctx: click.Context, pattern: str, from_flavor: str, to_flavor: str): + """Convert a regex pattern between flavors.""" + manager = get_flavor_manager() + converted, warnings = manager.convert(pattern, from_flavor, to_flavor) + + click.echo(f"\nOriginal ({from_flavor}): {pattern}") + click.echo(f"Converted ({to_flavor}): {converted}") + + if warnings: + click.echo("\nWarnings:") + for warning in warnings: + click.echo(f" - {warning}") + + +def get_all_nodes(ast) -> list: + """Get all nodes from AST.""" + nodes = [ast] + for child in getattr(ast, 'children', []): + nodes.extend(get_all_nodes(child)) + return nodes + + +def identify_features(ast) -> list[str]: + """Identify features in a regex pattern.""" + features = [] + nodes = get_all_nodes(ast) + + node_types = set(n.node_type.name for n in nodes) + + if "LOOKAHEAD" in node_types or "NEGATIVE_LOOKAHEAD" in node_types: + features.append("Lookahead assertions") + if "LOOKBEHIND" in node_types or "NEGATIVE_LOOKBEHIND" in node_types: + features.append("Lookbehind assertions") + if "NAMED_GROUP" in node_types: + features.append("Named groups") + if "CAPTURING_GROUP" in node_types: + features.append("Capturing groups") + if "NON_CAPTURING_GROUP" in node_types: + features.append("Non-capturing groups") + if "QUANTIFIER" in node_types: + features.append("Quantifiers") + for n in nodes: + if n.node_type.name == "QUANTIFIER" and n.is_lazy: + features.append("Lazy quantifiers") + break + if n.node_type.name == "QUANTIFIER" and n.is_possessive: + features.append("Possessive quantifiers") + break + if "POSITIVE_SET" in node_types or "NEGATIVE_SET" in node_types: + features.append("Character classes") + if "ANCHOR_START" in node_types or "ANCHOR_END" in node_types: + features.append("Anchors") + if "DIGIT" in node_types or "WORD_CHAR" in node_types or "WHITESPACE" in node_types: + features.append("Shorthand character classes") + if "BACKREFERENCE" in node_types: + 
features.append("Backreferences") + + return features + + +if __name__ == "__main__": + main() diff --git a/regex_humanizer/flavors.py b/regex_humanizer/flavors.py new file mode 100644 index 0000000..73e9699 --- /dev/null +++ b/regex_humanizer/flavors.py @@ -0,0 +1,207 @@ +"""Flavor support system for different regex flavors.""" + +from abc import ABC, abstractmethod +from typing import Optional +import re + + +class RegexFlavor(ABC): + """Base class for regex flavors.""" + + @property + @abstractmethod + def name(self) -> str: + """Return the flavor name.""" + pass + + @property + @abstractmethod + def description(self) -> str: + """Return a description of the flavor.""" + pass + + @abstractmethod + def normalize(self, pattern: str) -> tuple[str, list[str]]: + """Normalize a pattern to this flavor, returning warnings.""" + pass + + @abstractmethod + def get_flags(self) -> int: + """Return regex flags for this flavor.""" + pass + + @abstractmethod + def supports_feature(self, feature: str) -> bool: + """Check if a feature is supported.""" + pass + + +class PCREFlavor(RegexFlavor): + """PCRE (Perl Compatible Regular Expressions) flavor.""" + + @property + def name(self) -> str: + return "pcre" + + @property + def description(self) -> str: + return "PCRE - Full feature set with possessive quantifiers, lookbehinds, and all Perl extensions" + + def normalize(self, pattern: str) -> tuple[str, list[str]]: + warnings = [] + normalized = pattern + return normalized, warnings + + def get_flags(self) -> int: + return re.MULTILINE + + def supports_feature(self, feature: str) -> bool: + supported = { + "lookahead": True, + "lookbehind": True, + "named_groups": True, + "non_capturing_groups": True, + "possessive_quantifiers": True, + "atomic_groups": True, + "comment_syntax": True, + "inline_flags": True, + "recursion": True, + "subroutine_references": True, + } + return supported.get(feature, False) + + +class JavaScriptFlavor(RegexFlavor): + """JavaScript regex flavor.""" + + 
@property + def name(self) -> str: + return "javascript" + + @property + def description(self) -> str: + return "JavaScript/ECMAScript - Limited lookbehind support, dotAll flag needed for . matching newlines" + + def normalize(self, pattern: str) -> tuple[str, list[str]]: + warnings = [] + normalized = pattern + + normalized = normalized.replace("(?P<", "(?<") + while "\\k<" in normalized: + normalized = normalized.replace("\\k<", "\\k") + + warnings.append("Note: Some PCRE features may not work in JavaScript") + + return normalized, warnings + + def get_flags(self) -> int: + return 0 + + def supports_feature(self, feature: str) -> bool: + supported = { + "lookahead": True, + "lookbehind": True, + "named_groups": True, + "non_capturing_groups": True, + "possessive_quantifiers": False, + "atomic_groups": False, + "comment_syntax": False, + "inline_flags": False, + "recursion": False, + "subroutine_references": False, + } + return supported.get(feature, False) + + +class PythonFlavor(RegexFlavor): + """Python re module regex flavor.""" + + @property + def name(self) -> str: + return "python" + + @property + def description(self) -> str: + return "Python re module - Full Unicode support, named groups, and most PCRE features" + + def normalize(self, pattern: str) -> tuple[str, list[str]]: + warnings = [] + normalized = pattern + + normalized = normalized.replace("(?P<", "(?<") + + return normalized, warnings + + def get_flags(self) -> int: + return re.MULTILINE | re.UNICODE + + def supports_feature(self, feature: str) -> bool: + supported = { + "lookahead": True, + "lookbehind": True, + "named_groups": True, + "non_capturing_groups": True, + "possessive_quantifiers": False, + "atomic_groups": False, + "comment_syntax": True, + "inline_flags": True, + "recursion": False, + "subroutine_references": False, + } + return supported.get(feature, False) + + +class FlavorManager: + """Manages regex flavors and their adapters.""" + + def __init__(self): + self._flavors: 
dict[str, RegexFlavor] = {} + self._register_default_flavors() + + def _register_default_flavors(self): + """Register the default flavors.""" + self.register_flavor(PCREFlavor()) + self.register_flavor(JavaScriptFlavor()) + self.register_flavor(PythonFlavor()) + + def register_flavor(self, flavor: RegexFlavor): + """Register a new flavor.""" + self._flavors[flavor.name] = flavor + + def get_flavor(self, name: str) -> Optional[RegexFlavor]: + """Get a flavor by name.""" + return self._flavors.get(name) + + def list_flavors(self) -> list[tuple[str, str]]: + """List all available flavors.""" + return [(name, flavor.description) for name, flavor in self._flavors.items()] + + def convert( + self, + pattern: str, + from_flavor: str, + to_flavor: str + ) -> tuple[str, list[str]]: + """Convert a pattern from one flavor to another.""" + source = self.get_flavor(from_flavor) + target = self.get_flavor(to_flavor) + + if not source: + return pattern, [f"Unknown source flavor: {from_flavor}"] + if not target: + return pattern, [f"Unknown target flavor: {to_flavor}"] + + normalized, warnings = source.normalize(pattern) + result, convert_warnings = target.normalize(normalized) + + return result, warnings + convert_warnings + + +def get_flavor_manager() -> FlavorManager: + """Get the global flavor manager instance.""" + return FlavorManager() + + +def get_available_flavors() -> list[str]: + """Get a list of available flavor names.""" + return ["pcre", "javascript", "python"] diff --git a/regex_humanizer/interactive.py b/regex_humanizer/interactive.py new file mode 100644 index 0000000..37049d2 --- /dev/null +++ b/regex_humanizer/interactive.py @@ -0,0 +1,289 @@ +"""Interactive REPL mode for exploring regex patterns.""" + +import sys +import os +from .translator import translate_regex +from .test_generator import generate_test_cases +from .flavors import get_flavor_manager + + +def format_output(text: str, use_color: bool = True) -> str: + """Format output with optional color.""" + 
if not use_color or not sys.stdout.isatty(): + return text + + try: + from pygments import highlight + from pygments.lexers import RegexLexer + from pygments.formatters import TerminalFormatter + + lexer = RegexLexer() + formatter = TerminalFormatter() + return highlight(text, lexer, formatter) + except ImportError: + return text + + +class InteractiveSession: + """Interactive session for regex exploration.""" + + def __init__(self, flavor: str = "pcre", use_color: bool = True): + self.flavor = flavor + self.use_color = use_color + self.history: list[str] = [] + self.history_file = os.path.expanduser("~/.regex_humanizer_history") + self._load_history() + + def _load_history(self): + """Load command history from file.""" + if os.path.exists(self.history_file): + try: + with open(self.history_file, 'r') as f: + self.history = [line.strip() for line in f if line.strip()] + except Exception: + self.history = [] + + def _save_history(self): + """Save command history to file.""" + try: + os.makedirs(os.path.dirname(self.history_file), exist_ok=True) + with open(self.history_file, 'w') as f: + for cmd in self.history[-1000:]: + f.write(cmd + '\n') + except Exception: + pass + + def run(self): + """Run the interactive session.""" + print("\nRegex Humanizer - Interactive Mode") + print("Type 'help' for available commands, 'quit' to exit.\n") + + while True: + try: + import click + user_input = click.prompt( + "regex> ", + type=str, + default="", + show_default=False + ) + + if not user_input.strip(): + continue + + self.history.append(user_input) + self._save_history() + + self._process_command(user_input.strip()) + + except (KeyboardInterrupt, EOFError): + print("\nGoodbye!") + break + + def _process_command(self, command: str): + """Process a user command.""" + parts = command.split(None, 1) + cmd = parts[0].lower() + args = parts[1] if len(parts) > 1 else "" + + commands = { + "help": self._cmd_help, + "quit": self._cmd_quit, + "exit": self._cmd_quit, + "explain": 
self._cmd_explain, + "test": self._cmd_test, + "flavor": self._cmd_flavor, + "set": self._cmd_flavor, + "load": self._cmd_load, + "save": self._cmd_save, + "history": self._cmd_history, + "clear": self._cmd_clear, + "example": self._cmd_example, + } + + handler = commands.get(cmd) + if handler: + handler(args) + else: + print(f"Unknown command: {cmd}") + print("Type 'help' for available commands.") + + def _cmd_help(self, args: str): + """Show help message.""" + help_text = """ +Available Commands: + explain - Explain a regex pattern in English + test - Generate test cases for a pattern + flavor - Set the regex flavor (pcre, javascript, python) + set - Same as 'flavor' + load - Load a pattern from a file + save - Save the last pattern to a file + history - Show command history + example - Show an example pattern + clear - Clear the screen + quit / exit - Exit the interactive mode + +Examples: + explain ^\\d{3}-\\d{4}$ + test [a-z]+ + flavor javascript +""" + print(help_text) + + def _cmd_quit(self, args: str): + """Exit the session.""" + print("Goodbye!") + sys.exit(0) + + def _cmd_explain(self, args: str): + """Explain a regex pattern.""" + if not args: + print("Usage: explain ") + return + + try: + pattern = self._expand_pattern(args) + result = translate_regex(pattern, self.flavor) + + header = f"Pattern: {pattern}" + print("\n" + "=" * (len(header))) + print(header) + print("=" * (len(header))) + print("\nEnglish Explanation:") + print("-" * (len(header))) + print(result) + print() + + except Exception as e: + print(f"Error parsing pattern: {e}") + + def _cmd_test(self, args: str): + """Generate test cases for a pattern.""" + if not args: + print("Usage: test ") + return + + try: + pattern = self._expand_pattern(args) + result = generate_test_cases(pattern, self.flavor, 3, 3) + + header = f"Pattern: {pattern}" + print("\n" + "=" * (len(header))) + print(header) + print("=" * (len(header))) + print(f"\nFlavor: {self.flavor}") + + print("\nMatching strings:") + 
print("-" * (len(header))) + for i, s in enumerate(result["matching"], 1): + print(f" {i}. {s}") + + print("\nNon-matching strings:") + print("-" * (len(header))) + for i, s in enumerate(result["non_matching"], 1): + print(f" {i}. {s}") + + print() + + except Exception as e: + print(f"Error generating tests: {e}") + + def _cmd_flavor(self, args: str): + """Set the current flavor.""" + if not args: + manager = get_flavor_manager() + flavors = manager.list_flavors() + print("Available flavors:") + for name, desc in flavors: + marker = " (current)" if name == self.flavor else "" + print(f" {name}{marker}: {desc}") + return + + flavor_name = args.strip().lower() + manager = get_flavor_manager() + + if manager.get_flavor(flavor_name): + self.flavor = flavor_name + print(f"Flavor set to: {flavor_name}") + else: + print(f"Unknown flavor: {flavor_name}") + print("Available flavors: pcre, javascript, python") + + def _cmd_load(self, args: str): + """Load a pattern from a file.""" + if not args: + print("Usage: load <filename>") + return + + filename = args.strip() + if not os.path.exists(filename): + print(f"File not found: {filename}") + return + + try: + with open(filename, 'r') as f: + pattern = f.read().strip() + + print(f"Loaded pattern: {pattern}") + + if hasattr(self, '_last_pattern'): + pass + self._last_pattern = pattern + + except Exception as e: + print(f"Error reading file: {e}") + + def _cmd_save(self, args: str): + """Save a pattern to a file.""" + if not args: + print("Usage: save <filename>") + return + + pattern = getattr(self, '_last_pattern', None) + if not pattern: + print("No pattern to save. Use 'explain' or 'test' first.") + return + + try: + with open(args.strip(), 'w') as f: + f.write(pattern) + print(f"Saved pattern to: {args.strip()}") + except Exception as e: + print(f"Error writing file: {e}") + + def _cmd_history(self, args: str): + """Show command history.""" + print("Command history:") + for i, cmd in enumerate(self.history[-50:], 1): + print(f" {i:3}. 
{cmd}") + + def _cmd_clear(self, args: str): + """Clear the screen.""" + os.system('cls' if os.name == 'nt' else 'clear') + + def _cmd_example(self, args: str): + """Show an example pattern.""" + examples = [ + r"^\d{3}-\d{4}$", + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", + r"^(?:http|https)://[^\s]+$", + r"\b\d{4}-\d{2}-\d{2}\b", + r"(?i)(hello|hi|greetings)\s+world!?", + ] + + import random + example = random.choice(examples) + print(f"\nExample pattern: {example}") + print("\nType: explain " + example) + print("Type: test " + example) + print() + + def _expand_pattern(self, pattern: str) -> str: + """Expand a pattern from history or args.""" + return pattern + + +def start_interactive_mode(flavor: str = "pcre"): + """Start the interactive mode.""" + session = InteractiveSession(flavor=flavor) + session.run() diff --git a/regex_humanizer/parser.py b/regex_humanizer/parser.py new file mode 100644 index 0000000..47a0d96 --- /dev/null +++ b/regex_humanizer/parser.py @@ -0,0 +1,664 @@ +"""Regex parser for converting regex patterns to AST nodes.""" + +from typing import Optional, Any +from dataclasses import dataclass, field +from enum import Enum + + +class NodeType(Enum): + LITERAL = "literal" + CHARACTER_CLASS = "character_class" + POSITIVE_SET = "positive_set" + NEGATIVE_SET = "negative_set" + DOT = "dot" + GROUP = "group" + CAPTURING_GROUP = "capturing_group" + NON_CAPTURING_GROUP = "non_capturing_group" + NAMED_GROUP = "named_group" + LOOKAHEAD = "lookahead" + LOOKBEHIND = "lookbehind" + NEGATIVE_LOOKAHEAD = "negative_lookahead" + NEGATIVE_LOOKBEHIND = "negative_lookbehind" + QUANTIFIER = "quantifier" + ANCHOR_START = "anchor_start" + ANCHOR_END = "anchor_end" + WORD_BOUNDARY = "word_boundary" + NON_WORD_BOUNDARY = "non_word_boundary" + START_OF_STRING = "start_of_string" + END_OF_STRING = "end_of_string" + END_OF_STRING_Z = "end_of_string_z" + ANY_NEWLINE = "any_newline" + CONTROL_CHAR = "control_char" + ESCAPED_CHAR = "escaped_char" + HEX_ESCAPE = 
"hex_escape"  # completes the truncated HEX_ESCAPE member from the line above


class NodeType(Enum):
    """Kinds of nodes in the regex AST.

    NOTE(review): this enum is re-declared in full because the original
    declaration was truncated by the file mangling.  Member names are
    taken from their uses throughout this file; each value is the
    lower-case member name, which the ``NodeType(value)`` lookups in
    ``_parse_escape`` rely on.
    """
    LITERAL = "literal"
    ESCAPED_CHAR = "escaped_char"
    DOT = "dot"
    ANCHOR_START = "anchor_start"
    ANCHOR_END = "anchor_end"
    START_OF_STRING = "start_of_string"
    END_OF_STRING = "end_of_string"
    WORD_BOUNDARY = "word_boundary"
    NON_WORD_BOUNDARY = "non_word_boundary"
    POSITIVE_SET = "positive_set"
    NEGATIVE_SET = "negative_set"
    CAPTURING_GROUP = "capturing_group"
    NON_CAPTURING_GROUP = "non_capturing_group"
    NAMED_GROUP = "named_group"
    LOOKAHEAD = "lookahead"
    NEGATIVE_LOOKAHEAD = "negative_lookahead"
    LOOKBEHIND = "lookbehind"
    NEGATIVE_LOOKBEHIND = "negative_lookbehind"
    QUANTIFIER = "quantifier"
    CONTROL_CHAR = "control_char"
    HEX_ESCAPE = "hex_escape"
    OCTAL_ESCAPE = "octal_escape"
    UNICODE_PROPERTY = "unicode_property"
    BACKREFERENCE = "backreference"
    BRANCH = "branch"
    SEQUENCE = "sequence"
    DIGIT = "digit"
    NON_DIGIT = "non_digit"
    WORD_CHAR = "word_char"
    NON_WORD_CHAR = "non_word_char"
    WHITESPACE = "whitespace"
    NON_WHITESPACE = "non_whitespace"


@dataclass
class RegexNode:
    """Base class for regex AST nodes."""
    node_type: NodeType
    children: list["RegexNode"] = field(default_factory=list)
    raw: str = ""        # the exact pattern text this node was parsed from
    position: int = 0    # index of the node's first character in the pattern


@dataclass
class LiteralNode(RegexNode):
    """A literal character or run of characters."""
    value: str = ""


@dataclass
class CharacterClassNode(RegexNode):
    """A character class such as [a-z0-9_]."""
    negated: bool = False
    ranges: list[tuple[str, str]] = field(default_factory=list)
    characters: str = ""


@dataclass
class QuantifierNode(RegexNode):
    """A quantifier such as *, +, ?, or {n,m} applied to its single child."""
    min_count: Optional[int] = None
    max_count: Any = None  # int, or float('inf') for open-ended repetition
    is_lazy: bool = False
    is_possessive: bool = False


@dataclass
class GroupNode(RegexNode):
    """A group: capturing, non-capturing, named, or lookaround."""
    name: Optional[str] = None
    group_index: Optional[int] = None
    is_non_capturing: bool = False


class RegexParser:
    """Recursive-descent parser that turns a regex pattern into an AST.

    Parsing never raises on malformed input; problems are recorded and
    exposed through ``get_errors()``.
    """

    def __init__(self, pattern: str, flavor: str = "pcre"):
        self.pattern = pattern
        self.flavor = flavor  # currently informational; all flavors parse alike
        self.pos = 0
        self.length = len(pattern)
        self._errors: list[str] = []

    def parse(self) -> RegexNode:
        """Parse the entire pattern into an AST rooted at a SEQUENCE node."""
        self.pos = 0
        self._errors = []
        result = self._parse_sequence()
        if self.pos < self.length:
            remaining = self.pattern[self.pos:]
            self._errors.append(
                f"Unexpected content at position {self.pos}: {remaining[:20]}")
        return result

    def _parse_sequence(self) -> RegexNode:
        """Parse a sequence, including top-level '|' alternation.

        BUGFIX(review): the original duplicated the whole element loop
        inside the '|' branch and merged every alternative into one flat
        child list, so e.g. ``a\\d|b`` lost the grouping of its
        alternatives.  Alternation is now parsed by collecting one
        SEQUENCE per alternative and wrapping them in a BRANCH node.
        """
        start_pos = self.pos
        alternatives = [self._parse_alternative()]
        while self.pos < self.length and self.pattern[self.pos] == '|':
            self.pos += 1
            alternatives.append(self._parse_alternative())

        if len(alternatives) == 1:
            return alternatives[0]

        branch = RegexNode(
            node_type=NodeType.BRANCH,
            children=alternatives,
            raw=self.pattern[start_pos:self.pos],
            position=start_pos,
        )
        return RegexNode(
            node_type=NodeType.SEQUENCE,
            children=[branch],
            raw=self.pattern[start_pos:self.pos],
            position=start_pos,
        )

    def _parse_alternative(self) -> RegexNode:
        """Parse one alternative: elements up to '|', ')' or end of pattern."""
        children: list[RegexNode] = []
        start_pos = self.pos
        while self.pos < self.length:
            char = self.pattern[self.pos]
            if char in ')|':
                break
            if char == '\\':
                node = self._parse_escape()
                if node:
                    children.append(node)
            elif char == '[':
                node = self._parse_character_class()
                if node:
                    children.append(node)
            elif char == '.':
                children.append(RegexNode(
                    node_type=NodeType.DOT, raw=char, position=self.pos))
                self.pos += 1
            elif char == '(':
                node = self._parse_group()
                if node:
                    children.append(node)
            elif char in '^$':
                anchor = (NodeType.ANCHOR_START if char == '^'
                          else NodeType.ANCHOR_END)
                children.append(RegexNode(
                    node_type=anchor, raw=char, position=self.pos))
                self.pos += 1
            elif char in '*+?{':
                if not children:
                    self._errors.append(
                        f"Quantifier '{char}' without preceding element "
                        f"at position {self.pos}")
                    self.pos += 1
                    continue
                operand = children.pop()
                # BUGFIX(review): a quantifier after a literal run applies
                # only to the run's final character ('ab*' repeats 'b',
                # not "ab"); the original quantified the whole run.
                if (isinstance(operand, LiteralNode)
                        and operand.node_type == NodeType.LITERAL
                        and len(operand.value) > 1):
                    head = LiteralNode(
                        node_type=NodeType.LITERAL,
                        value=operand.value[:-1],
                        raw=operand.value[:-1],
                        position=operand.position,
                    )
                    children.append(head)
                    operand = LiteralNode(
                        node_type=NodeType.LITERAL,
                        value=operand.value[-1],
                        raw=operand.value[-1],
                        position=operand.position + len(head.value),
                    )
                node = self._parse_quantifier(char, operand)
                if node:
                    children.append(node)
                else:
                    # BUGFIX(review): the original dropped the popped
                    # operand on an invalid quantifier; keep it and
                    # treat the quantifier character as an error.
                    children.append(operand)
                    self._errors.append(
                        f"Invalid quantifier at position {self.pos}")
                    self.pos += 1
            else:
                children.append(self._parse_literal_run())
        return RegexNode(
            node_type=NodeType.SEQUENCE,
            children=children,
            raw=self.pattern[start_pos:self.pos],
            position=start_pos,
        )

    def _parse_literal_run(self) -> LiteralNode:
        """Consume a maximal run of plain literal characters.

        BUGFIX(review): '(' was missing from the original stop set, so a
        group after a literal (as in ``a(b)``) was swallowed into the
        literal text.
        """
        start = self.pos
        literal = self.pattern[self.pos]
        self.pos += 1
        while (self.pos < self.length
               and self.pattern[self.pos] not in r'()|*+?[\.^{$'):
            literal += self.pattern[self.pos]
            self.pos += 1
        return LiteralNode(
            node_type=NodeType.LITERAL,
            value=literal,
            raw=literal,
            position=start,
        )

    def _parse_escape(self) -> Optional[RegexNode]:
        """Parse one escape sequence starting at a backslash."""
        if self.pos + 1 >= self.length:
            return None  # trailing backslash: nothing to escape

        self.pos += 1
        char = self.pattern[self.pos]
        self.pos += 1

        # Shorthand classes and boundary assertions map straight onto
        # dedicated node types (enum lookup is by value).
        class_escapes = {
            'd': 'digit',
            'D': 'non_digit',
            'w': 'word_char',
            'W': 'non_word_char',
            's': 'whitespace',
            'S': 'non_whitespace',
            'b': 'word_boundary',
            'B': 'non_word_boundary',
        }
        if char in class_escapes:
            return RegexNode(
                node_type=NodeType(class_escapes[char]),
                raw=f'\\{char}',
                position=self.pos - 2,
            )

        special_escaped = {
            '.': '.', '*': '*', '+': '+', '?': '?', '^': '^', '$': '$',
            '|': '|', '(': '(', ')': ')', '[': '[', ']': ']',
            '{': '{', '}': '}', '\\': '\\', '-': '-',
            'n': '\n', 'r': '\r', 't': '\t',
        }
        if char in special_escaped:
            return LiteralNode(
                node_type=NodeType.ESCAPED_CHAR,
                value=special_escaped[char],
                raw=f'\\{char}',
                position=self.pos - 2,
            )

        if char == '0':
            return RegexNode(
                node_type=NodeType.OCTAL_ESCAPE,
                raw=f'\\{char}',
                position=self.pos - 2,
            )

        if char == 'x' and self.pos + 2 <= self.length:
            hex_part = self.pattern[self.pos:self.pos + 2]
            if all(c in '0123456789abcdefABCDEF' for c in hex_part):
                self.pos += 2
                return RegexNode(
                    node_type=NodeType.HEX_ESCAPE,
                    raw=f'\\x{hex_part}',
                    position=self.pos - 4,
                )

        if char == 'u' and self.pos + 4 <= self.length:
            hex_part = self.pattern[self.pos:self.pos + 4]
            if all(c in '0123456789abcdefABCDEF' for c in hex_part):
                self.pos += 4
                return RegexNode(
                    node_type=NodeType.UNICODE_PROPERTY,
                    raw=f'\\u{hex_part}',
                    position=self.pos - 6,
                )

        if char == 'p' and self.pos < self.length and self.pattern[self.pos] == '{':
            end = self.pattern.find('}', self.pos + 1)
            if end != -1:
                prop = self.pattern[self.pos + 1:end]
                self.pos = end + 1
                raw = f'\\p{{{prop}}}'
                return RegexNode(
                    node_type=NodeType.UNICODE_PROPERTY,
                    raw=raw,
                    position=self.pos - len(raw),
                )

        if char == 'c' and self.pos < self.length:
            ctrl_char = self.pattern[self.pos]
            self.pos += 1
            return RegexNode(
                node_type=NodeType.CONTROL_CHAR,
                raw=f'\\c{ctrl_char}',
                position=self.pos - 3,
            )

        if char.isdigit():
            backref = char
            while self.pos < self.length and self.pattern[self.pos].isdigit():
                backref += self.pattern[self.pos]
                self.pos += 1
            return RegexNode(
                node_type=NodeType.BACKREFERENCE,
                raw=f'\\{backref}',
                position=self.pos - len(backref) - 1,
            )

        # Unknown escape: treat it as the literal escaped character.
        return LiteralNode(
            node_type=NodeType.ESCAPED_CHAR,
            value=char,
            raw=f'\\{char}',
            position=self.pos - 2,
        )

    def _parse_character_class(self) -> Optional[RegexNode]:
        """Parse a character class like [a-z], [^a-z] or []x]."""
        if self.pos >= self.length or self.pattern[self.pos] != '[':
            return None

        start_pos = self.pos
        self.pos += 1

        negated = False
        if self.pos < self.length and self.pattern[self.pos] == '^':
            negated = True
            self.pos += 1

        ranges: list[tuple[str, str]] = []
        characters = ""

        # A ']' immediately after '[' (or '[^') is a literal member.
        # BUGFIX(review): the original skipped it without recording it,
        # so '[]]' parsed as an empty class.
        if self.pos < self.length and self.pattern[self.pos] == ']':
            characters += ']'
            self.pos += 1

        closed = False
        while self.pos < self.length:
            char = self.pattern[self.pos]
            if char == ']':
                self.pos += 1
                closed = True
                break
            if char == '\\':
                if self.pos + 1 < self.length:
                    nxt = self.pattern[self.pos + 1]
                    self.pos += 2
                    # BUGFIX(review): shorthand classes inside a set were
                    # silently dropped; expand the common positive ones
                    # into explicit ranges/characters.
                    if nxt == 'd':
                        ranges.append(('0', '9'))
                    elif nxt == 'w':
                        ranges.extend([('a', 'z'), ('A', 'Z'), ('0', '9')])
                        characters += '_'
                    elif nxt == 's':
                        characters += ' \t\n'
                    elif nxt in 'DWS':
                        pass  # negated shorthands have no finite listing
                    else:
                        characters += nxt
                else:
                    self.pos += 1  # dangling backslash at end of pattern
            elif (char == '-' and characters
                    and self.pos + 1 < self.length
                    and self.pattern[self.pos + 1] != ']'):
                # 'x-y': the previously collected character starts a range.
                self.pos += 1
                end_char = self.pattern[self.pos]
                self.pos += 1
                ranges.append((characters[-1], end_char))
                characters = characters[:-1]
            else:
                characters += char
                self.pos += 1

        if not closed:
            self._errors.append(
                f"Unterminated character class starting at position {start_pos}")

        return CharacterClassNode(
            node_type=NodeType.NEGATIVE_SET if negated else NodeType.POSITIVE_SET,
            negated=negated,
            ranges=ranges,
            characters=characters,
            raw=self.pattern[start_pos:self.pos],
            position=start_pos,
        )

    def _parse_group(self) -> Optional[RegexNode]:
        """Parse a group: (...), (?:...), (?P<name>...), (?<name>...),
        lookarounds, inline comments and inline flags.

        BUGFIX(review): the original consumed the closing ')' only for
        plain capturing groups, so every '(?...)' construct left the
        ')' unread and triggered a spurious "Unexpected content" error.
        All variants now share one body-parse + ')' consumption path.
        """
        if self.pos >= self.length or self.pattern[self.pos] != '(':
            return None

        start_pos = self.pos
        self.pos += 1

        node_type = NodeType.CAPTURING_GROUP
        name: Optional[str] = None
        is_non_capturing = False

        if self.pos < self.length and self.pattern[self.pos] == '?':
            self.pos += 1
            next_char = self.pattern[self.pos] if self.pos < self.length else ''

            if next_char == '=':
                self.pos += 1
                node_type, is_non_capturing = NodeType.LOOKAHEAD, True
            elif next_char == '!':
                self.pos += 1
                node_type, is_non_capturing = NodeType.NEGATIVE_LOOKAHEAD, True
            elif next_char == ':':
                self.pos += 1
                node_type, is_non_capturing = NodeType.NON_CAPTURING_GROUP, True
            elif next_char == '<':
                self.pos += 1
                if self.pos < self.length and self.pattern[self.pos] == '=':
                    self.pos += 1
                    node_type, is_non_capturing = NodeType.LOOKBEHIND, True
                elif self.pos < self.length and self.pattern[self.pos] == '!':
                    self.pos += 1
                    node_type, is_non_capturing = (
                        NodeType.NEGATIVE_LOOKBEHIND, True)
                else:
                    # (?<name>...) named group
                    name_start = self.pos
                    while (self.pos < self.length
                           and self.pattern[self.pos] != '>'):
                        self.pos += 1
                    name = self.pattern[name_start:self.pos]
                    if self.pos < self.length:
                        self.pos += 1  # consume '>'
                    node_type = NodeType.NAMED_GROUP
            elif next_char == 'P':
                self.pos += 1
                if self.pos < self.length and self.pattern[self.pos] == '<':
                    name_end = self.pattern.find('>', self.pos + 1)
                    if name_end != -1:
                        name = self.pattern[self.pos + 1:name_end]
                        self.pos = name_end + 1
                        node_type = NodeType.NAMED_GROUP
            elif next_char == '#':
                # Inline comment (?#...): contributes nothing to the AST.
                # BUGFIX(review): the original wrapped the *rest of the
                # pattern* in a group after the comment.
                comment_end = self.pattern.find(')', self.pos)
                self.pos = (comment_end + 1) if comment_end != -1 else self.length
                return None
            elif next_char and next_char in 'imsx':
                # Inline flags such as (?i) or (?im:...).
                # BUGFIX(review): the original tested 'iDsx' (typo for
                # 'imsx') and consumed only one flag letter.
                while self.pos < self.length and self.pattern[self.pos] in 'imsx':
                    self.pos += 1
                if self.pos < self.length and self.pattern[self.pos] == ':':
                    self.pos += 1
                node_type, is_non_capturing = NodeType.NON_CAPTURING_GROUP, True

        body = self._parse_sequence()
        if self.pos < self.length and self.pattern[self.pos] == ')':
            self.pos += 1
        else:
            self._errors.append(
                f"Unclosed group starting at position {start_pos}")

        return GroupNode(
            node_type=node_type,
            children=[body],
            raw=self.pattern[start_pos:self.pos],
            position=start_pos,
            name=name,
            is_non_capturing=is_non_capturing,
        )

    def _parse_quantifier(self, char: str,
                          node: Optional[RegexNode]) -> Optional[RegexNode]:
        """Parse a quantifier (*, +, ?, {n}, {n,}, {n,m}) applied to *node*.

        Returns None (leaving ``self.pos`` unchanged for the '{' form)
        when the text is not a valid quantifier, so the caller can fall
        back to treating it literally.

        BUGFIX(review): the original called int() without validation
        (crashing on e.g. '{x}') and treated '{n,}' as exactly n instead
        of "n or more".
        """
        if node is None:
            return None

        start_pos = self.pos

        if char in '*+?':
            self.pos += 1
            if char == '*':
                min_count, max_count = 0, float('inf')
            elif char == '+':
                min_count, max_count = 1, float('inf')
            else:
                min_count, max_count = 0, 1
        elif char == '{':
            end = self.pattern.find('}', self.pos + 1)
            if end == -1:
                return None
            parts = self.pattern[self.pos + 1:end].split(',')
            if (len(parts) > 2
                    or not parts[0].strip().isdigit()
                    or (len(parts) == 2 and parts[1].strip()
                        and not parts[1].strip().isdigit())):
                return None
            min_count = int(parts[0])
            if len(parts) == 1:
                max_count = min_count
            elif parts[1].strip():
                max_count = int(parts[1])
            else:
                max_count = float('inf')  # open-ended {n,}
            self.pos = end + 1
        else:
            return None

        # Optional laziness / possessiveness modifier.
        is_lazy = is_possessive = False
        if self.pos < self.length and self.pattern[self.pos] in '?+':
            modifier = self.pattern[self.pos]
            is_lazy = modifier == '?'
            is_possessive = modifier == '+'
            self.pos += 1

        return QuantifierNode(
            node_type=NodeType.QUANTIFIER,
            children=[node],
            raw=self.pattern[start_pos:self.pos],
            position=start_pos,
            min_count=min_count,
            max_count=max_count,
            is_lazy=is_lazy,
            is_possessive=is_possessive,
        )

    def get_errors(self) -> list[str]:
        """Return the list of problems recorded during the last parse."""
        return self._errors


def parse_regex(pattern: str, flavor: str = "pcre") -> RegexNode:
    """Parse a regex pattern into an AST (convenience wrapper)."""
    return RegexParser(pattern, flavor).parse()


# --- regex_humanizer/test_generator.py (next file in the original diff) ---
"""Test case generator for regex patterns."""

import random
import string
from typing import Optional

try:
    from .parser import parse_regex, RegexNode, NodeType
except ImportError:
    # NOTE(review): the relative import works in the packaged layout; in
    # this flattened file the parser names are already defined above.
    pass


class TestCaseGenerator:
    """Generates matching and non-matching test cases for regex patterns."""

    def __init__(self, flavor: str = "pcre"):
        self.flavor = flavor

    def generate_matching(
        self,
        pattern: str,
        count: int = 5,
        max_length: int = 50,
    ) -> list[str]:
        """Generate up to *count* strings intended to match *pattern*."""
        try:
            ast = parse_regex(pattern, self.flavor)
            return self._generate_matching_from_ast(ast, count, max_length)
        except Exception:
            # Any parser hiccup falls back to a heuristic generator.
            return self._generate_fallback_matching(pattern, count)
return self._generate_fallback_matching(pattern, count) + + def _generate_matching_from_ast( + self, + node: RegexNode, + count: int, + max_length: int + ) -> list[str]: + """Generate matching strings from AST.""" + if node.node_type == NodeType.SEQUENCE: + return self._generate_sequence(node.children, count, max_length) + return [pattern_to_string(node, max_length) for _ in range(count)] + + def _generate_sequence( + self, + children: list[RegexNode], + count: int, + max_length: int + ) -> list[str]: + """Generate strings for a sequence of nodes.""" + results = [] + for _ in range(count): + parts = [] + for child in children: + if len("".join(parts)) >= max_length: + break + part = generate_from_node(child, max_length - len("".join(parts))) + if part is None: + part = "" + parts.append(part) + results.append("".join(parts)) + return results + + def _generate_fallback_matching( + self, + pattern: str, + count: int + ) -> list[str]: + """Fallback matching generation using simple heuristics.""" + results = [] + for _ in range(count): + result = "" + in_class = False + class_chars = [] + + for char in pattern: + if char == '\\' and len(pattern) > 1: + next_char = pattern[pattern.index(char) + 1] + if next_char in 'dDsSwWbB': + if next_char == 'd': + result += random.choice(string.digits) + elif next_char == 'D': + result += random.choice(string.ascii_letters) + elif next_char == 'w': + result += random.choice(string.ascii_letters) + elif next_char == 'W': + result += random.choice(' !@#$%^&*()') + elif next_char == 's': + result += " " + elif next_char == 'b': + result += random.choice(string.ascii_letters) + else: + result += next_char + elif char == '.': + result += random.choice(string.ascii_letters) + elif char in '*+?': + continue + elif char == '[': + in_class = True + class_chars = [] + elif char == ']': + in_class = False + if class_chars: + result += random.choice(class_chars) + elif in_class: + if char == '-' and class_chars: + pass + else: + 
class_chars.append(char) + elif char not in '()|^$\\{}': + result += char + + if not result: + result = "test" + results.append(result[:20]) + + return results[:count] + + def generate_non_matching( + self, + pattern: str, + count: int = 5, + max_length: int = 50 + ) -> list[str]: + """Generate strings that do NOT match the pattern.""" + try: + ast = parse_regex(pattern, self.flavor) + return self._generate_non_matching_from_ast(pattern, ast, count, max_length) + except Exception: + return self._generate_fallback_non_matching(pattern, count) + + def _generate_non_matching_from_ast( + self, + pattern: str, + node: RegexNode, + count: int, + max_length: int + ) -> list[str]: + """Generate non-matching strings from AST.""" + results = set() + + if node.node_type == NodeType.ANCHOR_START: + return [s + "prefix" for s in results] or ["prefix_test"] + + if node.node_type == NodeType.ANCHOR_END: + return ["suffix" + s for s in results] or ["test_suffix"] + + if node.node_type == NodeType.START_OF_STRING: + return ["prefix" + s for s in results] or ["prefix_test"] + + if node.node_type == NodeType.END_OF_STRING: + return [s + "suffix" for s in results] or ["test_suffix"] + + base_matching = self._generate_matching_from_ast(node, 10, max_length) + + for matching in base_matching: + if len(results) >= count: + break + + if len(matching) > 0: + pos = random.randint(0, len(matching) - 1) + original = matching[pos] + replacement = get_replacement_char(original) + if replacement != original: + non_match = matching[:pos] + replacement + matching[pos + 1:] + if not matches_pattern(pattern, non_match, self.flavor): + results.add(non_match) + + if len(results) < count and matching: + pos = random.randint(0, len(matching)) + char_to_add = get_opposite_char_class(matching[pos - 1] if pos > 0 else 'a') + non_match = matching[:pos] + char_to_add + matching[pos:] + if not matches_pattern(pattern, non_match, self.flavor): + results.add(non_match) + + if len(results) < count: + for _ in 
range(count - len(results)): + base = self._generate_fallback_non_matching(pattern, 1)[0] if self._generate_fallback_non_matching(pattern, 1) else "does_not_match_123" + results.add(base + str(random.randint(100, 999))) + + return list(results)[:count] + + def _generate_fallback_non_matching( + self, + pattern: str, + count: int + ) -> list[str]: + """Fallback non-matching generation.""" + results = ["does_not_match", "completely_different", "!@#$%^&*()", "", "xyz123"] + + if pattern.startswith('^'): + results.append("prefix_" + results[0]) + + if pattern.endswith('$'): + results.append(results[0] + "_suffix") + + if '\\d' in pattern or '[0-9]' in pattern: + results.append("abc_def") + + if '\\w' in pattern: + results.append("!@#$%^&*") + + if '\\s' in pattern: + results.append("nospacehere") + + dot_count = pattern.count('.') + if dot_count > 0: + results.append("x" * (dot_count + 1)) + + import re + try: + compiled = re.compile(pattern) + filtered_results = [] + for r in results: + if compiled.search(r) is None: + filtered_results.append(r) + if filtered_results: + return filtered_results[:count] + except re.error: + pass + + return results[:count] + + +def generate_from_node(node: RegexNode, max_length: int) -> Optional[str]: + """Generate a string from a single node.""" + if node.node_type == NodeType.LITERAL: + return node.value[:max_length] if node.value else None + + if node.node_type == NodeType.ESCAPED_CHAR: + return node.value if node.value else None + + if node.node_type == NodeType.DOT: + return random.choice(string.ascii_letters) + + if node.node_type in (NodeType.POSITIVE_SET, NodeType.NEGATIVE_SET): + if node.node_type == NodeType.NEGATIVE_SET: + all_chars = [] + for start, end in node.ranges: + all_chars.extend([chr(i) for i in range(ord(start), ord(end) + 1)]) + all_chars.extend(node.characters) + available = [c for c in string.ascii_letters if c not in all_chars] + if available: + return random.choice(available) + return "!" 
+ if node.ranges: + start, end = node.ranges[0] + return chr(random.randint(ord(start), ord(end))) + if node.characters: + return random.choice(node.characters) + return "a" + + if node.node_type in (NodeType.DIGIT, NodeType.NON_DIGIT): + return random.choice(string.digits) + + if node.node_type in (NodeType.WORD_CHAR, NodeType.NON_WORD_CHAR): + return random.choice(string.ascii_letters) + + if node.node_type in (NodeType.WHITESPACE, NodeType.NON_WHITESPACE): + return " " + + if node.node_type == NodeType.QUANTIFIER: + if node.children: + child_str = generate_from_node(node.children[0], max_length) + if child_str is None: + child_str = "x" + + min_count = node.min_count if node.min_count else 0 + max_count = min(node.max_count, 3) if node.max_count and node.max_count != float('inf') else 3 + max_count = max(min_count, max_count) + + if min_count == 0 and max_count == 0: + repeat = 0 + elif min_count == 0: + repeat = random.randint(1, max_count) + else: + repeat = random.randint(min_count, max_count) + + return (child_str * repeat)[:max_length] + return None + + if node.node_type == NodeType.CAPTURING_GROUP: + if node.children: + return generate_from_node(node.children[0], max_length) + return None + + if node.node_type == NodeType.NON_CAPTURING_GROUP: + if node.children: + return generate_from_node(node.children[0], max_length) + return None + + if node.node_type == NodeType.NAMED_GROUP: + if node.children: + return generate_from_node(node.children[0], max_length) + return None + + if node.node_type in (NodeType.LOOKAHEAD, NodeType.NEGATIVE_LOOKAHEAD): + return "" + + if node.node_type in (NodeType.LOOKBEHIND, NodeType.NEGATIVE_LOOKBEHIND): + return "" + + if node.node_type == NodeType.SEQUENCE: + result = "" + for child in node.children: + if len(result) >= max_length: + break + part = generate_from_node(child, max_length - len(result)) + if part: + result += part + return result if result else None + + if node.node_type == NodeType.BRANCH: + if node.children: + 
def pattern_to_string(node: "RegexNode", max_length: int) -> str:
    """Best-effort representative string for *node*; falls back to "test"."""
    return generate_from_node(node, max_length) or "test"


def get_replacement_char(original: str) -> str:
    """Pick a character of the same broad class that differs from *original*."""
    if original.isdigit():
        candidates = [d for d in string.digits if d != original]
        return random.choice(candidates)
    if original.isalpha():
        candidates = [c for c in string.ascii_letters
                      if c.lower() != original.lower()]
        return random.choice(candidates)
    if original == ' ':
        return random.choice(['\t', '\n'])
    return 'x'


def get_opposite_char_class(char: str) -> str:
    """Pick a character from a *different* class than *char*."""
    if char.isdigit():
        return random.choice(string.ascii_letters)
    if char.isalpha():
        return random.choice(string.digits)
    return 'x' if char == ' ' else '1'


def matches_pattern(pattern: str, text: str, flavor: str) -> bool:
    """Return True when *pattern* finds a match anywhere in *text*.

    JavaScript and PCRE patterns are approximated with Python's ``re``
    in MULTILINE mode; an invalid pattern simply reports no match.
    """
    import re
    flags = re.MULTILINE if flavor in ("javascript", "pcre") else 0
    try:
        return re.compile(pattern, flags).search(text) is not None
    except re.error:
        return False


def generate_test_cases(
    pattern: str,
    flavor: str = "pcre",
    matching_count: int = 5,
    non_matching_count: int = 5,
) -> dict:
    """Build the complete test-case payload for *pattern*."""
    generator = TestCaseGenerator(flavor)
    return {
        "pattern": pattern,
        "flavor": flavor,
        "matching": generator.generate_matching(pattern, matching_count),
        "non_matching": generator.generate_non_matching(
            pattern, non_matching_count),
    }
# --- regex_humanizer/translator.py (next file in the original diff) ---
"""Translator for converting regex AST to human-readable English."""

try:
    from .parser import (
        RegexNode, NodeType, LiteralNode, CharacterClassNode,
        QuantifierNode, GroupNode, RegexParser,
    )
except ImportError:
    # NOTE(review): the relative import works in the packaged layout; in
    # this flattened file the parser names are already defined above.
    pass


class RegexTranslator:
    """Translates regex AST nodes to human-readable English."""

    def __init__(self, flavor: str = "pcre"):
        self.flavor = flavor

    def translate(self, pattern: str) -> str:
        """Parse *pattern* and render its AST as English."""
        parser = RegexParser(pattern, self.flavor)
        return self._translate_node(parser.parse())

    def _translate_node(self, node: "RegexNode") -> str:
        """Dispatch *node* to the handler for its node type."""
        if node is None:
            return ""

        handlers = {
            NodeType.SEQUENCE: self._translate_sequence,
            NodeType.LITERAL: self._translate_literal,
            NodeType.ESCAPED_CHAR: self._translate_escaped_char,
            NodeType.DOT: self._translate_dot,
            NodeType.POSITIVE_SET: self._translate_positive_set,
            NodeType.NEGATIVE_SET: self._translate_negative_set,
            NodeType.CAPTURING_GROUP: self._translate_capturing_group,
            NodeType.NON_CAPTURING_GROUP: self._translate_non_capturing_group,
            NodeType.NAMED_GROUP: self._translate_named_group,
            NodeType.LOOKAHEAD: self._translate_lookahead,
            NodeType.NEGATIVE_LOOKAHEAD: self._translate_negative_lookahead,
            NodeType.LOOKBEHIND: self._translate_lookbehind,
            NodeType.NEGATIVE_LOOKBEHIND: self._translate_negative_lookbehind,
            NodeType.QUANTIFIER: self._translate_quantifier,
            NodeType.ANCHOR_START: self._translate_anchor_start,
            NodeType.ANCHOR_END: self._translate_anchor_end,
            NodeType.WORD_BOUNDARY: self._translate_word_boundary,
            NodeType.NON_WORD_BOUNDARY: self._translate_non_word_boundary,
            NodeType.BRANCH: self._translate_branch,
            NodeType.START_OF_STRING: self._translate_start_of_string,
            NodeType.END_OF_STRING: self._translate_end_of_string,
            NodeType.DIGIT: self._translate_digit,
            NodeType.NON_DIGIT: self._translate_non_digit,
            NodeType.WORD_CHAR: self._translate_word_char,
            NodeType.NON_WORD_CHAR: self._translate_non_word_char,
            NodeType.WHITESPACE: self._translate_whitespace,
            NodeType.NON_WHITESPACE: self._translate_non_whitespace,
            NodeType.BACKREFERENCE: self._translate_backreference,
        }
        handler = handlers.get(node.node_type)
        # Unknown node types render as a bracketed placeholder.
        return handler(node) if handler else f"[{node.node_type.value}]"

    def _translate_sequence(self, node: "RegexNode") -> str:
        """Render a sequence of nodes.

        BUGFIX(review): the original joined the parts with the empty
        string, producing run-together output such as "at the start of
        line or stringany digit (0-9)" (as shown in the README); parts
        are now joined with ", ".
        """
        if not node.children:
            return "empty string"

        parts = []
        for child in node.children:
            if child.node_type == NodeType.BRANCH:
                alts = [self._translate_node(c) for c in child.children]
                parts.append(alts[0] if len(alts) == 1
                             else "(" + " OR ".join(alts) + ")")
            else:
                parts.append(self._translate_node(child))
        return ", ".join(p for p in parts if p)

    def _translate_branch(self, node: "RegexNode") -> str:
        """Render an alternation as 'A OR B OR ...'."""
        if not node.children:
            return ""
        return " OR ".join(self._translate_node(c) for c in node.children)

    def _translate_literal(self, node: "LiteralNode") -> str:
        """Spell out special characters inside a literal run.

        The replacement order matches the original implementation (the
        backslash must be handled first so later names are untouched).
        """
        replacements = [
            ("\\", "backslash "), (".", "period "), ("*", "asterisk "),
            ("+", "plus "), ("?", "question mark "), ("$", "dollar sign "),
            ("^", "caret "), ("|", "pipe "), ("(", "left parenthesis "),
            (")", "right parenthesis "), ("[", "left bracket "),
            ("]", "right bracket "), ("{", "left brace "),
            ("}", "right brace "), ("\t", "tab "), ("\n", "newline "),
            ("\r", "carriage return "), (" ", "space "),
        ]
        value = node.value
        for old, new in replacements:
            value = value.replace(old, new)
        return value

    def _translate_escaped_char(self, node: "LiteralNode") -> str:
        """Render an escaped character such as \\t or \\n."""
        special = {
            " ": "space",
            "\t": "tab character (escape sequence \\t)",
            "\n": "newline character (escape sequence \\n)",
            "\r": "carriage return (escape sequence \\r)",
        }
        return special.get(node.value, f"'{node.value}'")

    def _translate_dot(self, node: "RegexNode") -> str:
        """Render '.' (any character)."""
        return "any single character"

    def _translate_positive_set(self, node: "CharacterClassNode") -> str:
        """Render a positive character set like [a-z]."""
        parts = [f"any character from {lo} through {hi}"
                 for lo, hi in node.ranges]
        parts += ["hyphen" if ch == '-' else f"'{ch}'"
                  for ch in node.characters]
        if not parts:
            return "any character in empty set"
        if len(parts) == 1:
            return parts[0]
        return "any of: " + ", ".join(parts)

    def _translate_negative_set(self, node: "CharacterClassNode") -> str:
        """Render a negated set like [^a-z].

        BUGFIX(review): the original sliced fixed offsets off the
        positive rendering (``positive[20:]`` / ``positive[7:]``), which
        emptied single-member sets such as [^a] and mangled multi-member
        ones.  The description is now built directly from the members.
        """
        parts = [f"{lo} through {hi}" for lo, hi in node.ranges]
        parts += ["hyphen" if ch == '-' else f"'{ch}'"
                  for ch in node.characters]
        if not parts:
            return "any character"
        return "any character EXCEPT " + ", ".join(parts)

    def _group_text(self, node: "GroupNode", prefix: str) -> str:
        """Shared rendering for group-like nodes: 'prefix(content)'."""
        content = (self._translate_node(node.children[0])
                   if node.children else "")
        return f"{prefix}({content})"

    def _translate_capturing_group(self, node: "GroupNode") -> str:
        """Render a capturing group."""
        return self._group_text(node, "capturing group: ")

    def _translate_non_capturing_group(self, node: "GroupNode") -> str:
        """Render a non-capturing group."""
        return self._group_text(node, "non-capturing group: ")

    def _translate_named_group(self, node: "GroupNode") -> str:
        """Render a named group."""
        name = node.name or "unnamed"
        return self._group_text(node, f"named group '{name}': ")

    def _translate_lookahead(self, node: "GroupNode") -> str:
        """Render a positive lookahead."""
        return self._group_text(node, "followed by ")

    def _translate_negative_lookahead(self, node: "GroupNode") -> str:
        """Render a negative lookahead."""
        return self._group_text(node, "NOT followed by ")

    def _translate_lookbehind(self, node: "GroupNode") -> str:
        """Render a positive lookbehind."""
        return self._group_text(node, "preceded by ")

    def _translate_negative_lookbehind(self, node: "GroupNode") -> str:
        """Render a negative lookbehind."""
        return self._group_text(node, "NOT preceded by ")

    def _translate_quantifier(self, node: "QuantifierNode") -> str:
        """Render a quantifier around its single child."""
        if not node.children:
            return "[empty quantifier]"

        base = self._translate_node(node.children[0])
        suffix = (" (lazy)" if node.is_lazy else "") + \
                 (" (possessive)" if node.is_possessive else "")
        lo, hi = node.min_count, node.max_count

        if lo == 0 and hi == 1:
            return f"optional: {base}{suffix}"
        if hi == float('inf'):
            if lo == 0:
                return f"zero or more of: {base}{suffix}"
            if lo == 1:
                return f"one or more of: {base}{suffix}"
            return f"at least {lo} of: {base}{suffix}"
        if lo == hi:
            # NOTE(review): for {1} the original drops the lazy /
            # possessive marker; behaviour preserved.
            return base if lo == 1 else f"exactly {lo} of: {base}{suffix}"
        return f"between {lo} and {hi} of: {base}{suffix}"

    def _translate_anchor_start(self, node: "RegexNode") -> str:
        """Render the '^' anchor."""
        return "at the start of line or string"

    def _translate_anchor_end(self, node: "RegexNode") -> str:
        """Render the '$' anchor."""
        return "at the end of line or string"

    def _translate_word_boundary(self, node: "RegexNode") -> str:
        """Render \\b."""
        return "at a word boundary"

    def _translate_non_word_boundary(self, node: "RegexNode") -> str:
        """Render \\B."""
        return "not at a word boundary"

    def _translate_start_of_string(self, node: "RegexNode") -> str:
        """Render the absolute start-of-string anchor."""
        return "at the start of the string"

    def _translate_end_of_string(self, node: "RegexNode") -> str:
        """Render the absolute end-of-string anchor."""
        return "at the end of the string"

    def _translate_digit(self, node: "RegexNode") -> str:
        """Render \\d."""
        return "any digit (0-9)"

    def _translate_non_digit(self, node: "RegexNode") -> str:
        """Render \\D."""
        return "any non-digit character"

    def _translate_word_char(self, node: "RegexNode") -> str:
        """Render \\w."""
        return "any word character (a-z, A-Z, 0-9, underscore)"

    def _translate_non_word_char(self, node: "RegexNode") -> str:
        """Render \\W."""
        return "any non-word character"

    def _translate_whitespace(self, node: "RegexNode") -> str:
        """Render \\s."""
        return "any whitespace character (space, tab, newline, etc.)"

    def _translate_non_whitespace(self, node: "RegexNode") -> str:
        """Render \\S."""
        return "any non-whitespace character"
_translate_backreference(self, node: RegexNode) -> str: + """Translate a backreference.""" + return f"same as capture group \\{node.raw}" + + +def translate_regex(pattern: str, flavor: str = "pcre") -> str: + """Translate a regex pattern to human-readable English.""" + translator = RegexTranslator(flavor) + return translator.translate(pattern) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..82820de --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +click>=8.0 +regex>=2023.0 +parsimonious>=0.10.0 +pytest>=7.0 +pygments>=2.15 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8136bbc --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup, find_packages + +setup( + name="regex-humanizer-cli", + version="1.0.0", + packages=find_packages(where="."), + package_dir={"": "."}, + install_requires=[ + "click>=8.0", + "regex>=2023.0", + "parsimonious>=0.10.0", + "pygments>=2.15", + ], + extras_require={ + "dev": ["pytest>=7.0", "pytest-cov>=4.0", "black>=23.0", "ruff>=0.1.0"], + }, + entry_points={ + "console_scripts": [ + "regex-humanizer=regex_humanizer.cli:main", + ], + }, + python_requires=">=3.9", +)