# regex-humanizer/tests/test_parser.py

"""Tests for the parser module."""

import pytest

from regex_humanizer.parser import (
    tokenize,
    parse_regex,
    ParseError,
    Literal,
    CharacterClass,
    Quantifier,
    Group,
    Alternation,
    Anchor,
)


class TestTokenizer:
    """Checks on the token stream that ``tokenize`` returns for small patterns."""

    def test_tokenize_literal(self):
        """A plain character run is expected to come back as one LITERAL token."""
        result = tokenize("abc")
        assert len(result) == 1
        only = result[0]
        assert only.type == "LITERAL"
        assert only.value == "abc"

    def test_tokenize_anchors(self):
        """``^`` and ``$`` are expected to wrap the literal as separate tokens."""
        result = tokenize("^test$")
        assert len(result) == 3
        kinds = (result[0].type, result[1].type, result[2].type)
        assert kinds == ("ANCHOR_START", "LITERAL", "ANCHOR_END")

    def test_tokenize_quantifiers(self):
        """A trailing ``*`` is expected to become a STAR token after the literal."""
        result = tokenize("a*")
        assert len(result) == 2
        kinds = (result[0].type, result[1].type)
        assert kinds == ("LITERAL", "STAR")

    def test_tokenize_character_class(self):
        """A bracket expression must start and end with bracket tokens."""
        result = tokenize("[abc]")
        # Only the outer delimiters are pinned; the tokens in between are not.
        assert len(result) >= 2
        opener, closer = result[0], result[-1]
        assert opener.type == "OPEN_BRACKET"
        assert closer.type == "CLOSE_BRACKET"

    def test_tokenize_groups(self):
        """Parentheses are expected to surround the literal as group tokens."""
        result = tokenize("(abc)")
        assert len(result) == 3
        kinds = (result[0].type, result[1].type, result[2].type)
        assert kinds == ("OPEN_GROUP", "LITERAL", "CLOSE_GROUP")

    def test_tokenize_alternation(self):
        """``|`` is expected to sit between the two literal operands."""
        result = tokenize("a|b")
        assert len(result) == 3
        kinds = (result[0].type, result[1].type, result[2].type)
        assert kinds == ("LITERAL", "ALTERNATION", "LITERAL")

    def test_tokenize_escape(self):
        """A backslash-escaped dot is expected to be a single ESCAPED token."""
        result = tokenize(r"\.")
        assert len(result) == 1
        only = result[0]
        assert only.type == "ESCAPED"

    def test_tokenize_special_sequences(self):
        """Shorthand classes followed by quantifiers produce at least four tokens."""
        result = tokenize(r"\d+\w*\s?")
        # Lower bound only: the exact token split is not asserted here.
        assert len(result) >= 4


class TestParser:
    """Checks on the node sequence that ``parse_regex`` returns, plus error cases."""

    def test_parse_literal(self):
        """A plain word is expected to parse to one Literal carrying that word."""
        nodes = parse_regex("hello")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, Literal)
        assert node.value == "hello"

    def test_parse_character_class(self):
        """A bracket expression is expected to parse to one CharacterClass."""
        nodes = parse_regex("[abc]")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, CharacterClass)

    def test_parse_inverted_class(self):
        """A leading ``^`` inside brackets is expected to set ``inverted``."""
        nodes = parse_regex("[^abc]")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, CharacterClass)
        # Identity check on purpose: the flag must be the bool True itself.
        assert node.inverted is True

    def test_parse_quantifier_star(self):
        """``a*`` is expected to parse to a single Quantifier node."""
        nodes = parse_regex("a*")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, Quantifier)

    def test_parse_quantifier_plus(self):
        """``a+`` is expected to parse to a single Quantifier node."""
        nodes = parse_regex("a+")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, Quantifier)

    def test_parse_quantifier_question(self):
        """``a?`` is expected to parse to a single Quantifier node."""
        nodes = parse_regex("a?")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, Quantifier)

    def test_parse_group(self):
        """A parenthesised word is expected to be a Group wrapping one Literal."""
        nodes = parse_regex("(abc)")
        assert len(nodes) == 1
        group = nodes[0]
        assert isinstance(group, Group)
        assert len(group.content) == 1
        inner = group.content[0]
        assert isinstance(inner, Literal)
        assert inner.value == "abc"

    def test_parse_non_capturing_group(self):
        """The ``(?:...)`` form is expected to give a Group with capturing off."""
        nodes = parse_regex("(?:abc)")
        assert len(nodes) == 1
        group = nodes[0]
        assert isinstance(group, Group)
        # Identity check on purpose: the flag must be the bool False itself.
        assert group.capturing is False

    def test_parse_alternation(self):
        """``a|b`` is expected to be one Alternation with two options."""
        nodes = parse_regex("a|b")
        assert len(nodes) == 1
        node = nodes[0]
        assert isinstance(node, Alternation)
        assert len(node.options) == 2

    def test_parse_anchors(self):
        """Anchors are expected to appear as their own nodes around the literal."""
        nodes = parse_regex("^start$")
        assert len(nodes) == 3
        leading, body, trailing = nodes[0], nodes[1], nodes[2]
        assert isinstance(leading, Anchor)
        assert isinstance(body, Literal)
        assert isinstance(trailing, Anchor)
        assert body.value == "start"

    def test_parse_special_sequences(self):
        """Each quantified shorthand class is expected to be a Quantifier node."""
        nodes = parse_regex(r"\d+\w+")
        assert len(nodes) == 2
        first, second = nodes[0], nodes[1]
        assert isinstance(first, Quantifier)
        assert isinstance(second, Quantifier)

    def test_parse_complex_pattern(self):
        """An e-mail-like pattern must parse to a non-empty result."""
        email_like = r"^\w+@[a-z]+\.[a-z]+$"
        nodes = parse_regex(email_like)
        # Smoke test: only non-emptiness is asserted, not the node layout.
        assert len(nodes) > 0

    def test_parse_error_unclosed_bracket(self):
        """A ``[`` with no matching ``]`` must raise ParseError."""
        unterminated = "[abc"
        with pytest.raises(ParseError):
            parse_regex(unterminated)

    def test_parse_error_unclosed_group(self):
        """A ``(`` with no matching ``)`` must raise ParseError."""
        unterminated = "(abc"
        with pytest.raises(ParseError):
            parse_regex(unterminated)