fix: add type annotations to parser.py
Some checks failed
CI / test (push) Failing after 11s
CI / build (push) Has been skipped

This commit is contained in:
2026-02-02 07:04:38 +00:00
parent e86a5dede4
commit 352813814d

View File

@@ -0,0 +1,336 @@
"""Parse tokens into an AST."""
import re
from typing import List, Optional
from .ast import (
ASTNode,
Alternation,
Anchor,
Backreference,
CharacterClass,
Group,
Literal,
Quantifier,
SpecialSequence,
)
from .tokenizer import Token, tokenize
class ParseError(Exception):
    """Raised when a regex pattern cannot be parsed.

    Attributes:
        message: Human-readable description of the failure.
        position: Index in the pattern where the failure occurred.
    """

    def __init__(self, message: str, position: int = 0):
        super().__init__(f"{message} at position {position}")
        self.message = message
        self.position = position
def parse_quantifier(tokens: List[Token], index: int) -> tuple[Optional[Quantifier], int]:
    """Parse a quantifier (``+``, ``*``, ``?`` or ``{m,n}``) at *index*.

    Returns the Quantifier node and the index of the first token after it,
    or ``(None, index)`` when no quantifier starts at *index*.

    Raises:
        ParseError: If a ``{...}`` quantifier is unclosed or malformed.
    """
    if index >= len(tokens):
        return None, index
    token = tokens[index]

    # +, *, ? and their lazy/possessive variants share one shape:
    # the token type is "<BASE>" or "<BASE>_<VARIANT>".
    bounds = {
        "PLUS": (1, Quantifier.MAX_UNBOUNDED),
        "STAR": (0, Quantifier.MAX_UNBOUNDED),
        "QUESTION": (0, 1),
    }
    base, _, variant = token.type.partition("_")
    if base in bounds and variant in ("", "LAZY", "POSSESSIVE"):
        lower, upper = bounds[base]
        return Quantifier(
            min=lower,
            max=upper,
            lazy=variant == "LAZY",
            possessive=variant == "POSSESSIVE",
            position=token.position,
        ), index + 1

    if token.type != "OPEN_BRACE":
        return None, index

    # {m} / {m,} / {m,n}: locate the matching close brace and join the
    # token text between the braces.
    close = next(
        (i for i in range(index + 1, len(tokens)) if tokens[i].type == "CLOSE_BRACE"),
        None,
    )
    body = "" if close is None else "".join(t.value for t in tokens[index + 1:close])
    if not body:
        raise ParseError("Invalid quantifier format", token.position)
    matched = re.match(r"^(\d+)(?:,(\d*))?$", body)
    if not matched:
        raise ParseError("Invalid quantifier format", token.position)
    lower = int(matched.group(1))
    upper_text = matched.group(2)
    upper = int(upper_text) if upper_text else Quantifier.MAX_UNBOUNDED
    next_index = close + 1
    lazy = False
    # A trailing "?" right after the braces marks the quantifier lazy.
    if next_index < len(tokens) and tokens[next_index].value == "?":
        lazy = True
        next_index += 1
    return Quantifier(min=lower, max=upper, lazy=lazy, position=token.position), next_index
def parse_character_class(tokens: List[Token], index: int) -> tuple[CharacterClass, int]:
    """Parse a bracketed character class (``[...]``) starting at *index*.

    Returns the CharacterClass node and the index of the first token after
    the closing bracket.

    Raises:
        ParseError: If *index* is not at an opening bracket or the class
            is never closed.
    """
    if index >= len(tokens) or tokens[index].type != "OPEN_BRACKET":
        raise ParseError("Expected character class", tokens[index].position if index < len(tokens) else 0)
    bracket_token = tokens[index]
    inverted = False
    characters: List[str] = []
    ranges: List[tuple[str, str]] = []
    i = index + 1
    # A leading "^" negates the class.  NOTE(review): assumes the
    # tokenizer emits "^" inside a class as a LITERAL token — confirm.
    if i < len(tokens) and tokens[i].type == "LITERAL" and tokens[i].value == "^":
        inverted = True
        i += 1

    def range_end(pos: int) -> Optional[str]:
        """Return the end character of a range ``x-y`` starting at pos, else None."""
        if pos + 2 >= len(tokens) or tokens[pos + 1].type != "MINUS":
            return None
        end_token = tokens[pos + 2]
        # A "-" immediately before "]" is a literal dash, not a range
        # (previously "[a-]" produced a bogus ('a', ']') range and then
        # read as an unclosed class).
        if end_token.type == "CLOSE_BRACKET":
            return None
        # BUG FIX: an escaped range end (e.g. [a-\]]) was detected by
        # comparing the *character* to the string "ESCAPED" instead of
        # checking the token type, so the backslash was never stripped.
        if end_token.type == "ESCAPED":
            return end_token.value[1]
        return end_token.value

    while i < len(tokens) and tokens[i].type != "CLOSE_BRACKET":
        token = tokens[i]
        if token.type == "ESCAPED":
            start = token.value[1]  # character after the backslash
            end = range_end(i)
            if end is not None:
                ranges.append((start, end))
                i += 3
            else:
                characters.append(start)
                i += 1
        elif token.type == "MINUS":
            # BUG FIX: a dash that is not part of a range (e.g. "[-a]"
            # or "[a-]") is a literal "-"; it was silently dropped.
            characters.append(token.value)
            i += 1
        elif token.type == "LITERAL":
            end = range_end(i)
            if end is not None:
                ranges.append((token.value, end))
                i += 3
            else:
                characters.append(token.value)
                i += 1
        else:
            # DIGIT and any other token type contribute their raw text
            # (same treatment the original gave both branches).
            characters.append(token.value)
            i += 1
    if i >= len(tokens):
        raise ParseError("Unclosed character class", bracket_token.position)
    return CharacterClass(
        inverted=inverted,
        characters=characters,
        ranges=ranges,
        position=bracket_token.position,
    ), i + 1
def parse_group(tokens: List[Token], index: int) -> tuple[Group, int]:
    """Parse a parenthesised group starting at *index*.

    Handles capturing, non-capturing, named and lookaround/comment
    groups.  Alternation (``|``) is supported inside every group kind —
    previously only plain capturing groups accepted it, and e.g.
    ``(?:a|b)`` raised "Unclosed non-capturing group".

    Returns the Group node and the index of the first token after the
    closing parenthesis.

    Raises:
        ParseError: If *index* is not at a group-opening token or the
            group is never closed.
    """
    if index >= len(tokens):
        raise ParseError("Expected group start", 0)
    group_token = tokens[index]
    kind = group_token.type
    if kind == "NON_CAPTURING":
        content, next_index = _parse_group_body(
            tokens, index + 1, group_token, "Unclosed non-capturing group")
        return Group(content=content, capturing=False, position=group_token.position), next_index
    if kind == "NAMED_GROUP":
        content, next_index = _parse_group_body(
            tokens, index + 1, group_token, "Unclosed named group")
        return Group(content=content, capturing=True, name=group_token.extra,
                     position=group_token.position), next_index
    if kind in ("POSITIVE_LOOKAHEAD", "NEGATIVE_LOOKAHEAD",
                "POSITIVE_LOOKBEHIND", "NEGATIVE_LOOKBEHIND",
                "COMMENT"):
        # NOTE(review): lookaround/comment kinds collapse to a plain
        # non-capturing Group here, as in the original — the kind is lost.
        content, next_index = _parse_group_body(
            tokens, index + 1, group_token, "Unclosed group")
        return Group(content=content, capturing=False, position=group_token.position), next_index
    if kind == "OPEN_GROUP":
        if index + 1 >= len(tokens):
            raise ParseError("Empty group", group_token.position)
        content, next_index = _parse_group_body(
            tokens, index + 1, group_token, "Unclosed group")
        return Group(content=content, capturing=True, position=group_token.position), next_index
    raise ParseError("Expected group start", group_token.position)


def _parse_group_body(tokens: List[Token], index: int, group_token: Token,
                      unclosed_message: str) -> tuple[List[ASTNode], int]:
    """Parse a group body up to its CLOSE_GROUP, honouring alternation.

    Returns the content nodes (a single Alternation node when the body
    contains ``|``) and the index just past the closing parenthesis.

    Raises:
        ParseError: *unclosed_message* if the body ends without a
            CLOSE_GROUP, or "Unexpected token in group" when no progress
            can be made on a stray token.
    """
    options: List[List[ASTNode]] = [[]]
    alternation_position: Optional[int] = None
    i = index
    while i < len(tokens):
        token = tokens[i]
        if token.type == "CLOSE_GROUP":
            if len(options) > 1:
                alternation = Alternation(options=options, position=alternation_position)
                return [alternation], i + 1
            return options[0], i + 1
        if token.type == "ALTERNATION":
            if alternation_position is None:
                alternation_position = token.position  # first "|" marks the node
            options.append([])
            i += 1
            continue
        nodes, next_i = parse_sequence(tokens, i)
        if next_i == i:
            # parse_sequence made no progress (stray token such as "]");
            # bail out rather than loop forever.
            raise ParseError("Unexpected token in group", token.position)
        options[-1].extend(nodes)
        i = next_i
    raise ParseError(unclosed_message, group_token.position)
def parse_sequence(tokens: List[Token], index: int) -> tuple[List[ASTNode], int]:
    """Parse a sequence of atoms until ``)``, ``]``, ``|`` or end of input.

    After each atom a quantifier is looked for and, when present, wraps
    that atom alone; for a run of literal characters it binds only to
    the last character (``ab+`` quantifies ``b``).

    Returns the parsed nodes and the index of the first unconsumed token.

    Raises:
        ParseError: Propagated from the sub-parsers (bad quantifier,
            unclosed class or group).
    """
    group_starts = (
        "OPEN_GROUP", "NON_CAPTURING", "NAMED_GROUP",
        "POSITIVE_LOOKAHEAD", "NEGATIVE_LOOKAHEAD",
        "POSITIVE_LOOKBEHIND", "NEGATIVE_LOOKBEHIND", "COMMENT",
    )
    quantifier_starts = (
        "PLUS", "PLUS_LAZY", "PLUS_POSSESSIVE",
        "STAR", "STAR_LAZY", "STAR_POSSESSIVE",
        "QUESTION", "QUESTION_LAZY", "QUESTION_POSSESSIVE",
        "OPEN_BRACE",
    )
    anchors = {"ANCHOR_START": "^", "ANCHOR_END": "$",
               "WORD_BOUNDARY": r"\b", "NON_WORD_BOUNDARY": r"\B"}
    special_sequences = ("DIGIT", "NON_DIGIT", "WHITESPACE", "NON_WHITESPACE",
                         "WORD_CHAR", "NON_WORD_CHAR")
    nodes: List[ASTNode] = []
    i = index
    while i < len(tokens):
        token = tokens[i]
        if token.type in ("CLOSE_GROUP", "CLOSE_BRACKET", "ALTERNATION"):
            break
        if token.type in anchors:
            nodes.append(Anchor(kind=anchors[token.type], position=token.position))
            i += 1
        elif token.type in special_sequences:
            nodes.append(SpecialSequence(sequence=token.value, position=token.position))
            i += 1
        elif token.type == "ANY_CHAR":
            nodes.append(SpecialSequence(sequence=".", position=token.position))
            i += 1
        elif token.type == "OPEN_BRACKET":
            char_class, i = parse_character_class(tokens, i)
            nodes.append(char_class)
        elif token.type in group_starts:
            # BUG FIX: named groups, lookarounds and comments previously
            # fell through to the literal fallback; route every group
            # opener to parse_group (which handles all these kinds).
            group, i = parse_group(tokens, i)
            nodes.append(group)
        elif token.type == "BACKREFERENCE":
            reference = int(token.extra) if token.extra else 1
            nodes.append(Backreference(reference=reference, position=token.position))
            i += 1
        elif token.type == "NAMED_BACKREFERENCE":
            nodes.append(Backreference(reference=token.extra or "", position=token.position))
            i += 1
        elif token.type == "ESCAPED":
            # token.value is "\x"; the payload is the char after the backslash.
            nodes.append(Literal(value=token.value[1], escaped=True, position=token.position))
            i += 1
        elif token.type == "LITERAL":
            run = [token]
            i += 1
            while i < len(tokens) and tokens[i].type == "LITERAL":
                run.append(tokens[i])
                i += 1
            if len(run) > 1 and i < len(tokens) and tokens[i].type in quantifier_starts:
                # A following quantifier binds only to the last character:
                # split it out of the coalesced literal run.
                nodes.append(Literal(value="".join(t.value for t in run[:-1]),
                                     escaped=False, position=run[0].position))
                run = run[-1:]
            nodes.append(Literal(value="".join(t.value for t in run),
                                 escaped=False, position=run[0].position))
        else:
            # Unknown token types degrade to literal text (original behavior).
            nodes.append(Literal(value=token.value, position=token.position))
            i += 1
        # BUG FIX: quantifiers were previously only looked for after the
        # loop ended, where i is past the end or on a break token, so
        # they were never applied ("a+" parsed "+" as a literal).
        if nodes and i < len(tokens) and tokens[i].type in quantifier_starts:
            try:
                quantifier, i = parse_quantifier(tokens, i)
            except ParseError:
                if tokens[i].type == "OPEN_BRACE":
                    # "{...}" that is not a valid quantifier is literal
                    # text; let the next iteration consume it.
                    quantifier = None
                else:
                    raise
            if quantifier is not None:
                nodes[-1] = quantifier_wrap(nodes[-1], quantifier)
    return nodes, i
def quantifier_wrap(node: ASTNode, quantifier: Quantifier) -> Quantifier:
    """Attach *node* as the child of *quantifier* and return the quantifier.

    The quantifier is mutated in place; the object returned is the same
    instance that was passed in.
    """
    quantifier.child = node
    return quantifier
def parse_alternation(tokens: List[Token], index: int) -> tuple[Alternation, int]:
    """Parse ``|``-separated branches, starting at the first branch's content.

    Stops at a CLOSE_GROUP token (left unconsumed) or at end of input.
    Returns the Alternation node and the index of the first unconsumed
    token.

    Raises:
        ParseError: When no progress can be made on a stray token.
    """
    options: List[List[ASTNode]] = []
    current_option: List[ASTNode] = []
    # Guard the position lookup so an empty token list no longer raises
    # IndexError.
    position = tokens[index].position if index < len(tokens) else 0
    i = index
    while i < len(tokens):
        token = tokens[i]
        if token.type == "ALTERNATION":
            options.append(current_option)
            current_option = []
            i += 1
        elif token.type == "CLOSE_GROUP":
            # BUG FIX: an empty final branch (e.g. "a|" before ")") was
            # silently dropped; append unconditionally.
            options.append(current_option)
            return Alternation(options=options, position=position), i
        else:
            branch_nodes, next_i = parse_sequence(tokens, i)
            if next_i == i:
                # parse_sequence refused to consume this token; bail out
                # instead of looping forever.
                raise ParseError("Unexpected token in alternation", token.position)
            current_option.extend(branch_nodes)
            i = next_i
    # BUG FIX: same trailing-empty-branch fix for input that ends without
    # a closing parenthesis ("a|" at top level).
    options.append(current_option)
    return Alternation(options=options, position=position), i
def parse_regex(pattern: str) -> List[ASTNode]:
    """Parse *pattern* into a list of AST nodes.

    A top-level alternation yields a single Alternation node whose first
    branch is the sequence parsed before the first ``|``.

    Raises:
        ParseError: If tokens remain that cannot be parsed, plus any
            error propagated from the sub-parsers.
    """
    tokens = tokenize(pattern)
    nodes, index = parse_sequence(tokens, 0)
    if index < len(tokens) and tokens[index].type == "ALTERNATION":
        # BUG FIX: the nodes parsed before the first "|" were discarded
        # (parse_alternation was called *at* the "|", prepending a bogus
        # empty branch); build the alternation here with `nodes` as the
        # first branch instead.
        alternation_position = tokens[index].position
        options: List[List[ASTNode]] = [nodes]
        while index < len(tokens) and tokens[index].type == "ALTERNATION":
            index += 1  # consume the "|"
            branch, index = parse_sequence(tokens, index)
            options.append(branch)
        if index < len(tokens):
            remaining = "".join(t.value for t in tokens[index:])
            raise ParseError(f"Unexpected token at position {index}: {remaining!r}", tokens[index].position)
        return [Alternation(options=options, position=alternation_position)]
    if index < len(tokens):
        remaining = "".join(t.value for t in tokens[index:])
        raise ParseError(f"Unexpected token at position {index}: {remaining!r}", tokens[index].position)
    return nodes