"""Tokenize regex patterns into tokens."""

import re
from dataclasses import dataclass
from typing import List

# Token specification: (name, pattern) pairs, with an optional third flag
# marking token types that are matched but skipped (e.g. whitespace).
TOKEN_SPECIFICATION = [
    ("LITERAL", r"[a-zA-Z0-9]+"),
    ("ESCAPED", r"\\."),
    ("OPEN_GROUP", r"\("),
    ("CLOSE_GROUP", r"\)"),
    ("OPEN_BRACE", r"\{"),
    ("CLOSE_BRACE", r"\}"),
    ("OPEN_BRACKET", r"\["),
    ("CLOSE_BRACKET", r"\]"),
    ("ANCHOR_START", r"\^"),
    ("ANCHOR_END", r"\$"),
    ("DOT", r"\."),
    ("ALTERNATION", r"\|"),
    ("COMMA", r","),
    ("HYPHEN", r"-"),
    ("PLUS", r"\+"),
    ("STAR", r"\*"),
    ("QUESTION", r"\?"),
    ("WHITESPACE", r"\s+", True),  # matched but not emitted
    ("MISMATCH", r"."),  # catch-all for any other single character
]
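
# Matching is first-match-wins: tokenize() tries the patterns above in list
# order at each position, so escape sequences are consumed before bare
# metacharacters, WHITESPACE is matched but dropped, and MISMATCH is a
# one-character catch-all (it is how ":" gets a token, for example).
# A small illustration (hypothetical input, not from the original module):
#
#     tokenize(r"\d+")
#     # -> [Token(type="ESCAPED", value="\\d", position=0),
#     #     Token(type="PLUS", value="+", position=2)]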


@dataclass
class Token:
    """Represents a single token in a regex pattern."""

    type: str
    value: str
    position: int


class TokenizerError(Exception):
    """Raised when tokenization fails."""


def tokenize(pattern: str) -> List[Token]:
    """Tokenize a regex pattern into a list of tokens.

    Args:
        pattern: The regex pattern to tokenize.

    Returns:
        A list of Token objects.
    """
    tokens = []
    position = 0
    length = len(pattern)

    while position < length:
        match = None
        for token_type, spec, *flags in TOKEN_SPECIFICATION:
            is_skipped = flags and flags[0]
            regex = re.compile(spec)
            match = regex.match(pattern, position)
            if match:
                value = match.group(0)
                if not is_skipped:
                    tokens.append(
                        Token(type=token_type, value=value, position=position)
                    )
                position = match.end(0)
                break
        if not match:
            raise TokenizerError(
                f"unexpected character at position {position}: "
                f"{pattern[position]!r}"
            )

    return _combine_tokens(tokens)
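

# Illustration of the combining pass below (hypothetical pattern, shown for
# context): tokenize("a+?") first scans LITERAL("a"), PLUS("+"),
# QUESTION("?"); _combine_tokens() then folds PLUS + QUESTION into a single
# PLUS_LAZY token with value "+?".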
def _combine_tokens(tokens: List[Token]) -> List[Token]:
    """Combine adjacent tokens that should be treated as single tokens."""
    result = []
    i = 0
    while i < len(tokens):
        token = tokens[i]

        # "(" "?" ":" -> one non-capturing group opener "(?:".  Under the
        # specification above, ":" is scanned as MISMATCH, so only the
        # token's value is checked here.
        if token.type == "OPEN_GROUP" and i + 2 < len(tokens):
            q_token = tokens[i + 1]
            colon_token = tokens[i + 2]
            if q_token.type == "QUESTION" and colon_token.value == ":":
                result.append(
                    Token(
                        type="NON_CAPTURING_GROUP",
                        value="(?:",
                        position=token.position,
                    )
                )
                i += 3
                continue

        # "[" "^" -> one inverted character-class opener "[^".
        if token.type == "OPEN_BRACKET" and i + 1 < len(tokens):
            next_token = tokens[i + 1]
            if next_token.type == "ANCHOR_START":
                result.append(
                    Token(
                        type="INVERTED_BRACKET",
                        value="[^",
                        position=token.position,
                    )
                )
                i += 2
                continue

        # A quantifier followed by "?" -> its lazy variant (e.g. PLUS_LAZY).
        if token.type in ("PLUS", "STAR", "QUESTION") and i + 1 < len(tokens):
            next_token = tokens[i + 1]
            if next_token.type == "QUESTION":
                combined_type = f"{token.type}_LAZY"
                result.append(
                    Token(
                        type=combined_type,
                        value=token.value + next_token.value,
                        position=token.position,
                    )
                )
                i += 2
                continue

        result.append(token)
        i += 1

    return result
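

# A minimal smoke test, runnable as a script.  The pattern is a hypothetical
# example chosen to exercise all three combining rules; it is not part of the
# original module.
if __name__ == "__main__":
    for tok in tokenize("a+?(?:bc)[^d]"):
        print(f"{tok.position:3d}  {tok.type:20s}  {tok.value!r}")
    # Expected token types, in order: LITERAL, PLUS_LAZY,
    # NON_CAPTURING_GROUP, LITERAL, CLOSE_GROUP, INVERTED_BRACKET,
    # LITERAL, CLOSE_BRACKET.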