From b1149c5f1cab8cd88598c6080c551a7465bf83df Mon Sep 17 00:00:00 2001
From: 7000pctAUTO
Date: Mon, 2 Feb 2026 07:24:46 +0000
Subject: [PATCH] fix: add missing tokenizer.py module

---
 regex_humanizer/parser/tokenizer.py | 119 ++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 regex_humanizer/parser/tokenizer.py

diff --git a/regex_humanizer/parser/tokenizer.py b/regex_humanizer/parser/tokenizer.py
new file mode 100644
index 0000000..5465660
--- /dev/null
+++ b/regex_humanizer/parser/tokenizer.py
@@ -0,0 +1,119 @@
+"""Tokenize regex patterns into tokens."""
+
+from dataclasses import dataclass
+from typing import List
+import re
+
+# Token specification: (name, pattern[, skip]) entries, tried in order.
+# An entry with skip=True is matched but never emitted as a token.
+TOKEN_SPECIFICATION = [
+    ("LITERAL", r"[a-zA-Z0-9]+"),
+    ("ESCAPED", r"\\."),
+    ("OPEN_GROUP", r"\("),
+    ("CLOSE_GROUP", r"\)"),
+    ("OPEN_BRACE", r"\{"),
+    ("CLOSE_BRACE", r"\}"),
+    ("OPEN_BRACKET", r"\["),
+    ("CLOSE_BRACKET", r"\]"),
+    ("ANCHOR_START", r"\^"),
+    ("ANCHOR_END", r"\$"),
+    ("DOT", r"\."),
+    ("ALTERNATION", r"\|"),
+    ("COMMA", r","),
+    ("HYPHEN", r"-"),
+    ("PLUS", r"\+"),
+    ("STAR", r"\*"),
+    ("QUESTION", r"\?"),
+    ("WHITESPACE", r"\s+", True),
+    ("MISMATCH", r"."),
+]
+
+
+@dataclass
+class Token:
+    """Represents a token in a regex pattern."""
+
+    type: str
+    value: str
+    position: int
+
+
+class TokenizerError(Exception):
+    """Raised when tokenization fails."""
+
+    pass
+
+
+def tokenize(pattern: str) -> List[Token]:
+    """Tokenize a regex pattern into a list of tokens.
+
+    Args:
+        pattern: The regex pattern to tokenize.
+
+    Returns:
+        A list of Token objects.
+    """
+    tokens = []
+    position = 0
+    length = len(pattern)
+
+    while position < length:
+        match = None
+        for token_type, spec, *flags in TOKEN_SPECIFICATION:
+            is_skipped = flags and flags[0]
+            regex = re.compile(spec)
+            match = regex.match(pattern, position)
+            if match:
+                value = match.group(0)
+                if is_skipped:
+                    # Skipped tokens (whitespace) are consumed, not emitted.
+                    position = match.end()
+                else:
+                    tokens.append(Token(type=token_type, value=value, position=position))
+                    position = match.end()
+                break
+        if not match:
+            raise TokenizerError(f"unexpected character at position {position}: {pattern[position]!r}")
+
+    tokens = _combine_tokens(tokens)
+    return tokens
+
+
+def _combine_tokens(tokens: List[Token]) -> List[Token]:
+    """Combine token runs that should be treated as single tokens."""
+    result = []
+    i = 0
+    while i < len(tokens):
+        token = tokens[i]
+
+        # "(" "?" ":" -> non-capturing group opener "(?:". The colon is
+        # produced by the MISMATCH catch-all, so only its value is checked.
+        if token.type == "OPEN_GROUP" and i + 2 < len(tokens):
+            q_token = tokens[i + 1]
+            colon_token = tokens[i + 2]
+            if q_token.type == "QUESTION" and colon_token.value == ":":
+                result.append(Token(type="NON_CAPTURING_GROUP", value="(?:", position=token.position))
+                i += 3
+                continue
+
+        # "[" "^" -> negated character class opener "[^".
+        if token.type == "OPEN_BRACKET" and i + 1 < len(tokens):
+            next_token = tokens[i + 1]
+            if next_token.type == "ANCHOR_START":
+                result.append(Token(type="INVERTED_BRACKET", value="[^", position=token.position))
+                i += 2
+                continue
+
+        # Quantifier followed by "?" -> lazy quantifier, e.g. "+?".
+        if token.type in ("PLUS", "STAR", "QUESTION") and i + 1 < len(tokens):
+            next_token = tokens[i + 1]
+            if next_token.type == "QUESTION":
+                combined_type = f"{token.type}_LAZY"
+                result.append(Token(type=combined_type, value=token.value + next_token.value, position=token.position))
+                i += 2
+                continue
+
+        result.append(token)
+        i += 1
+
+    return result
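
A minimal usage sketch (illustrative only, not part of the patch; it assumes
the package is importable as regex_humanizer.parser.tokenizer):

    from regex_humanizer.parser.tokenizer import tokenize

    for tok in tokenize(r"(?:ab)+?[^0-9]*$"):
        print(tok.type, tok.value, tok.position)

    # Under the specification above this prints:
    #   NON_CAPTURING_GROUP (?: 0
    #   LITERAL ab 3
    #   CLOSE_GROUP ) 5
    #   PLUS_LAZY +? 6
    #   INVERTED_BRACKET [^ 8
    #   LITERAL 0 10
    #   HYPHEN - 11
    #   LITERAL 9 12
    #   CLOSE_BRACKET ] 13
    #   STAR * 14
    #   ANCHOR_END $ 15

Note that "(?:", "[^", and the lazy quantifier "+?" arrive as single tokens
because _combine_tokens merges them after the raw character-level pass.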