fix: add missing tokenizer.py module
regex_humanizer/parser/tokenizer.py (new file, 108 additions)
@@ -0,0 +1,108 @@
"""Tokenize regex patterns into tokens."""

from dataclasses import dataclass
from typing import List, Optional

import re

TOKEN_SPECIFICATION = [
    ("LITERAL", r"[a-zA-Z0-9]+"),
    ("ESCAPED", r"\\."),
    ("OPEN_GROUP", r"\("),
    ("CLOSE_GROUP", r"\)"),
    ("OPEN_BRACE", r"\{"),
    ("CLOSE_BRACE", r"\}"),
    ("OPEN_BRACKET", r"\["),
    ("CLOSE_BRACKET", r"\]"),
    ("ANCHOR_START", r"\^"),
    ("ANCHOR_END", r"\$"),
    ("DOT", r"\."),
    ("ALTERNATION", r"\|"),
    ("COMMA", r","),
    ("HYPHEN", r"-"),
    ("PLUS", r"\+"),
    ("STAR", r"\*"),
    ("QUESTION", r"\?"),
    # A third entry marks the token as skippable.
    ("WHITESPACE", r"\s+", True),
    # Fallback: any single character not matched above.
    ("MISMATCH", r"."),
]

@dataclass
class Token:
    """Represents a token in a regex pattern."""

    type: str
    value: str
    position: int


class TokenizerError(Exception):
    """Raised when tokenization fails."""

    pass

def tokenize(pattern: str) -> List[Token]:
    """Tokenize a regex pattern into a list of tokens.

    Args:
        pattern: The regex pattern to tokenize.

    Returns:
        A list of Token objects.
    """
    tokens = []
    position = 0
    length = len(pattern)

    while position < length:
        match = None
        for token_type, spec, *flags in TOKEN_SPECIFICATION:
            # A third entry in the spec marks tokens to skip (e.g. whitespace).
            is_skipped = flags and flags[0]
            regex = re.compile(spec)
            match = regex.match(pattern, position)
            if match:
                value = match.group(0)
                if is_skipped:
                    position = match.end()
                else:
                    tokens.append(Token(type=token_type, value=value, position=position))
                    position = match.end()
                break
        if not match:
            raise TokenizerError(
                f"Unexpected character at position {position}: {pattern[position]!r}"
            )

    tokens = _combine_tokens(tokens)
    return tokens

def _combine_tokens(tokens: List[Token]) -> List[Token]:
    """Combine tokens that should be treated as single tokens."""
    result = []
    i = 0
    while i < len(tokens):
        token = tokens[i]

        # "(" "?" ":" becomes a single non-capturing group opener.
        if token.type == "OPEN_GROUP" and i + 2 < len(tokens):
            q_token = tokens[i + 1]
            colon_token = tokens[i + 2]
            # ":" has no dedicated rule and lexes as MISMATCH, so match
            # on the token value rather than its type.
            if q_token.type == "QUESTION" and colon_token.value == ":":
                result.append(Token(type="NON_CAPTURING_GROUP", value="(?:", position=token.position))
                i += 3
                continue

        # "[" "^" becomes a single inverted-bracket opener.
        if token.type == "OPEN_BRACKET" and i + 1 < len(tokens):
            next_token = tokens[i + 1]
            if next_token.type == "ANCHOR_START":
                result.append(Token(type="INVERTED_BRACKET", value="[^", position=token.position))
                i += 2
                continue

        # A quantifier followed by "?" becomes its lazy variant.
        if token.type in ("PLUS", "STAR", "QUESTION") and i + 1 < len(tokens):
            next_token = tokens[i + 1]
            if next_token.type == "QUESTION":
                combined_type = f"{token.type}_LAZY"
                result.append(Token(type=combined_type, value=token.value + next_token.value, position=token.position))
                i += 2
                continue

        result.append(token)
        i += 1

    return result
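
For reviewers: a minimal usage sketch, not part of this commit. The import path follows the file added above; the sample pattern is illustrative only.

from regex_humanizer.parser.tokenizer import tokenize

# "[^a-z]+?(?:foo)$" exercises all three combining rules: "[" + "^" folds
# into INVERTED_BRACKET, "+" + "?" into PLUS_LAZY, and "(" + "?" + ":"
# into NON_CAPTURING_GROUP.
for token in tokenize(r"[^a-z]+?(?:foo)$"):
    print(token.type, token.value, token.position)

Per the combining rules in _combine_tokens, this prints the token types INVERTED_BRACKET, LITERAL, HYPHEN, LITERAL, CLOSE_BRACKET, PLUS_LAZY, NON_CAPTURING_GROUP, LITERAL, CLOSE_GROUP, ANCHOR_END, in that order.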