Initial upload: cmdparse CLI tool with comprehensive documentation and CI/CD
This commit is contained in:
129
cmdparse/patterns.py
Normal file
129
cmdparse/patterns.py
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
"""Built-in pattern definitions for CLI output parsing."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Pattern:
    """A named, compiled regex together with its detection weight.

    Attributes:
        name: Identifier of the output type this entry stands for; it is the
            value reported when this entry scores highest.
        pattern: Compiled regular expression tested against the CLI output.
        confidence: Points credited to ``name`` when ``pattern`` is found.
    """

    name: str
    pattern: re.Pattern
    confidence: int
|
||||||
|
|
||||||
|
|
||||||
|
# Heuristic for a line that looks like a table rule or a table header.
# Compiled with re.MULTILINE, so ``^``/``$`` anchor at every line boundary.
# Note that ``\s`` also matches newlines, so a match may extend over
# several lines.
TABLE_HEADER_PATTERN = re.compile(
    # 1) Pipe rule such as "|---|---|": optional leading whitespace, one or
    #    more "|" each followed by spaces/dashes, then a run of "+", "-", "|".
    r'^[\s]*(?:\|[\s-]*)+[+\-|]+$|'
    # 2) Letters and whitespace only, starting with a capital and containing
    #    another capital at the third character or later.  The tail used to be
    #    ``(?:[A-Z][A-Za-z\s]*)+``; that nested quantifier accepts exactly the
    #    same strings but backtracks exponentially on a long run of capitals
    #    followed by a character outside the class (e.g. "AAAA...A1").
    r'^[A-Z][A-Za-z\s]+[A-Z][A-Za-z\s]*$|'
    # 3) Two or more capitalised words (letters/underscores) separated by
    #    whitespace, with optional surrounding whitespace.
    r'^\s*(?:[A-Z][A-Za-z_]+(?:\s+[A-Z][A-Za-z_]+)*)\s+(?:[A-Z][A-Za-z_]+(?:\s+[A-Z][A-Za-z_]+)*)\s*$',
    re.MULTILINE
)
|
||||||
|
|
||||||
|
# Recognises a single table row.  Compiled without re.MULTILINE, so ``^`` and
# ``$`` anchor to the whole string being tested.
# NOTE(review): for a non-empty string without newlines the first alternative
# already matches, so groups 2-3 stay None in that case - confirm intended.
TABLE_ROW_PATTERN = re.compile(
    '|'.join((
        # Row with optional outer pipes; group 1 is everything in between.
        r'^\s*\|?\s*(.+?)\s*\|?\s*$',
        # "left | right" split at the first pipe; groups 2 and 3.
        r'^\s*([^\|]+?)\s*\|\s*(.+?)\s*$',
        # ASCII border such as "+----+----+".
        r'^\s*\+[-+\+]+\+\s*$',
    ))
)
|
||||||
|
|
||||||
|
# Matches a ``key: value`` line (re.MULTILINE, so every line is a candidate).
# The key must start with a letter or underscore and may continue with
# letters, digits, "_", "-" or ".".
KEY_VALUE_COLON_PATTERN = re.compile(
    r'^\s*'
    r'([A-Za-z_][A-Za-z0-9_\-\.]*)'  # group 1: key
    r'\s*:\s*'                       # colon with optional whitespace around it
    r'(.+)$',                        # group 2: rest of the line
    flags=re.MULTILINE,
)
|
||||||
|
|
||||||
|
# Matches a ``key=value`` line (re.MULTILINE, so every line is a candidate).
# The key must start with a letter or underscore and may continue with
# letters, digits, "_", "-" or ".".
KEY_VALUE_EQUALS_PATTERN = re.compile(
    r'^\s*'
    r'([A-Za-z_][A-Za-z0-9_\-\.]*)'  # group 1: key
    r'\s*=\s*'                       # equals sign with optional whitespace
    r'(.+)$',                        # group 2: rest of the line
    flags=re.MULTILINE,
)
|
||||||
|
|
||||||
|
# Matches a string made of exactly three comma-separated fields; the first
# two must be non-empty, the third may be empty.
# NOTE(review): compiled without re.MULTILINE, so ``^``/``$`` anchor to the
# whole string, and ``[^,]`` also matches newlines.  Applied to multi-line
# text it therefore tests the text as a whole, not each line - confirm that
# this is what callers expect.
DELIMITED_COMMA_PATTERN = re.compile(
    r'^\s*'
    r'([^,]+),'   # group 1
    r'([^,]+),'   # group 2
    r'([^,]*)$'   # group 3
)
|
||||||
|
|
||||||
|
# Matches a string made of two tab-separated fields; the first must be
# non-empty, the second may be empty.
# NOTE(review): compiled without re.MULTILINE, so ``^``/``$`` anchor to the
# whole string, and ``[^\t]`` also matches newlines.  Applied to multi-line
# text it therefore tests the text as a whole, not each line - confirm that
# this is what callers expect.
DELIMITED_TAB_PATTERN = re.compile(
    r'^\s*'
    r'([^\t]+)\t'  # group 1, then the separating tab
    r'([^\t]*)'    # group 2
    r'\s*$'
)
|
||||||
|
|
||||||
|
# Matches a string made of exactly three semicolon-separated fields; the
# first two must be non-empty, the third may be empty.
# NOTE(review): compiled without re.MULTILINE, so ``^``/``$`` anchor to the
# whole string, and ``[^;]`` also matches newlines.  Applied to multi-line
# text it therefore tests the text as a whole, not each line - confirm that
# this is what callers expect.
DELIMITED_SEMICOLON_PATTERN = re.compile(
    r'^\s*'
    r'([^;]+);'  # group 1
    r'([^;]+);'  # group 2
    r'([^;]*)'   # group 3
    r'\s*$'
)
|
||||||
|
|
||||||
|
# Matches text that opens like a JSON object: an opening brace followed by a
# double-quoted key and a colon.  Only the prefix is checked; the value and
# the closing brace are not inspected.  Without re.MULTILINE, ``^`` anchors
# to the very start of the string.
JSON_LIKE_PATTERN = re.compile(
    r'^\s*\{\s*'  # optional whitespace around the opening brace
    r'"[^"]+"'    # non-empty double-quoted key
    r'\s*:\s*'    # key/value separator
)
|
||||||
|
|
||||||
|
# Matches a whitespace-separated ``key value`` pair (re.MULTILINE, so every
# line is a candidate).  The key must start the line, with no leading
# whitespace, and the value is a single run of non-whitespace characters.
KEY_VALUE_BLOCK_PATTERN = re.compile(
    r'^([A-Za-z_][A-Za-z0-9_\-\.]*)'  # group 1: key
    r'\s+'
    r'(\S+)$',                        # group 2: value
    flags=re.MULTILINE,
)
|
||||||
|
|
||||||
|
|
||||||
|
# Registry consulted by detect_pattern_type().  Order matters: when two
# entries end up with the same score, the one listed first is reported.
PATTERNS = [
    Pattern(name='table', pattern=TABLE_HEADER_PATTERN, confidence=80),
    Pattern(name='key_value_colon', pattern=KEY_VALUE_COLON_PATTERN, confidence=70),
    Pattern(name='key_value_equals', pattern=KEY_VALUE_EQUALS_PATTERN, confidence=65),
    Pattern(name='delimited_tab', pattern=DELIMITED_TAB_PATTERN, confidence=85),
    Pattern(name='delimited_comma', pattern=DELIMITED_COMMA_PATTERN, confidence=75),
    Pattern(name='delimited_semicolon', pattern=DELIMITED_SEMICOLON_PATTERN, confidence=75),
    Pattern(name='json_like', pattern=JSON_LIKE_PATTERN, confidence=90),
    Pattern(name='key_value_block', pattern=KEY_VALUE_BLOCK_PATTERN, confidence=30),
]
|
||||||
|
|
||||||
|
|
||||||
|
def detect_pattern_type(text: str) -> str:
    """Detect the pattern type of the given text.

    Every entry in ``PATTERNS`` starts at a score of 0 and collects points:

    * its ``confidence`` when its regex is found anywhere in ``text``;
    * 10 more when the input has several lines and the regex matches at the
      start of the first line;
    * fixed bonuses for the delimiter, key-value and table entries when at
      least half of the lines look like that format.

    Args:
        text: Raw CLI output to classify.

    Returns:
        ``'empty'`` when ``text`` is empty or whitespace-only; otherwise the
        name of the highest-scoring entry (the earliest entry in ``PATTERNS``
        wins ties), or ``'raw'`` when no entry scored above 0.
    """
    if not text or not text.strip():
        return 'empty'

    # The stripped text is non-empty here and str.split() never returns an
    # empty list, so there is always a first line.
    lines = text.strip().split('\n')
    first_line = lines[0]
    multi_line = len(lines) > 1
    half = len(lines) * 0.5

    # Number of lines containing each delimiter.  The exclusions keep a line
    # from being counted for a delimiter that is checked later in this list.
    tab_count = sum('\t' in line for line in lines)
    comma_count = sum(',' in line and '\t' not in line for line in lines)
    colon_count = sum(':' in line and '\t' not in line for line in lines)
    equals_count = sum(
        '=' in line and ':' not in line and '\t' not in line for line in lines
    )
    semicolon_count = sum(
        ';' in line and ',' not in line and '=' not in line and ':' not in line
        for line in lines
    )

    scores = {pattern.name: 0 for pattern in PATTERNS}

    for pattern in PATTERNS:
        # NOTE(review): the regex is searched against the original,
        # unstripped text as a whole, so whether ``^``/``$`` anchor per line
        # depends on the flags each regex was compiled with.
        if pattern.pattern.search(text):
            scores[pattern.name] += pattern.confidence
        # The first line is treated as a potential header row.
        if multi_line and pattern.pattern.match(first_line):
            scores[pattern.name] += 10

    tab_heavy = tab_count >= half
    comma_heavy = comma_count >= half

    # The bonuses below assume PATTERNS contains entries with these names.
    if tab_heavy:
        scores['delimited_tab'] += 30
    if comma_heavy and not tab_heavy:
        scores['delimited_comma'] += 25
    if colon_count >= half:
        scores['key_value_colon'] += 25
    if equals_count >= half:
        scores['key_value_equals'] += 25
    if semicolon_count >= half:
        scores['delimited_semicolon'] += 30

    # Whitespace-aligned table: every line has as many whitespace-separated
    # words as the first one, and neither tabs nor commas dominate.
    if multi_line and not tab_heavy and not comma_heavy:
        column_count = len(first_line.split())
        if all(len(line.split()) == column_count for line in lines[1:]):
            scores['table'] += 20

    # max() returns the first maximal item, which matches taking element 0 of
    # a stable descending sort; the default covers an empty PATTERNS list.
    best_name, best_score = max(
        scores.items(), key=lambda item: item[1], default=(None, 0)
    )
    if best_score > 0:
        return best_name

    return 'raw'
|
||||||
Reference in New Issue
Block a user