Initial upload: cmdparse CLI tool with comprehensive documentation and CI/CD
This commit is contained in:
129
cmdparse/patterns.py
Normal file
129
cmdparse/patterns.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Built-in pattern definitions for CLI output parsing."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class Pattern:
    """A named, pre-compiled regular expression used to classify CLI output.

    Attributes:
        name: Label of the output type this entry detects.
        pattern: Compiled regular expression tested against the output.
        confidence: Points credited to ``name`` when ``pattern`` matches.
    """

    name: str
    pattern: re.Pattern
    confidence: int
|
||||
|
||||
|
||||
# Matches a line that looks like a table header or separator. Three shapes
# are accepted:
#   1. a pipe-delimited rule or border such as "|---|---|";
#   2. a line made only of letters and whitespace that starts with a capital
#      and has another capital at its third character or later ("Name Age");
#   3. two or more whitespace-separated capitalised words of at least two
#      characters each ("Name  Status").
# Alternative 2 was previously written with a repeated group,
# "[A-Za-z\s]+(?:[A-Z][A-Za-z\s]*)+", whose iterations overlap one another:
# a long run of capitals followed by a non-letter forced the engine to try
# every way of splitting the run and backtrack exponentially. The form below
# accepts exactly the same strings, but "[a-z\s]*" can only stop at the first
# later capital, so there is a single way to split a line.
TABLE_HEADER_PATTERN = re.compile(
    r'^[\s]*(?:\|[\s-]*)+[+\-|]+$|'
    r'^[A-Z][A-Za-z\s][a-z\s]*[A-Z][A-Za-z\s]*$|'
    r'^\s*(?:[A-Z][A-Za-z_]+(?:\s+[A-Z][A-Za-z_]+)*)\s+(?:[A-Z][A-Za-z_]+(?:\s+[A-Z][A-Za-z_]+)*)\s*$',
    re.MULTILINE
)
|
||||
|
||||
# One table row: an optionally pipe-wrapped row, a "left | right" pair, or an
# ASCII border such as "+----+----+".
# NOTE(review): on a single-line input the first alternative already accepts
# any non-empty line, so the later alternatives appear to matter only for
# multi-line strings -- confirm this is intended.
TABLE_ROW_PATTERN = re.compile(
    r'^\s*\|?\s*(.+?)\s*\|?\s*$|'
    r'^\s*([^\|]+?)\s*\|\s*(.+?)\s*$|'
    r'^\s*\+[-+\+]+\+\s*$'
)

# "key: value" on any line of the text; group 1 is the key, group 2 the value.
KEY_VALUE_COLON_PATTERN = re.compile(
    r'^\s*([A-Za-z_][A-Za-z0-9_\-\.]*)\s*:\s*(.+)$',
    flags=re.MULTILINE,
)

# "key = value" on any line of the text; same groups as the colon variant.
KEY_VALUE_EQUALS_PATTERN = re.compile(
    r'^\s*([A-Za-z_][A-Za-z0-9_\-\.]*)\s*=\s*(.+)$',
    flags=re.MULTILINE,
)

# Exactly three comma-separated fields (the last one may be empty). Compiled
# without re.MULTILINE, so the anchors refer to the whole string.
DELIMITED_COMMA_PATTERN = re.compile(
    r'^\s*([^,]+),([^,]+),([^,]*)$'
)

# Two tab-separated fields (the second one may be empty), anchored to the
# whole string.
DELIMITED_TAB_PATTERN = re.compile(
    r'^\s*([^\t]+)\t([^\t]*)\s*$'
)

# Exactly three semicolon-separated fields (the last one may be empty),
# anchored to the whole string.
DELIMITED_SEMICOLON_PATTERN = re.compile(
    r'^\s*([^;]+);([^;]+);([^;]*)\s*$'
)

# Opening of a JSON object: "{", a double-quoted key, then a colon. Only the
# start of the string is examined.
JSON_LIKE_PATTERN = re.compile(
    r'^\s*\{\s*"[^"]+"\s*:\s*'
)

# "key value" on any line: an identifier, whitespace, then one token with no
# whitespace in it.
KEY_VALUE_BLOCK_PATTERN = re.compile(
    r'^([A-Za-z_][A-Za-z0-9_\-\.]*)\s+(\S+)$',
    flags=re.MULTILINE,
)
|
||||
|
||||
|
||||
# Registry of the output types that can be detected, each paired with the
# regex that recognises it and the points a match is worth. The order is
# significant: when two types end up with the same score, the one listed
# first is the one reported.
PATTERNS = [
    Pattern(name='table', pattern=TABLE_HEADER_PATTERN, confidence=80),
    Pattern(name='key_value_colon', pattern=KEY_VALUE_COLON_PATTERN, confidence=70),
    Pattern(name='key_value_equals', pattern=KEY_VALUE_EQUALS_PATTERN, confidence=65),
    Pattern(name='delimited_tab', pattern=DELIMITED_TAB_PATTERN, confidence=85),
    Pattern(name='delimited_comma', pattern=DELIMITED_COMMA_PATTERN, confidence=75),
    Pattern(name='delimited_semicolon', pattern=DELIMITED_SEMICOLON_PATTERN, confidence=75),
    Pattern(name='json_like', pattern=JSON_LIKE_PATTERN, confidence=90),
    Pattern(name='key_value_block', pattern=KEY_VALUE_BLOCK_PATTERN, confidence=30),
]
|
||||
|
||||
|
||||
def detect_pattern_type(text: str) -> str:
    """Detect the pattern type of the given text.

    Every entry of ``PATTERNS`` starts at zero and collects points from:

    * its regex matching the text (worth the entry's ``confidence``), plus
      10 more when the text has several lines and the first line matches;
    * delimiter heuristics: a bonus for the tab, comma, colon, equals or
      semicolon type when at least half of the lines contain that character;
    * a column heuristic: a bonus for ``'table'`` when every line has the
      same number of whitespace-separated words and the text is not
      predominantly tab- or comma-delimited.

    Args:
        text: Raw CLI output to classify.

    Returns:
        ``'empty'`` when ``text`` is empty or whitespace-only; otherwise the
        name of the highest-scoring pattern, with ties going to the pattern
        listed first in ``PATTERNS``; ``'raw'`` when nothing scored.
    """
    if not text or not text.strip():
        return 'empty'

    # A non-blank string always splits into at least one line, so no further
    # emptiness check is needed from here on.
    lines = text.strip().split('\n')
    first_line = lines[0]
    half = len(lines) * 0.5

    scores = {pattern.name: 0 for pattern in PATTERNS}

    # Each counter skips lines that contain a higher-priority delimiter so a
    # single line is not credited to several formats at once.
    tab_count = sum(1 for line in lines if '\t' in line)
    comma_count = sum(1 for line in lines if ',' in line and '\t' not in line)
    colon_count = sum(1 for line in lines if ':' in line and '\t' not in line)
    equals_count = sum(
        1 for line in lines
        if '=' in line and ':' not in line and '\t' not in line
    )
    semicolon_count = sum(
        1 for line in lines
        if ';' in line and ',' not in line and '=' not in line and ':' not in line
    )

    for pattern in PATTERNS:
        regex = pattern.pattern
        # Some patterns (the delimited_* family) are anchored with ^...$ but
        # compiled without re.MULTILINE, so a whole-text search only succeeds
        # when the entire output looks like one record. Trying every line as
        # well lets them earn their confidence on multi-line output too.
        if regex.search(text) or any(regex.search(line) for line in lines):
            scores[pattern.name] += pattern.confidence
            if len(lines) > 1 and regex.match(first_line):
                scores[pattern.name] += 10

    if tab_count >= half:
        scores['delimited_tab'] += 30

    if comma_count >= half and tab_count < half:
        scores['delimited_comma'] += 25

    if colon_count >= half:
        scores['key_value_colon'] += 25

    if equals_count >= half:
        scores['key_value_equals'] += 25

    if semicolon_count >= half:
        scores['delimited_semicolon'] += 30

    # Uniform word counts across all lines suggest aligned columns, unless the
    # text is already dominated by tabs or commas.
    if len(lines) >= 2 and tab_count < half and comma_count < half:
        words_first = len(first_line.split())
        if all(len(line.split()) == words_first for line in lines[1:]):
            scores['table'] += 20

    # max() returns the first maximal key in insertion order, i.e. the
    # earliest entry of PATTERNS among equally scored candidates.
    best = max(scores, key=scores.get, default=None)
    if best is not None and scores[best] > 0:
        return best

    return 'raw'
|
||||
Reference in New Issue
Block a user