"""Regex patterns and a scoring heuristic for classifying CLI output text."""

import re
from dataclasses import dataclass


@dataclass
class Pattern:
    """Represents a regex pattern for detecting CLI output types."""
    # Label returned by detect_pattern_type when this pattern scores highest.
    name: str
    # Compiled regex tried against the whole text and against the first line.
    pattern: re.Pattern
    # Score added when the regex is found anywhere in the text.
    confidence: int


# Three alternatives, each anchored per line (re.MULTILINE):
#   1. a rule made of '|' followed by whitespace/dashes, ending in '+', '-'
#      or '|' characters;
#   2. a run of letters/whitespace that starts with a capital and contains at
#      least one more capital from its third character onwards;
#   3. two or more capitalised words separated by whitespace.
# Alternative 2 used to be written as `X+(?:[A-Z]X*)+$`; the nested
# quantifiers over overlapping classes backtracked exponentially on input
# such as forty capitals followed by '!'. The flattened form below accepts
# exactly the same strings without the blow-up.
TABLE_HEADER_PATTERN = re.compile(
    r'^[\s]*(?:\|[\s-]*)+[+\-|]+$|'
    r'^[A-Z][A-Za-z\s]+[A-Z][A-Za-z\s]*$|'
    r'^\s*(?:[A-Z][A-Za-z_]+(?:\s+[A-Z][A-Za-z_]+)*)\s+(?:[A-Z][A-Za-z_]+(?:\s+[A-Z][A-Za-z_]+)*)\s*$',
    re.MULTILINE
)

# Row shapes: optional leading/trailing '|' around content, a
# "left | right" split, or a '+---+' style border. Not part of PATTERNS.
TABLE_ROW_PATTERN = re.compile(
    r'^\s*\|?\s*(.+?)\s*\|?\s*$|'
    r'^\s*([^\|]+?)\s*\|\s*(.+?)\s*$|'
    r'^\s*\+[-+\+]+\+\s*$'
)

# "key: value" where the key starts with a letter or underscore.
KEY_VALUE_COLON_PATTERN = re.compile(
    r'^\s*([A-Za-z_][A-Za-z0-9_\-\.]*)\s*:\s*(.+)$',
    re.MULTILINE
)

# "key = value" with the same key shape as the colon form.
KEY_VALUE_EQUALS_PATTERN = re.compile(
    r'^\s*([A-Za-z_][A-Za-z0-9_\-\.]*)\s*=\s*(.+)$',
    re.MULTILINE
)

# NOTE(review): the three DELIMITED_* patterns and JSON_LIKE_PATTERN are
# compiled without re.MULTILINE, so when searched against multi-line text
# '^' only anchors at the very start and '$' only at the very end (or before
# a final newline). Left unchanged here because altering it shifts
# classification results -- confirm whether per-line matching was intended.

# Exactly three comma-separated fields (two commas).
DELIMITED_COMMA_PATTERN = re.compile(
    r'^\s*([^,]+),([^,]+),([^,]*)$'
)

# Two tab-separated fields.
DELIMITED_TAB_PATTERN = re.compile(
    r'^\s*([^\t]+)\t([^\t]*)\s*$'
)

# Exactly three semicolon-separated fields (two semicolons).
DELIMITED_SEMICOLON_PATTERN = re.compile(
    r'^\s*([^;]+);([^;]+);([^;]*)\s*$'
)

# Text that opens with '{' followed by a double-quoted key and a colon.
JSON_LIKE_PATTERN = re.compile(
    r'^\s*\{\s*"[^"]+"\s*:\s*'
)

# "key value": an identifier, whitespace, then one whitespace-free token.
KEY_VALUE_BLOCK_PATTERN = re.compile(
    r'^([A-Za-z_][A-Za-z0-9_\-\.]*)\s+(\S+)$',
    re.MULTILINE
)


# Order matters: when scores tie, the entry listed first wins.
PATTERNS = [
    Pattern('table', TABLE_HEADER_PATTERN, 80),
    Pattern('key_value_colon', KEY_VALUE_COLON_PATTERN, 70),
    Pattern('key_value_equals', KEY_VALUE_EQUALS_PATTERN, 65),
    Pattern('delimited_tab', DELIMITED_TAB_PATTERN, 85),
    Pattern('delimited_comma', DELIMITED_COMMA_PATTERN, 75),
    Pattern('delimited_semicolon', DELIMITED_SEMICOLON_PATTERN, 75),
    Pattern('json_like', JSON_LIKE_PATTERN, 90),
    Pattern('key_value_block', KEY_VALUE_BLOCK_PATTERN, 30),
]


def detect_pattern_type(text: str) -> str:
    """Detect the pattern type of the given text.

    Every entry in PATTERNS starts at zero and earns points from three
    sources: its regex being found in ``text`` (its ``confidence``), its
    regex matching the first line of multi-line input (+10), and
    per-line delimiter counts covering at least half of the lines.

    Args:
        text: Raw output to classify.

    Returns:
        ``'empty'`` when ``text`` is empty or whitespace only, the name of
        the highest-scoring pattern when that score is positive (ties go to
        the pattern listed first in PATTERNS), and ``'raw'`` otherwise.
    """
    if not text or not text.strip():
        return 'empty'

    # Non-blank text always yields at least one line after stripping.
    lines = text.strip().split('\n')
    line_total = len(lines)
    half = line_total * 0.5
    first_line = lines[0]

    scores = {pattern.name: 0 for pattern in PATTERNS}

    # Per-line delimiter tallies. Each later tally skips lines already
    # claimed by a higher-priority delimiter so that one line is not
    # counted towards several formats.
    tab_count = sum(1 for line in lines if '\t' in line)
    comma_count = sum(
        1 for line in lines if ',' in line and '\t' not in line
    )
    colon_count = sum(
        1 for line in lines if ':' in line and '\t' not in line
    )
    equals_count = sum(
        1 for line in lines
        if '=' in line and ':' not in line and '\t' not in line
    )
    semicolon_count = sum(
        1 for line in lines
        if ';' in line and ',' not in line
        and '=' not in line and ':' not in line
    )

    for pattern in PATTERNS:
        # Note: searched against the original, unstripped text.
        if pattern.pattern.search(text):
            scores[pattern.name] += pattern.confidence

        # Small bonus when the first line alone fits the pattern.
        if line_total > 1 and pattern.pattern.match(first_line):
            scores[pattern.name] += 10

    if tab_count >= half:
        scores['delimited_tab'] += 30

    if comma_count >= half and tab_count < half:
        scores['delimited_comma'] += 25

    if colon_count >= half:
        scores['key_value_colon'] += 25

    if equals_count >= half:
        scores['key_value_equals'] += 25

    if semicolon_count >= half:
        scores['delimited_semicolon'] += 30

    # Every line having the same number of whitespace-separated words hints
    # at an aligned table, unless tabs or commas already dominate.
    if line_total >= 2:
        words_first = len(first_line.split())
        uniform = all(len(line.split()) == words_first for line in lines[1:])
        if uniform and tab_count < half and comma_count < half:
            scores['table'] += 20

    # max() returns the first maximal item, so ties resolve in PATTERNS
    # order, the same as a stable descending sort would.
    best_name, best_score = max(
        scores.items(), key=lambda item: item[1], default=('raw', 0)
    )
    return best_name if best_score > 0 else 'raw'