diff --git a/src/gdiffer/language_detector.py b/src/gdiffer/language_detector.py index 344b600..0d6ad3c 100644 --- a/src/gdiffer/language_detector.py +++ b/src/gdiffer/language_detector.py @@ -1,75 +1,206 @@ """Language detection for code files.""" -from typing import Optional class LanguageDetector: + """Detects programming language from file extensions and content.""" + EXTENSION_MAP = { - 'py': 'python', 'pyw': 'python', 'pyx': 'python', - 'js': 'javascript', 'mjs': 'javascript', 'cjs': 'javascript', 'jsx': 'javascript', - 'ts': 'typescript', 'tsx': 'typescript', 'mts': 'typescript', 'cts': 'typescript', - 'java': 'java', 'kt': 'kotlin', 'kts': 'kotlin', - 'go': 'go', 'rs': 'rust', 'c': 'c', 'h': 'c', - 'cpp': 'cpp', 'cc': 'cpp', 'cxx': 'cpp', 'hpp': 'cpp', 'hxx': 'cpp', - 'cs': 'csharp', 'rb': 'ruby', 'erb': 'ruby', 'php': 'php', - 'swift': 'swift', 'm': 'objective-c', 'mm': 'objective-c', - 'scala': 'scala', 'sc': 'scala', 'jl': 'julia', - 'r': 'r', 'R': 'r', 'lua': 'lua', - 'pl': 'perl', 'pm': 'perl', 'sql': 'sql', - 'sh': 'bash', 'bash': 'bash', 'zsh': 'bash', 'fish': 'bash', - 'yaml': 'yaml', 'yml': 'yaml', 'json': 'json', - 'xml': 'xml', 'html': 'html', 'htm': 'html', - 'css': 'css', 'scss': 'scss', 'sass': 'sass', 'less': 'less', - 'md': 'markdown', 'markdown': 'markdown', - 'txt': 'text', 'dockerfile': 'dockerfile', 'Dockerfile': 'dockerfile', + 'py': 'python', + 'pyw': 'python', + 'pyx': 'python', + 'js': 'javascript', + 'mjs': 'javascript', + 'cjs': 'javascript', + 'jsx': 'javascript', + 'ts': 'typescript', + 'tsx': 'typescript', + 'mts': 'typescript', + 'cts': 'typescript', + 'java': 'java', + 'kt': 'kotlin', + 'kts': 'kotlin', + 'go': 'go', + 'rs': 'rust', + 'c': 'c', + 'h': 'c', + 'cpp': 'cpp', + 'cc': 'cpp', + 'cxx': 'cpp', + 'hpp': 'cpp', + 'hxx': 'cpp', + 'cs': 'csharp', + 'rb': 'ruby', + 'erb': 'ruby', + 'php': 'php', + 'swift': 'swift', + 'm': 'objective-c', + 'mm': 'objective-c', + 'scala': 'scala', + 'sc': 'scala', + 'jl': 'julia', + 'r': 'r', + 'R': 'r', + 'lua': 'lua', + 'pl': 'perl', + 'pm': 'perl', + 'sql': 'sql', + 'sh': 'bash', + 'bash': 'bash', + 'zsh': 'bash', + 'fish': 'bash', + 'yaml': 'yaml', + 'yml': 'yaml', + 'json': 'json', + 'xml': 'xml', + 'html': 'html', + 'htm': 'html', + 'css': 'css', + 'scss': 'scss', + 'sass': 'sass', + 'less': 'less', + 'md': 'markdown', + 'markdown': 'markdown', + 'txt': 'text', + 'dockerfile': 'dockerfile', + 'Dockerfile': 'dockerfile', } CONTENT_PATTERNS = { - 'python': [r'^import\s+\w+', r'^from\s+\w+\s+import', r'^def\s+\w+\s*\(', r'^class\s+\w+'], - 'javascript': [r'^const\s+\w+', r'^let\s+\w+', r'^var\s+\w+', r'^function\s+\w+', r'=>\s*\{'], - 'typescript': [r'^interface\s+\w+', r'^type\s+\w+', r':\s*(string|number|boolean)'], - 'java': [r'^package\s+[\w.]+;', r'^import\s+[\w.]+;', r'^public\s+class\s+\w+'], - 'go': [r'^package\s+\w+', r'^import\s+\(', r'func\s+\w+'], - 'rust': [r'^fn\s+\w+', r'^impl\s+\w+', r'^struct\s+\w+', r'^enum\s+\w+'], - 'c': [r'#include\s*<', r'#include\s*"', r'int\s+main\s*\('], - 'cpp': [r'#include\s*<', r'#include\s*"', r'class\s+\w+', r'std::\w+'], - 'ruby': [r'^require\s+', r'^class\s+\w+', r'^module\s+\w+', r'def\s+\w+'], - 'php': [r'<\?php', r'\$\w+\s*=', r'function\s+\w+', r'class\s+\w+'], + 'python': [ + r'^import\s+\w+', + r'^from\s+\w+\s+import', + r'^def\s+\w+\s*\(', + r'^class\s+\w+\s*[:\(]', + r'^if\s+__name__\s*==\s*['"]__main__['"]', + ], + 'javascript': [ + r'^const\s+\w+\s*=', + r'^let\s+\w+\s*=', + r'^var\s+\w+\s*=', + r'^function\s+\w+\s*\(', + r'=>\s*\{', + r'import\s+.*\s+from', + r'export\s+(default\s+)?', + ], + 'typescript': [ + r'^interface\s+\w+\s*\{', + r'^type\s+\w+\s*=', + r':\s*(string|number|boolean|any|void|null|undefined)', + r'<[A-Z]\w*>', + ], + 'java': [ + r'^package\s+[\w.]+;', + r'^import\s+[\w.]+;', + r'^public\s+(class|interface|enum)\s+\w+', + r'^private\s+(static\s+)?(final\s+)?\w+\s+\w+;', + ], + 'go': [ + r'^package\s+\w+', + r'^import\s*\(', + r'func\s+\w+\s*\(', + r':=', + r'go\s+func', + ], + 'rust': [ + r'^fn\s+\w+\s*\(', + r'^impl\s+\w+', + r'^struct\s+\w+', + r'^enum\s+\w+', + r'let\s+mut\s+\w+', + r'->\s*\w+', + ], + 'c': [ + r'#include\s*<', + r'#include\s*"', + r'int\s+main\s*\(', + r'struct\s+\w+\s*\{', + r'void\s+\?\s*\w+\s*\(', + ], + 'cpp': [ + r'#include\s*<', + r'#include\s*"', + r'class\s+\w+\s*(:\s*public)?', + r'std::\w+', + r'using\s+namespace\s+std', + ], + 'ruby': [ + r'^require\s+['"]', + r'^class\s+\w+(\s*<\s*\w+)?', + r'^module\s+\w+', + r'def\s+\w+', + r'puts\s+', + r'puts!', + ], + 'php': [ + r'\<\?php', + r'\$\w+\s*=', + r'function\s+\w+\s*\(', + r'class\s+\w+\s*\{', + ], } - def detect_from_filename(self, filename: str) -> Optional[str]: + def __init__(self): + self._tree_sitter_languages = {} + + def detect_from_filename(self, filename: str) -> str | None: + """Detect language from file extension.""" if '.' not in filename: return None + ext = filename.rsplit('.', 1)[-1].lower() return self.EXTENSION_MAP.get(ext) - def detect_from_content(self, content: str) -> Optional[str]: + def detect_from_content(self, content: str) -> str | None: + """Detect language from file content patterns.""" first_lines = '\n'.join(content.splitlines()[:50]) - scores = {} + + scores: dict[str, int] = {} + for lang, patterns in self.CONTENT_PATTERNS.items(): - import re - score = sum(len(re.findall(p, first_lines, re.MULTILINE)) for p in patterns) + score = 0 + for pattern in patterns: + import re + matches = len(re.findall(pattern, first_lines, re.MULTILINE)) + score += matches + if score > 0: scores[lang] = score - return max(scores, key=scores.get) if scores else None + + if scores: + best_lang = max(scores, key=scores.get) + return best_lang + + return None def detect(self, filename: str, content: str = "") -> str: + """Detect language from filename and optionally content.""" ext_lang = self.detect_from_filename(filename) + if ext_lang and ext_lang not in ['text', 'markdown', 'json', 'yaml', 'xml', 'html', 'css', 'dockerfile']: + if content: + content_lang = self.detect_from_content(content) + if content_lang and content_lang != ext_lang: + return content_lang return ext_lang + if content: content_lang = self.detect_from_content(content) if content_lang: return content_lang + return ext_lang or "text" def get_supported_languages(self) -> list[str]: + """Return list of supported languages.""" return sorted(set(self.EXTENSION_MAP.values())) def is_language_supported(self, language: str) -> bool: + """Check if a language is supported.""" return language in self.get_supported_languages() def detect_language(filename: str, content: str = "") -> str: + """Detect programming language from filename and content.""" detector = LanguageDetector() return detector.detect(filename, content)