Files
git-diff-explainer-cli/src/gdiffer/language_detector.py
7000pctAUTO 4e81287aca
Some checks failed
CI / test (3.10) (push) Has been cancelled
CI / test (3.11) (push) Has been cancelled
CI / test (3.12) (push) Has been cancelled
CI / lint (push) Has been cancelled
CI / build (push) Has been cancelled
fix: resolve CI linting errors - remove unused imports and update type annotations
2026-02-02 14:39:08 +00:00

207 lines
5.7 KiB
Python

"""Language detection for code files."""
class LanguageDetector:
"""Detects programming language from file extensions and content."""
EXTENSION_MAP = {
'py': 'python',
'pyw': 'python',
'pyx': 'python',
'js': 'javascript',
'mjs': 'javascript',
'cjs': 'javascript',
'jsx': 'javascript',
'ts': 'typescript',
'tsx': 'typescript',
'mts': 'typescript',
'cts': 'typescript',
'java': 'java',
'kt': 'kotlin',
'kts': 'kotlin',
'go': 'go',
'rs': 'rust',
'c': 'c',
'h': 'c',
'cpp': 'cpp',
'cc': 'cpp',
'cxx': 'cpp',
'hpp': 'cpp',
'hxx': 'cpp',
'cs': 'csharp',
'rb': 'ruby',
'erb': 'ruby',
'php': 'php',
'swift': 'swift',
'm': 'objective-c',
'mm': 'objective-c',
'scala': 'scala',
'sc': 'scala',
'jl': 'julia',
'r': 'r',
'R': 'r',
'lua': 'lua',
'pl': 'perl',
'pm': 'perl',
'sql': 'sql',
'sh': 'bash',
'bash': 'bash',
'zsh': 'bash',
'fish': 'bash',
'yaml': 'yaml',
'yml': 'yaml',
'json': 'json',
'xml': 'xml',
'html': 'html',
'htm': 'html',
'css': 'css',
'scss': 'scss',
'sass': 'sass',
'less': 'less',
'md': 'markdown',
'markdown': 'markdown',
'txt': 'text',
'dockerfile': 'dockerfile',
'Dockerfile': 'dockerfile',
}
CONTENT_PATTERNS = {
'python': [
r'^import\s+\w+',
r'^from\s+\w+\s+import',
r'^def\s+\w+\s*\(',
r'^class\s+\w+\s*[:\(]',
r'^if\s+__name__\s*==\s*['"]__main__['"]',
],
'javascript': [
r'^const\s+\w+\s*=',
r'^let\s+\w+\s*=',
r'^var\s+\w+\s*=',
r'^function\s+\w+\s*\(',
r'=>\s*\{',
r'import\s+.*\s+from',
r'export\s+(default\s+)?',
],
'typescript': [
r'^interface\s+\w+\s*\{',
r'^type\s+\w+\s*=',
r':\s*(string|number|boolean|any|void|null|undefined)',
r'<[A-Z]\w*>',
],
'java': [
r'^package\s+[\w.]+;',
r'^import\s+[\w.]+;',
r'^public\s+(class|interface|enum)\s+\w+',
r'^private\s+(static\s+)?(final\s+)?\w+\s+\w+;',
],
'go': [
r'^package\s+\w+',
r'^import\s*\(',
r'func\s+\w+\s*\(',
r':=',
r'go\s+func',
],
'rust': [
r'^fn\s+\w+\s*\(',
r'^impl\s+\w+',
r'^struct\s+\w+',
r'^enum\s+\w+',
r'let\s+mut\s+\w+',
r'->\s*\w+',
],
'c': [
r'#include\s*<',
r'#include\s*"',
r'int\s+main\s*\(',
r'struct\s+\w+\s*\{',
r'void\s+\?\s*\w+\s*\(',
],
'cpp': [
r'#include\s*<',
r'#include\s*"',
r'class\s+\w+\s*(:\s*public)?',
r'std::\w+',
r'using\s+namespace\s+std',
],
'ruby': [
r'^require\s+['"]',
r'^class\s+\w+(\s*<\s*\w+)?',
r'^module\s+\w+',
r'def\s+\w+',
r'puts\s+',
r'puts!',
],
'php': [
r'\<\?php',
r'\$\w+\s*=',
r'function\s+\w+\s*\(',
r'class\s+\w+\s*\{',
],
}
def __init__(self):
self._tree_sitter_languages = {}
def detect_from_filename(self, filename: str) -> str | None:
"""Detect language from file extension."""
if '.' not in filename:
return None
ext = filename.rsplit('.', 1)[-1].lower()
return self.EXTENSION_MAP.get(ext)
def detect_from_content(self, content: str) -> str | None:
"""Detect language from file content patterns."""
first_lines = '\n'.join(content.splitlines()[:50])
scores: dict[str, int] = {}
for lang, patterns in self.CONTENT_PATTERNS.items():
score = 0
for pattern in patterns:
import re
matches = len(re.findall(pattern, first_lines, re.MULTILINE))
score += matches
if score > 0:
scores[lang] = score
if scores:
best_lang = max(scores, key=scores.get)
return best_lang
return None
def detect(self, filename: str, content: str = "") -> str:
"""Detect language from filename and optionally content."""
ext_lang = self.detect_from_filename(filename)
if ext_lang and ext_lang not in ['text', 'markdown', 'json', 'yaml', 'xml', 'html', 'css', 'dockerfile']:
if content:
content_lang = self.detect_from_content(content)
if content_lang and content_lang != ext_lang:
return content_lang
return ext_lang
if content:
content_lang = self.detect_from_content(content)
if content_lang:
return content_lang
return ext_lang or "text"
def get_supported_languages(self) -> list[str]:
"""Return list of supported languages."""
return sorted(set(self.EXTENSION_MAP.values()))
def is_language_supported(self, language: str) -> bool:
"""Check if a language is supported."""
return language in self.get_supported_languages()
def detect_language(filename: str, content: str = "") -> str:
    """Convenience wrapper: detect a file's language with a fresh detector.

    Builds a new ``LanguageDetector`` on every call and returns the result
    of its ``detect(filename, content)`` method.
    """
    return LanguageDetector().detect(filename, content)