diff --git a/src/gdiffer/language_detector.py b/src/gdiffer/language_detector.py index c50200f..156c2f8 100644 --- a/src/gdiffer/language_detector.py +++ b/src/gdiffer/language_detector.py @@ -1,166 +1,157 @@ -"""Language detection for code files.""" - - - class LanguageDetector: - """Detects programming language from file extensions and content.""" - EXTENSION_MAP = { - 'py': 'python', - 'pyw': 'python', - 'pyx': 'python', - 'js': 'javascript', - 'mjs': 'javascript', - 'cjs': 'javascript', - 'jsx': 'javascript', - 'ts': 'typescript', - 'tsx': 'typescript', - 'mts': 'typescript', - 'cts': 'typescript', - 'java': 'java', - 'kt': 'kotlin', - 'kts': 'kotlin', - 'go': 'go', - 'rs': 'rust', - 'c': 'c', - 'h': 'c', - 'cpp': 'cpp', - 'cc': 'cpp', - 'cxx': 'cpp', - 'hpp': 'cpp', - 'hxx': 'cpp', - 'cs': 'csharp', - 'rb': 'ruby', - 'erb': 'ruby', - 'php': 'php', - 'swift': 'swift', - 'm': 'objective-c', - 'mm': 'objective-c', - 'scala': 'scala', - 'sc': 'scala', - 'jl': 'julia', - 'r': 'r', - 'R': 'r', - 'lua': 'lua', - 'pl': 'perl', - 'pm': 'perl', - 'sql': 'sql', - 'sh': 'bash', - 'bash': 'bash', - 'zsh': 'bash', - 'fish': 'bash', - 'yaml': 'yaml', - 'yml': 'yaml', - 'json': 'json', - 'xml': 'xml', - 'html': 'html', - 'htm': 'html', - 'css': 'css', - 'scss': 'scss', - 'sass': 'sass', - 'less': 'less', - 'md': 'markdown', - 'markdown': 'markdown', - 'txt': 'text', - 'dockerfile': 'dockerfile', - 'Dockerfile': 'dockerfile', + "py": "python", + "pyw": "python", + "pyx": "python", + "js": "javascript", + "mjs": "javascript", + "cjs": "javascript", + "jsx": "javascript", + "ts": "typescript", + "tsx": "typescript", + "mts": "typescript", + "cts": "typescript", + "java": "java", + "kt": "kotlin", + "kts": "kotlin", + "go": "go", + "rs": "rust", + "c": "c", + "h": "c", + "cpp": "cpp", + "cc": "cpp", + "cxx": "cpp", + "hpp": "cpp", + "hxx": "cpp", + "cs": "csharp", + "rb": "ruby", + "erb": "ruby", + "php": "php", + "swift": "swift", + "m": "objective-c", + "mm": "objective-c", + "scala": "scala", + "sc": "scala", + "jl": "julia", + "r": "r", + "R": "r", + "lua": "lua", + "pl": "perl", + "pm": "perl", + "sql": "sql", + "sh": "bash", + "bash": "bash", + "zsh": "bash", + "fish": "bash", + "yaml": "yaml", + "yml": "yaml", + "json": "json", + "xml": "xml", + "html": "html", + "htm": "html", + "css": "css", + "scss": "scss", + "sass": "sass", + "less": "less", + "md": "markdown", + "markdown": "markdown", + "txt": "text", + "dockerfile": "dockerfile", + "Dockerfile": "dockerfile", } CONTENT_PATTERNS = { - 'python': [ - r'^import\\s+\\w+', - r'^from\\s+\\w+\\s+import', - r'^def\\s+\\w+\\s*\\(', - r'^class\\s+\\w+\\s*[:\\(]', - r'^if\\s+__name__\\s*==\\s*[\\'\\"]__main__[\\'\\"]', + "python": [ + r"^import\\s+\\w+", + r"^from\\s+\\w+\\s+import", + r"^def\\s+\\w+\\s*\\(", + r"^class\\s+\\w+\\s*[:\\(]", + r"^if\\s+__name__\\s*==\\s*['\"]__main__['\"]", ], - 'javascript': [ - r'^const\\s+\\w+\\s*=', - r'^let\\s+\\w+\\s*=', - r'^var\\s+\\w+\\s*=', - r'^function\\s+\\w+\\s*\\(', - r'=>\\s*\\{', - r'import\\s+.*\\s+from', - r'export\\s+(default\\s+)?', + "javascript": [ + r"^const\\s+\\w+\\s*=", + r"^let\\s+\\w+\\s*=", + r"^var\\s+\\w+\\s*=", + r"^function\\s+\\w+\\s*\\(", + r"=>\\s*\\{", + r"import\\s+.*\\s+from", + r"export\\s+(default\\s+)?", ], - 'typescript': [ - r'^interface\\s+\\w+\\s*\\{', - r'^type\\s+\\w+\\s*=', - r':\\s*(string|number|boolean|any|void|null|undefined)', - r'<[A-Z]\\w*>', + "typescript": [ + r"^interface\\s+\\w+\\s*\\{", + r"^type\\s+\\w+\\s*=", + r":\\s*(string|number|boolean|any|void|null|undefined)", + r"<[A-Z]\\w*>", ], - 'java': [ - r'^package\\s+[\\w.]+;', - r'^import\\s+[\\w.]+;', - r'^public\\s+(class|interface|enum)\\s+\\w+', - r'^private\\s+(static\\s+)?(final\\s+)?\\w+\\s+\\w+;', + "java": [ + r"^package\\s+[\\w.]+;", + r"^import\\s+[\\w.]+;", + r"^public\\s+(class|interface|enum)\\s+\\w+", + r"^private\\s+(static\\s+)?(final\\s+)?\\w+\\s+\\w+;", ], - 'go': [ - r'^package\\s+\\w+', - r'^import\\s+\\(', - r'func\\s+\\w+\\s*\\(', - r':=', - r'go\\s+func', + "go": [ + r"^package\\s+\\w+", + r"^import\\s*\\(", + r"func\\s+\\w+\\s*\\(", + r":=", + r"go\\s+func", ], - 'rust': [ - r'^fn\\s+\\w+\\s*\\(', - r'^impl\\s+\\w+', - r'^struct\\s+\\w+', - r'^enum\\s+\\w+', - r'let\\s+mut\\s+\\w+', - r'->\\s*\\w+', + "rust": [ + r"^fn\\s+\\w+\\s*\\(", + r"^impl\\s+\\w+", + r"^struct\\s+\\w+", + r"^enum\\s+\\w+", + r"let\\s+mut\\s+\\w+", + r"->\\s*\\w+", ], - 'c': [ - r'#include\\s*<', - r'#include\\s*"', - r'int\\s+main\\s*\\(', - r'struct\\s+\\w+\\s*\\{', - r'void\\s+\\*?\\s*\\w+\\s*\\(', + "c": [ + r"#include\\s*<", + r"#include\\s*\"", + r"int\\s+main\\s*\\(", + r"struct\\s+\\w+\\s*\\{", + r"void\\s+\\*?\\s*\\w+\\s*\\(", ], - 'cpp': [ - r'#include\\s*<', - r'#include\\s*"', - r'class\\s+\\w+\\s*(:\\s*public)?', - r'std::\\w+', - r'using\\s+namespace\\s+std', + "cpp": [ + r"#include\\s*<", + r"#include\\s*\"", + r"class\\s+\\w+\\s*(:\\s*public)?", + r"std::\\w+", + r"using\\s+namespace\\s+std", ], - 'ruby': [ - r'^require\\s+[\\'\\"]', - r'^class\\s+\\w+(\\s*<\\s*\\w+)?', - r'^module\\s+\\w+', - r'def\\s+\\w+', - r'puts\\s+', - r'puts!', + "ruby": [ + r"^require\\s+['\"]", + r"^class\\s+\\w+(\\s*<\\s*\\w+)?", + r"^module\\s+\\w+", + r"def\\s+\\w+", + r"puts\\s+", + r"puts!", ], - 'php': [ - r'<\\?php', - r'\\$\\w+\\s*=', - r'function\\s+\\w+\\s*\\(', - r'class\\s+\\w+\\s*\\{', + "php": [ + r"<\?php", + r"\$\\w+\\s*=", + r"function\\s+\\w+\\s*\\(", + r"class\\s+\\w+\\s*\\{", ], } def __init__(self): self._tree_sitter_languages = {} - def detect_from_filename(self, filename: str) -> str | None: - """Detect language from file extension.""" - if '.' not in filename: + def detect_from_filename(self, filename): + if "." not in filename: return None - ext = filename.rsplit('.', 1)[-1].lower() + ext = filename.rsplit(".", 1)[-1].lower() return self.EXTENSION_MAP.get(ext) - def detect_from_content(self, content: str) -> str | None: - """Detect language from file content patterns.""" - first_lines = '\\n'.join(content.splitlines()[:50]) + def detect_from_content(self, content): + first_lines = "\n".join(content.splitlines()[:50]) - scores: dict[str, int] = {} + scores = {} for lang, patterns in self.CONTENT_PATTERNS.items(): score = 0 for pattern in patterns: - import re matches = len(re.findall(pattern, first_lines, re.MULTILINE)) score += matches @@ -173,12 +164,11 @@ class LanguageDetector: return None - def detect(self, filename: str, content: str = "") -> str: - """Detect language from filename and optionally content.""" + def detect(self, filename, content=""): ext_lang = self.detect_from_filename(filename) if ext_lang and ext_lang not in [ - 'text', 'markdown', 'json', 'yaml', 'xml', 'html', 'css', 'dockerfile' + "text", "markdown", "json", "yaml", "xml", "html", "css", "dockerfile" ]: if content: content_lang = self.detect_from_content(content) @@ -193,16 +183,13 @@ class LanguageDetector: return ext_lang or "text" - def get_supported_languages(self) -> list[str]: - """Return list of supported languages.""" + def get_supported_languages(self): return sorted(set(self.EXTENSION_MAP.values())) - def is_language_supported(self, language: str) -> bool: - """Check if a language is supported.""" + def is_language_supported(self, language): return language in self.get_supported_languages() -def detect_language(filename: str, content: str = "") -> str: - """Detect programming language from filename and content.""" +def detect_language(filename, content=""): detector = LanguageDetector() return detector.detect(filename, content)