class LanguageDetector:
    """Guess the language of a file from its name and/or its content.

    Two independent signals are available:

    * ``detect_from_filename`` maps the file extension (or a well-known
      bare file name such as ``Dockerfile``) to a language.
    * ``detect_from_content`` scores the first lines of the text against
      per-language regular expressions and returns the best-scoring language.

    ``detect`` combines both.
    """

    # File extension (without the dot) -> canonical language name.
    # Lookups lower-case the extension first, so the mixed-case keys ("R",
    # "Dockerfile") are never hit by this class; they are kept so that code
    # reading the map directly keeps seeing the same keys.
    EXTENSION_MAP = {
        "py": "python",
        "pyw": "python",
        "pyx": "python",
        "js": "javascript",
        "mjs": "javascript",
        "cjs": "javascript",
        "jsx": "javascript",
        "ts": "typescript",
        "tsx": "typescript",
        "mts": "typescript",
        "cts": "typescript",
        "java": "java",
        "kt": "kotlin",
        "kts": "kotlin",
        "go": "go",
        "rs": "rust",
        "c": "c",
        "h": "c",
        "cpp": "cpp",
        "cc": "cpp",
        "cxx": "cpp",
        "hpp": "cpp",
        "hxx": "cpp",
        "cs": "csharp",
        "rb": "ruby",
        "erb": "ruby",
        "php": "php",
        "swift": "swift",
        "m": "objective-c",
        "mm": "objective-c",
        "scala": "scala",
        "sc": "scala",
        "jl": "julia",
        "r": "r",
        "R": "r",
        "lua": "lua",
        "pl": "perl",
        "pm": "perl",
        "sql": "sql",
        "sh": "bash",
        "bash": "bash",
        "zsh": "bash",
        "fish": "bash",
        "yaml": "yaml",
        "yml": "yaml",
        "json": "json",
        "xml": "xml",
        "html": "html",
        "htm": "html",
        "css": "css",
        "scss": "scss",
        "sass": "sass",
        "less": "less",
        "md": "markdown",
        "markdown": "markdown",
        "txt": "text",
        "dockerfile": "dockerfile",
        "Dockerfile": "dockerfile",
    }

    # Lower-cased bare file name (no extension) -> language.
    FILENAME_MAP = {
        "dockerfile": "dockerfile",
    }

    # Language -> regexes whose match counts are summed into a score.
    # Patterns are raw strings with SINGLE backslashes: r"\s" is the regex
    # whitespace class, whereas r"\\s" would demand a literal backslash.
    # "^" anchors rely on re.MULTILINE being passed by detect_from_content.
    CONTENT_PATTERNS = {
        "python": [
            r"^import\s+\w+",
            r"^from\s+\w+\s+import",
            r"^def\s+\w+\s*\(",
            r"^class\s+\w+\s*[:\(]",
            r"^if\s+__name__\s*==\s*['\"]__main__['\"]",
        ],
        "javascript": [
            r"^const\s+\w+\s*=",
            r"^let\s+\w+\s*=",
            r"^var\s+\w+\s*=",
            r"^function\s+\w+\s*\(",
            r"=>\s*\{",
            r"import\s+.*\s+from",
            r"export\s+(default\s+)?",
        ],
        "typescript": [
            r"^interface\s+\w+\s*\{",
            r"^type\s+\w+\s*=",
            r":\s*(string|number|boolean|any|void|null|undefined)",
            r"<[A-Z]\w*>",
        ],
        "java": [
            r"^package\s+[\w.]+;",
            r"^import\s+[\w.]+;",
            r"^public\s+(class|interface|enum)\s+\w+",
            r"^private\s+(static\s+)?(final\s+)?\w+\s+\w+;",
        ],
        "go": [
            r"^package\s+\w+",
            r"^import\s*\(",
            r"func\s+\w+\s*\(",
            r":=",
            r"go\s+func",
        ],
        "rust": [
            r"^fn\s+\w+\s*\(",
            r"^impl\s+\w+",
            r"^struct\s+\w+",
            r"^enum\s+\w+",
            r"let\s+mut\s+\w+",
            r"->\s*\w+",
        ],
        "c": [
            r"#include\s*<",
            r"#include\s*\"",
            r"int\s+main\s*\(",
            r"struct\s+\w+\s*\{",
            r"void\s+\*?\s*\w+\s*\(",
        ],
        "cpp": [
            r"#include\s*<",
            r"#include\s*\"",
            r"class\s+\w+\s*(:\s*public)?",
            r"std::\w+",
            r"using\s+namespace\s+std",
        ],
        "ruby": [
            r"^require\s+['\"]",
            r"^class\s+\w+(\s*<\s*\w+)?",
            r"^module\s+\w+",
            r"def\s+\w+",
            r"puts\s+",
            r"puts!",
        ],
        "php": [
            r"<\?php",
            r"\$\w+\s*=",
            r"function\s+\w+\s*\(",
            r"class\s+\w+\s*\{",
        ],
    }

    # Formats for which the extension is always trusted over content
    # heuristics (none of them has an entry in CONTENT_PATTERNS).
    _EXTENSION_AUTHORITATIVE = frozenset(
        {
            "text",
            "markdown",
            "json",
            "yaml",
            "xml",
            "html",
            "css",
            "dockerfile",
        }
    )

    # Only the first this-many lines of content are scored.
    _MAX_CONTENT_LINES = 50

    def __init__(self):
        """Create a detector with an empty tree-sitter language cache."""
        # Not read anywhere in this class; kept for external users.
        self._tree_sitter_languages = {}

    def detect_from_filename(self, filename: str) -> "str | None":
        """Return the language implied by *filename*, or ``None`` if unknown.

        The last path component is examined. A well-known bare name (for
        example ``Dockerfile``) is matched case-insensitively; otherwise the
        text after the final dot is lower-cased and looked up in
        ``EXTENSION_MAP``.
        """
        # Look only at the final path component so that dots in directory
        # names cannot be mistaken for an extension separator.
        base_name = filename.replace("\\", "/").rsplit("/", 1)[-1]

        by_name = self.FILENAME_MAP.get(base_name.lower())
        if by_name is not None:
            return by_name

        if "." not in base_name:
            return None
        extension = base_name.rsplit(".", 1)[-1].lower()
        return self.EXTENSION_MAP.get(extension)

    def detect_from_content(self, content: str) -> "str | None":
        """Return the best-scoring language for *content*, or ``None``.

        Only the first ``_MAX_CONTENT_LINES`` lines are inspected. Each
        language's score is the total number of matches of its patterns;
        languages with no match are ignored. Ties go to the language listed
        first in ``CONTENT_PATTERNS``.
        """
        # Local import so the method does not depend on a module-level one.
        import re

        if not content:
            return None

        head = "\n".join(content.splitlines()[: self._MAX_CONTENT_LINES])

        scores = {}
        for lang, patterns in self.CONTENT_PATTERNS.items():
            score = sum(
                len(re.findall(pattern, head, re.MULTILINE))
                for pattern in patterns
            )
            if score > 0:
                scores[lang] = score

        if not scores:
            return None
        # max() returns the first maximal key in insertion order.
        return max(scores, key=scores.get)

    def detect(self, filename: str, content: str = "") -> str:
        """Return the detected language for a file, defaulting to ``"text"``.

        Resolution order:

        1. If the extension maps to a format listed in
           ``_EXTENSION_AUTHORITATIVE``, or to any language that has no
           content patterns, the extension wins outright. The heuristics
           cannot recognise such a language, so any "disagreement" from them
           would be a guaranteed misclassification.
        2. Otherwise, if *content* is given and the heuristics produce a
           result, that result is returned (it may override the extension,
           e.g. a ``.h`` file that is clearly C++).
        3. Otherwise the extension language, or ``"text"`` when unknown.

        NOTE(review): step 2 lets a weak heuristic override a known code
        extension (a ``.ts`` file full of plain JS constructs can be reported
        as ``javascript``). That is the original design; revisit if callers
        need the extension to be authoritative.
        """
        ext_lang = self.detect_from_filename(filename)

        if ext_lang and (
            ext_lang in self._EXTENSION_AUTHORITATIVE
            or ext_lang not in self.CONTENT_PATTERNS
        ):
            return ext_lang

        content_lang = self.detect_from_content(content) if content else None
        return content_lang or ext_lang or "text"

    def get_supported_languages(self) -> "list[str]":
        """Return the sorted, de-duplicated language names in ``EXTENSION_MAP``."""
        return sorted(set(self.EXTENSION_MAP.values()))

    def is_language_supported(self, language: str) -> bool:
        """Return ``True`` if *language* is a value of ``EXTENSION_MAP``."""
        return language in self.EXTENSION_MAP.values()


def detect_language(filename: str, content: str = "") -> str:
    """Detect the language of *filename* / *content* with a fresh detector.

    Convenience wrapper around ``LanguageDetector().detect``.
    """
    return LanguageDetector().detect(filename, content)