196 lines
5.3 KiB
Python
196 lines
5.3 KiB
Python
class LanguageDetector:
    """Guess the language of a file from its name and/or its content.

    Two independent signals are available:

    * ``detect_from_filename`` looks the file extension up in
      ``EXTENSION_MAP``.
    * ``detect_from_content`` counts regex hits from ``CONTENT_PATTERNS`` in
      the first 50 lines of the text and picks the best-scoring language.

    ``detect`` combines both and always returns a language name, falling back
    to ``"text"``.
    """

    # Lower-cased file extension (without the dot) -> language name.
    # ``detect_from_filename`` lower-cases the extension before the lookup, so
    # the mixed-case keys ("R", "Dockerfile") are never hit through that path;
    # they are kept so the public mapping stays unchanged.
    EXTENSION_MAP = {
        "py": "python",
        "pyw": "python",
        "pyx": "python",
        "js": "javascript",
        "mjs": "javascript",
        "cjs": "javascript",
        "jsx": "javascript",
        "ts": "typescript",
        "tsx": "typescript",
        "mts": "typescript",
        "cts": "typescript",
        "java": "java",
        "kt": "kotlin",
        "kts": "kotlin",
        "go": "go",
        "rs": "rust",
        "c": "c",
        "h": "c",
        "cpp": "cpp",
        "cc": "cpp",
        "cxx": "cpp",
        "hpp": "cpp",
        "hxx": "cpp",
        "cs": "csharp",
        "rb": "ruby",
        "erb": "ruby",
        "php": "php",
        "swift": "swift",
        "m": "objective-c",
        "mm": "objective-c",
        "scala": "scala",
        "sc": "scala",
        "jl": "julia",
        "r": "r",
        "R": "r",
        "lua": "lua",
        "pl": "perl",
        "pm": "perl",
        "sql": "sql",
        "sh": "bash",
        "bash": "bash",
        "zsh": "bash",
        "fish": "bash",
        "yaml": "yaml",
        "yml": "yaml",
        "json": "json",
        "xml": "xml",
        "html": "html",
        "htm": "html",
        "css": "css",
        "scss": "scss",
        "sass": "sass",
        "less": "less",
        "md": "markdown",
        "markdown": "markdown",
        "txt": "text",
        "dockerfile": "dockerfile",
        "Dockerfile": "dockerfile",
    }

    # Language name -> regexes whose matches count as evidence for it.
    # These are raw strings, so regex escapes take a SINGLE backslash
    # (r"\s", r"\w", r"\("). A doubled backslash would match a literal
    # backslash character and the pattern would never hit real code.
    # Patterns starting with "^" are applied with re.MULTILINE, so they only
    # match at the very start of a line (indented code does not count).
    CONTENT_PATTERNS = {
        "python": [
            r"^import\s+\w+",
            r"^from\s+\w+\s+import",
            r"^def\s+\w+\s*\(",
            r"^class\s+\w+\s*[:\(]",
            r"^if\s+__name__\s*==\s*['\"]__main__['\"]",
        ],
        "javascript": [
            r"^const\s+\w+\s*=",
            r"^let\s+\w+\s*=",
            r"^var\s+\w+\s*=",
            r"^function\s+\w+\s*\(",
            r"=>\s*\{",
            r"import\s+.*\s+from",
            r"export\s+(default\s+)?",
        ],
        "typescript": [
            r"^interface\s+\w+\s*\{",
            r"^type\s+\w+\s*=",
            r":\s*(string|number|boolean|any|void|null|undefined)",
            r"<[A-Z]\w*>",
        ],
        "java": [
            r"^package\s+[\w.]+;",
            r"^import\s+[\w.]+;",
            r"^public\s+(class|interface|enum)\s+\w+",
            r"^private\s+(static\s+)?(final\s+)?\w+\s+\w+;",
        ],
        "go": [
            r"^package\s+\w+",
            r"^import\s*\(",
            r"func\s+\w+\s*\(",
            r":=",
            r"go\s+func",
        ],
        "rust": [
            r"^fn\s+\w+\s*\(",
            r"^impl\s+\w+",
            r"^struct\s+\w+",
            r"^enum\s+\w+",
            r"let\s+mut\s+\w+",
            r"->\s*\w+",
        ],
        "c": [
            r"#include\s*<",
            r"#include\s*\"",
            r"int\s+main\s*\(",
            r"struct\s+\w+\s*\{",
            r"void\s+\*?\s*\w+\s*\(",
        ],
        "cpp": [
            r"#include\s*<",
            r"#include\s*\"",
            r"class\s+\w+\s*(:\s*public)?",
            r"std::\w+",
            r"using\s+namespace\s+std",
        ],
        "ruby": [
            r"^require\s+['\"]",
            r"^class\s+\w+(\s*<\s*\w+)?",
            r"^module\s+\w+",
            r"def\s+\w+",
            r"puts\s+",
            r"puts!",
        ],
        "php": [
            r"<\?php",
            r"\$\w+\s*=",
            r"function\s+\w+\s*\(",
            r"class\s+\w+\s*\{",
        ],
    }

    def __init__(self):
        """Create a detector with an empty ``_tree_sitter_languages`` dict."""
        # Not read or written by any method of this class.
        self._tree_sitter_languages = {}

    def detect_from_filename(self, filename: str) -> "str | None":
        """Return the language implied by *filename*, or ``None`` if unknown.

        The text after the last ``.`` is lower-cased and looked up in
        ``EXTENSION_MAP``. A file whose base name is ``Dockerfile`` (any
        letter case, no extension) is reported as ``"dockerfile"``.

        Args:
            filename: A bare file name or a path using ``/`` or ``\\``.

        Returns:
            The language name, or ``None`` when the name gives no hint.
        """
        # Work on the last path component so a dot inside a directory name
        # ("my.project/Makefile") is not mistaken for an extension separator.
        basename = filename.replace("\\", "/").rsplit("/", 1)[-1]

        if "." not in basename:
            # Extension-less Dockerfiles are the one well-known name the
            # mapping lists that can never be reached via an extension.
            if basename.lower() == "dockerfile":
                return "dockerfile"
            return None

        extension = basename.rsplit(".", 1)[-1].lower()
        return self.EXTENSION_MAP.get(extension)

    def _score_content(self, content: str) -> dict:
        """Return ``{language: hit_count}`` for the first 50 lines of *content*.

        Languages with no matching pattern are omitted. Keys appear in
        ``CONTENT_PATTERNS`` order.
        """
        # Local import: no module-level ``import re`` is visible alongside
        # this class, and the import is a cheap cache lookup after first use.
        import re

        head = "\n".join(content.splitlines()[:50])

        scores = {}
        for lang, patterns in self.CONTENT_PATTERNS.items():
            hits = sum(
                len(re.findall(pattern, head, re.MULTILINE))
                for pattern in patterns
            )
            if hits:
                scores[lang] = hits
        return scores

    def detect_from_content(self, content: str) -> "str | None":
        """Return the best-scoring language for *content*, or ``None``.

        Only the first 50 lines are inspected. Each regex match counts as one
        point; the language with the most points wins, and ties go to the
        language listed first in ``CONTENT_PATTERNS``.

        Args:
            content: The text to classify.

        Returns:
            The language name, or ``None`` when no pattern matched.
        """
        scores = self._score_content(content)
        if not scores:
            return None
        return max(scores, key=scores.get)

    def detect(self, filename: str, content: str = "") -> str:
        """Return the language for a file, using its name and optional content.

        Precedence:

        1. If *content* matches any pattern, the best-scoring language wins.
           When the language implied by the extension ties for the best
           score, the extension's language is kept.
        2. Otherwise the language implied by *filename*.
        3. Otherwise ``"text"``.

        Args:
            filename: File name or path.
            content: Optional file text; empty means "ignore content".

        Returns:
            A language name; never ``None``.
        """
        ext_lang = self.detect_from_filename(filename)

        if content:
            scores = self._score_content(content)
            if scores:
                best_lang = max(scores, key=scores.get)
                # Do not let dictionary order override an extension that the
                # content supports just as strongly.
                if scores.get(ext_lang) == scores[best_lang]:
                    return ext_lang
                return best_lang

        return ext_lang or "text"

    def get_supported_languages(self):
        """Return the sorted, de-duplicated language names in ``EXTENSION_MAP``."""
        return sorted(set(self.EXTENSION_MAP.values()))

    def is_language_supported(self, language) -> bool:
        """Return ``True`` if *language* is one of the ``EXTENSION_MAP`` values."""
        return language in self.EXTENSION_MAP.values()
|
|
|
|
|
|
def detect_language(filename, content=""):
    """Detect a file's language using a freshly created ``LanguageDetector``.

    Args:
        filename: File name or path passed through to ``LanguageDetector.detect``.
        content: Optional file text passed through unchanged.

    Returns:
        Whatever ``LanguageDetector.detect(filename, content)`` returns.
    """
    return LanguageDetector().detect(filename, content)
|