Files
git-diff-explainer-cli/src/gdiffer/language_detector.py
7000pctAUTO c055777858
Some checks failed
CI / test (3.11) (push) Has been cancelled
CI / test (3.12) (push) Has been cancelled
CI / lint (push) Has been cancelled
CI / build (push) Has been cancelled
CI / test (3.10) (push) Has been cancelled
fix: resolve CI issues - push complete implementation with tests
2026-02-02 15:30:35 +00:00

196 lines
5.3 KiB
Python

class LanguageDetector:
    """Guess the programming language of a file from its name and content.

    Detection combines two heuristics:

    * a lookup of the file extension (or a well-known extension-less file
      name) in ``EXTENSION_MAP`` / ``FILENAME_MAP``;
    * regex scoring of the first 50 lines of content against
      ``CONTENT_PATTERNS``.
    """

    # Extension (without the dot) -> language name. Lookups lowercase the
    # extension first, so the mixed-case keys ("R", "Dockerfile") are never
    # hit directly; they are kept so the mapping itself stays unchanged for
    # any external reader of this attribute.
    EXTENSION_MAP = {
        "py": "python",
        "pyw": "python",
        "pyx": "python",
        "js": "javascript",
        "mjs": "javascript",
        "cjs": "javascript",
        "jsx": "javascript",
        "ts": "typescript",
        "tsx": "typescript",
        "mts": "typescript",
        "cts": "typescript",
        "java": "java",
        "kt": "kotlin",
        "kts": "kotlin",
        "go": "go",
        "rs": "rust",
        "c": "c",
        "h": "c",
        "cpp": "cpp",
        "cc": "cpp",
        "cxx": "cpp",
        "hpp": "cpp",
        "hxx": "cpp",
        "cs": "csharp",
        "rb": "ruby",
        "erb": "ruby",
        "php": "php",
        "swift": "swift",
        "m": "objective-c",
        "mm": "objective-c",
        "scala": "scala",
        "sc": "scala",
        "jl": "julia",
        "r": "r",
        "R": "r",
        "lua": "lua",
        "pl": "perl",
        "pm": "perl",
        "sql": "sql",
        "sh": "bash",
        "bash": "bash",
        "zsh": "bash",
        "fish": "bash",
        "yaml": "yaml",
        "yml": "yaml",
        "json": "json",
        "xml": "xml",
        "html": "html",
        "htm": "html",
        "css": "css",
        "scss": "scss",
        "sass": "sass",
        "less": "less",
        "md": "markdown",
        "markdown": "markdown",
        "txt": "text",
        "dockerfile": "dockerfile",
        "Dockerfile": "dockerfile",
    }

    # Lowercased whole file name -> language, for files that are identified
    # by name rather than by extension (a plain "Dockerfile" has no ".").
    FILENAME_MAP = {
        "dockerfile": "dockerfile",
    }

    # Language -> regexes whose match counts are summed into a score.
    # All patterns are searched with re.MULTILINE, so "^" anchors at the
    # start of every line. Backslashes are single inside raw strings:
    # r"\s" is the whitespace class, whereas r"\\s" would only match a
    # literal backslash followed by "s".
    CONTENT_PATTERNS = {
        "python": [
            r"^import\s+\w+",
            r"^from\s+\w+\s+import",
            r"^def\s+\w+\s*\(",
            r"^class\s+\w+\s*[:\(]",
            r"^if\s+__name__\s*==\s*['\"]__main__['\"]",
        ],
        "javascript": [
            r"^const\s+\w+\s*=",
            r"^let\s+\w+\s*=",
            r"^var\s+\w+\s*=",
            r"^function\s+\w+\s*\(",
            r"=>\s*\{",
            r"import\s+.*\s+from",
            r"export\s+(default\s+)?",
        ],
        "typescript": [
            r"^interface\s+\w+\s*\{",
            r"^type\s+\w+\s*=",
            r":\s*(string|number|boolean|any|void|null|undefined)",
            r"<[A-Z]\w*>",
        ],
        "java": [
            r"^package\s+[\w.]+;",
            r"^import\s+[\w.]+;",
            r"^public\s+(class|interface|enum)\s+\w+",
            r"^private\s+(static\s+)?(final\s+)?\w+\s+\w+;",
        ],
        "go": [
            r"^package\s+\w+",
            r"^import\s*\(",
            r"func\s+\w+\s*\(",
            r":=",
            r"go\s+func",
        ],
        "rust": [
            r"^fn\s+\w+\s*\(",
            r"^impl\s+\w+",
            r"^struct\s+\w+",
            r"^enum\s+\w+",
            r"let\s+mut\s+\w+",
            r"->\s*\w+",
        ],
        "c": [
            r"#include\s*<",
            r"#include\s*\"",
            r"int\s+main\s*\(",
            r"struct\s+\w+\s*\{",
            r"void\s+\*?\s*\w+\s*\(",
        ],
        "cpp": [
            r"#include\s*<",
            r"#include\s*\"",
            r"class\s+\w+\s*(:\s*public)?",
            r"std::\w+",
            r"using\s+namespace\s+std",
        ],
        "ruby": [
            r"^require\s+['\"]",
            r"^class\s+\w+(\s*<\s*\w+)?",
            r"^module\s+\w+",
            r"def\s+\w+",
            r"puts\s+",
            r"puts!",
        ],
        "php": [
            r"<\?php",
            r"\$\w+\s*=",
            r"function\s+\w+\s*\(",
            r"class\s+\w+\s*\{",
        ],
    }

    # Extension results that say little about the actual content; for these
    # any content match is preferred over the extension.
    _CONTENT_FIRST_LANGUAGES = frozenset(
        {"text", "markdown", "json", "yaml", "xml", "html", "css", "dockerfile"}
    )

    # Only this many leading lines of content are scored.
    _MAX_CONTENT_LINES = 50

    def __init__(self):
        self._tree_sitter_languages = {}

    def detect_from_filename(self, filename):
        """Return the language implied by ``filename``, or ``None``.

        Directory components are ignored. The bare file name is first
        looked up in ``FILENAME_MAP`` (case-insensitively), then its last
        extension is lowercased and looked up in ``EXTENSION_MAP``.

        Args:
            filename: File name or path; ``/`` and ``\\`` both count as
                separators.

        Returns:
            The language name, or ``None`` when the name is empty, has no
            extension, or the extension is not known.
        """
        if not filename:
            return None
        basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
        by_name = self.FILENAME_MAP.get(basename.lower())
        if by_name is not None:
            return by_name
        if "." not in basename:
            return None
        extension = basename.rsplit(".", 1)[-1].lower()
        return self.EXTENSION_MAP.get(extension)

    def _score_content(self, content):
        """Return ``{language: match count}`` for the head of ``content``.

        Only languages with at least one match are included; empty or
        ``None`` content yields an empty dict.
        """
        if not content:
            return {}
        head = "\n".join(content.splitlines()[: self._MAX_CONTENT_LINES])
        scores = {}
        for language, patterns in self.CONTENT_PATTERNS.items():
            total = sum(
                len(re.findall(pattern, head, re.MULTILINE))
                for pattern in patterns
            )
            if total > 0:
                scores[language] = total
        return scores

    def detect_from_content(self, content):
        """Return the best-scoring language for ``content``, or ``None``.

        Ties go to the language listed first in ``CONTENT_PATTERNS``
        (``max`` keeps the first maximum it encounters).

        Args:
            content: Source text; only its first 50 lines are examined.

        Returns:
            The language with the most pattern matches, or ``None`` when
            nothing matches or ``content`` is empty.
        """
        scores = self._score_content(content)
        if not scores:
            return None
        return max(scores, key=scores.get)

    def detect(self, filename, content=""):
        """Detect the language from the file name and, optionally, content.

        For extensions that map to a programming language, the extension
        wins unless some other language scores strictly higher on the
        content than the extension's own language does. A tie therefore
        never overrides the extension. For generic results (text, markdown,
        json, ...) or unknown names, any content match is preferred.

        NOTE(review): several patterns are unanchored (e.g. ``:=`` or
        ``->``), so on short or indented snippets another language can
        still outscore the extension's language. Consider requiring a
        larger margin if that proves noisy in practice.

        Args:
            filename: File name or path.
            content: Optional file content used for scoring.

        Returns:
            A language name; ``"text"`` when nothing can be determined.
        """
        ext_lang = self.detect_from_filename(filename)
        scores = self._score_content(content)
        content_lang = max(scores, key=scores.get) if scores else None

        if ext_lang and ext_lang not in self._CONTENT_FIRST_LANGUAGES:
            if content_lang and scores[content_lang] > scores.get(ext_lang, 0):
                return content_lang
            return ext_lang

        return content_lang or ext_lang or "text"

    def get_supported_languages(self):
        """Return the sorted, de-duplicated language names in ``EXTENSION_MAP``."""
        return sorted(set(self.EXTENSION_MAP.values()))

    def is_language_supported(self, language):
        """Return ``True`` if ``language`` is one of the ``EXTENSION_MAP`` values."""
        return language in set(self.EXTENSION_MAP.values())
def detect_language(filename, content=""):
    """Convenience wrapper: detect a file's language with a fresh detector.

    Args:
        filename: File name or path to classify.
        content: Optional file content passed through to the detector.

    Returns:
        Whatever ``LanguageDetector.detect`` returns for these arguments.
    """
    return LanguageDetector().detect(filename, content)