Initial upload with CI/CD workflow
This commit is contained in:
129
codesnap/core/language_detection.py
Normal file
129
codesnap/core/language_detection.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""Language detection module for CodeSnap."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class LanguageInfo:
    """Static description of a single supported programming language."""

    # Human-readable display name, e.g. "Python" or "C++".
    name: str
    # File suffixes associated with the language, leading dot included.
    extensions: tuple[str, ...]
    # Interpreter names that identify the language in a "#!" line.
    shebangs: tuple[str, ...]
|
||||
|
||||
|
||||
# Registry of supported languages, keyed by a lowercase language identifier.
# Each row below is (key, display name, file extensions, shebang interpreters);
# row order is the dictionary's iteration order.
LANGUAGE_MAP: dict[str, LanguageInfo] = {
    key: LanguageInfo(name=display_name, extensions=extensions, shebangs=shebangs)
    for key, display_name, extensions, shebangs in (
        ("python", "Python", (".py", ".pyw", ".pyi"), ("python", "python3", "pypy")),
        ("javascript", "JavaScript", (".js", ".mjs", ".cjs"), ("node", "nodejs")),
        ("typescript", "TypeScript", (".ts", ".tsx"), ()),
        ("go", "Go", (".go",), ("go",)),
        ("rust", "Rust", (".rs",), ("rust", "rustc")),
        ("java", "Java", (".java",), ()),
        ("c", "C", (".c", ".h"), ()),
        ("cpp", "C++", (".cpp", ".cc", ".cxx", ".hpp", ".hxx"), ()),
        ("ruby", "Ruby", (".rb", ".erb"), ("ruby",)),
        ("php", "PHP", (".php",), ("php",)),
    )
}
|
||||
|
||||
|
||||
class LanguageDetector:
|
||||
"""Detects programming languages from file extensions and shebangs."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._ext_to_lang: dict[str, str] = {}
|
||||
for lang_key, lang_info in LANGUAGE_MAP.items():
|
||||
for ext in lang_info.extensions:
|
||||
self._ext_to_lang[ext] = lang_key
|
||||
|
||||
def detect_from_path(self, path: Path) -> Optional[str]:
|
||||
"""Detect language from file path extension."""
|
||||
suffix = path.suffix.lower()
|
||||
return self._ext_to_lang.get(suffix)
|
||||
|
||||
def detect_from_content(self, content: str) -> Optional[str]:
|
||||
"""Detect language from file content shebang."""
|
||||
first_line = content.split("\n")[0].strip()
|
||||
if not first_line.startswith("#!"):
|
||||
return None
|
||||
|
||||
shebang = first_line[2:].strip()
|
||||
shebang_cmd = re.split(r"\s+", shebang)[0]
|
||||
|
||||
for lang_key, lang_info in LANGUAGE_MAP.items():
|
||||
for shebang_pattern in lang_info.shebangs:
|
||||
if shebang_cmd.endswith(shebang_pattern):
|
||||
return lang_key
|
||||
|
||||
return None
|
||||
|
||||
def detect(self, path: Path, content: Optional[str] = None) -> Optional[str]:
|
||||
"""Detect language using both path and content analysis."""
|
||||
ext_result = self.detect_from_path(path)
|
||||
if ext_result:
|
||||
return ext_result
|
||||
|
||||
if content is not None:
|
||||
return self.detect_from_content(content)
|
||||
|
||||
return None
|
||||
|
||||
def get_language_info(self, lang_key: str) -> Optional[LanguageInfo]:
|
||||
"""Get language information by key."""
|
||||
return LANGUAGE_MAP.get(lang_key)
|
||||
|
||||
def get_supported_extensions(self) -> list[str]:
|
||||
"""Get all supported file extensions."""
|
||||
return list(self._ext_to_lang.keys())
|
||||
|
||||
def get_supported_languages(self) -> list[str]:
|
||||
"""Get all supported language names."""
|
||||
return [info.name for info in LANGUAGE_MAP.values()]
|
||||
|
||||
|
||||
def detect_language(path: Path, content: Optional[str] = None) -> Optional[str]:
    """Detect a file's language with a freshly constructed ``LanguageDetector``.

    Shorthand for ``LanguageDetector().detect(path, content)``: returns the
    detected language key, or ``None`` when nothing matches.
    """
    return LanguageDetector().detect(path, content)
|
||||
Reference in New Issue
Block a user