Add source code files
This commit is contained in:
71
src/codeguard/analyzers/language_detector.py
Normal file
71
src/codeguard/analyzers/language_detector.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
"""Language detection module."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
from codeguard.core.models import Language
|
||||||
|
|
||||||
|
|
||||||
|
class LanguageDetector:
    """Detect the programming language of a file by extension or content."""

    # File extension (lowercased, with leading dot) -> Language enum member.
    EXTENSION_MAP = {
        ".py": Language.PYTHON,
        ".pyw": Language.PYTHON,
        ".js": Language.JAVASCRIPT,
        ".jsx": Language.JAVASCRIPT,
        ".ts": Language.TYPESCRIPT,
        ".tsx": Language.TYPESCRIPT,
        ".go": Language.GO,
        ".rs": Language.RUST,
    }

    # Interpreter-name fragments searched for in a "#!" shebang line.
    # NOTE(review): only "python" currently maps to a Language member;
    # "bash"/"sh" are recognized but have no enum mapping to return.
    SHEBANG_PATTERNS = {
        "python": [b"python", b"python3", b"python2"],
        "bash": [b"bash", b"sh"],
    }

    def __init__(self):
        # Stateless; detection relies only on the class-level tables above.
        pass

    def detect(self, file_path: Path) -> Optional[Language]:
        """Return the Language for *file_path* based on its extension.

        Returns None when the extension is not in EXTENSION_MAP.
        """
        ext = file_path.suffix.lower()
        return self.EXTENSION_MAP.get(ext)

    def detect_from_content(self, content: bytes) -> Optional[Language]:
        """Detect a language from the shebang line of raw file *content*.

        Returns None for empty content, content without a "#!" shebang,
        or a shebang naming an interpreter with no Language mapping.
        """
        if not content:
            return None
        # Only the first line can be a shebang; maxsplit=1 avoids
        # splitting the entire (possibly large) content.
        first_line = content.split(b"\n", 1)[0].strip().lower()
        # Without this guard, any first line merely *containing* a
        # pattern (e.g. b"sh" inside "shutil") would be misdetected.
        if not first_line.startswith(b"#!"):
            return None
        for lang, patterns in self.SHEBANG_PATTERNS.items():
            for pattern in patterns:
                if pattern in first_line:
                    if lang == "python":
                        return Language.PYTHON
                    # Recognized interpreter (e.g. bash) but no enum
                    # member to return; keep scanning remaining entries.
        return None

    def is_supported(self, file_path: Path) -> bool:
        """Return True when *file_path*'s extension maps to a known Language."""
        return self.detect(file_path) is not None

    def get_language_by_name(self, name: str) -> Optional[Language]:
        """Return the Language whose value equals *name* (case-insensitive).

        Returns None when no member matches.
        """
        name = name.lower()
        for lang in Language:
            if lang.value == name:
                return lang
        return None

    def chunk_code(self, code: str, max_tokens: int = 8000) -> list[str]:
        """Split *code* into newline-joined chunks of at most *max_tokens* characters.

        *max_tokens* is approximated as a character budget. A single line
        longer than the budget becomes its own (oversized) chunk rather
        than being split mid-line. Never emits empty chunks for oversized
        lines, and counts the "\\n" separators that join() re-inserts.
        """
        lines = code.split("\n")
        chunks: list[str] = []
        current: list[str] = []
        current_len = 0  # characters in `current`, including join separators

        for line in lines:
            # +1 accounts for the "\n" that "\n".join() inserts before
            # this line when `current` is non-empty.
            added = len(line) + (1 if current else 0)
            if current and current_len + added > max_tokens:
                # Flush before appending so the overflowing line starts
                # the next chunk (and no empty chunk is ever produced).
                chunks.append("\n".join(current))
                current = [line]
                current_len = len(line)
            else:
                current.append(line)
                current_len += added

        if current:
            chunks.append("\n".join(current))

        # Defensive fallback; `chunks` is non-empty for any input above.
        return chunks if chunks else [code]
|
||||||
Reference in New Issue
Block a user