# File: src/codeguard/analyzers/language_detector.py
"""Language detection module.

Maps source files to a ``Language`` by file extension or by the interpreter
named on a shebang line, and offers a helper that splits source text into
size-bounded chunks on line boundaries.
"""

from pathlib import Path
from typing import Optional

from codeguard.core.models import Language


class LanguageDetector:
    """Detect the programming language of a file from its name or content."""

    # File extension (lower-case, including the leading dot) -> language.
    EXTENSION_MAP = {
        ".py": Language.PYTHON,
        ".pyw": Language.PYTHON,
        ".js": Language.JAVASCRIPT,
        ".jsx": Language.JAVASCRIPT,
        ".ts": Language.TYPESCRIPT,
        ".tsx": Language.TYPESCRIPT,
        ".go": Language.GO,
        ".rs": Language.RUST,
    }

    # Interpreter family -> byte substrings looked for on the shebang line.
    SHEBANG_PATTERNS = {
        "python": [b"python", b"python3", b"python2"],
        "bash": [b"bash", b"sh"],
    }

    def detect(self, file_path: Path) -> Optional[Language]:
        """Return the language for *file_path* based on its extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path whose suffix is looked up in ``EXTENSION_MAP``.

        Returns:
            The mapped ``Language``, or ``None`` for an unknown extension.
        """
        ext = file_path.suffix.lower()
        return self.EXTENSION_MAP.get(ext)

    def detect_from_content(self, content: bytes) -> Optional[Language]:
        """Return the language named by the shebang line of *content*.

        Only the first line is inspected, and only when it starts with
        ``#!``; otherwise an ordinary first line that happens to mention
        an interpreter name (a comment, an import) would be misdetected.

        Args:
            content: Raw file bytes.

        Returns:
            ``Language.PYTHON`` for a Python shebang, otherwise ``None``.
            Shell shebangs are recognised by ``SHEBANG_PATTERNS`` but have
            no ``Language`` mapped here, so they also yield ``None``.
        """
        if not content:
            return None

        # maxsplit=1 avoids splitting the whole buffer to read one line.
        first_line = content.split(b"\n", 1)[0].lower()
        if not first_line.startswith(b"#!"):
            return None

        for lang, patterns in self.SHEBANG_PATTERNS.items():
            if lang != "python":
                # No Language member is mapped for this interpreter family.
                continue
            if any(pattern in first_line for pattern in patterns):
                return Language.PYTHON

        return None

    def is_supported(self, file_path: Path) -> bool:
        """Return ``True`` when the extension of *file_path* is recognised."""
        return self.detect(file_path) is not None

    def get_language_by_name(self, name: str) -> Optional[Language]:
        """Return the ``Language`` whose value equals *name*, lower-cased.

        Args:
            name: Language name; matched case-insensitively against each
                member's ``value``.

        Returns:
            The matching ``Language`` member, or ``None`` if none matches.
        """
        name = name.lower()
        for lang in Language:
            if lang.value == name:
                return lang
        return None

    def chunk_code(self, code: str, max_tokens: int = 8000) -> list[str]:
        """Split *code* into chunks on line boundaries.

        Chunk size is measured in characters (newline separators are not
        counted), not in real tokenizer tokens. Lines are never split, so a
        single line longer than *max_tokens* becomes a chunk on its own.
        Joining the returned chunks with ``"\\n"`` reproduces *code*.

        Args:
            code: Source text to split.
            max_tokens: Maximum number of characters per chunk.

        Returns:
            A non-empty list of chunks; ``[""]`` for empty input.
        """
        chunks: list[str] = []
        current: list[str] = []
        current_len = 0  # running character count, avoids re-joining per line

        for line in code.split("\n"):
            # Flush before adding a line that would overflow the chunk. The
            # ``current`` guard prevents emitting an empty chunk when one
            # line alone exceeds the limit.
            if current and current_len + len(line) > max_tokens:
                chunks.append("\n".join(current))
                current = []
                current_len = 0
            current.append(line)
            current_len += len(line)

        # ``str.split`` always yields at least one element, so ``current``
        # is non-empty here.
        chunks.append("\n".join(current))
        return chunks