Initial upload with CI/CD workflow

2026-01-30 22:12:49 +00:00
parent 70cc6415f7
commit 3a3a91f709
1 changed files with 251 additions and 0 deletions
--- a/codesnap/core/parser.py
+++ b/codesnap/core/parser.py
@@ -0,0 +1,251 @@
+"""Code parsing module using tree-sitter."""
+
+import os
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+from tree_sitter import Language, Parser
+
+from .language_detection import LanguageDetector
+
+
+@dataclass
+class ParsedFile:
+    """Record describing the outcome of parsing one source file.
+
+    ``path``, ``language`` and ``content`` are required; ``ast`` and
+    ``error`` are optional and both default to ``None``.
+    """
+
+    # Location of the file this record describes.
+    path: Path
+    # Name of the language the content was parsed as.
+    language: str
+    # Text of the file.
+    content: str
+    # Parse result, when one was produced.
+    ast: Optional[object] = field(default=None)
+    # Error message, when reading or parsing failed.
+    error: Optional[str] = field(default=None)
+
+
+@dataclass
+class ImportStatement:
+    """A single import/require statement found in a source file.
+
+    Only ``module`` is required; every other field has a default.
+    """
+
+    # Module, package or path named by the statement.
+    module: str
+    # Local alias the module is bound to, if any.
+    alias: Optional[str] = None
+    # Line of the statement in the file; the extractors in this module use
+    # 1-based numbers, so the default 0 means "not set".
+    line_number: int = 0
+    # Whether the statement has a ``from`` form (e.g. Python ``from x import y``).
+    # The annotation was previously the value ``False`` instead of the type ``bool``.
+    is_from: bool = False
+    # Names imported from the module; a fresh list per instance.
+    names: list[str] = field(default_factory=list)
+
+
+# Module-level cache of tree-sitter parsers keyed by language name; entries
+# are added lazily by get_parser() the first time a language is requested.
+LANGUAGE_PARSERS: dict[str, Parser] = {}
+
+
+def get_parser(language: str) -> Parser:
+    """Return the cached tree-sitter parser for ``language``, creating it on first use.
+
+    Args:
+        language: Language name; used as the cache key in ``LANGUAGE_PARSERS``
+            and passed to ``Language.for_language``.
+
+    Returns:
+        The parser stored in ``LANGUAGE_PARSERS`` for that language.
+
+    Raises:
+        ValueError: If loading the grammar or configuring the parser fails.
+            The original exception is chained as ``__cause__``.
+    """
+    cached = LANGUAGE_PARSERS.get(language)
+    if cached is not None:
+        return cached
+
+    try:
+        # NOTE(review): `Language.for_language` and `Parser.set_language` are
+        # not available in every py-tree-sitter release -- confirm against the
+        # pinned version; if they are missing, every call ends in ValueError.
+        lang = Language.for_language(language)
+        parser = Parser()
+        parser.set_language(lang)
+    except Exception as e:
+        # Chain the cause so the underlying traceback is not lost.
+        raise ValueError(f"Failed to load parser for {language}: {e}") from e
+
+    # Only cache once the parser is fully configured.
+    LANGUAGE_PARSERS[language] = parser
+    return parser
+
+
+class CodeParser:
+    """Parse source files with tree-sitter and extract their import statements.
+
+    Parsing is delegated to ``get_parser``. Import extraction is a line-by-line
+    text scan of the file content; the AST is only consulted to decide whether
+    the file was parsed at all.
+    """
+
+    # First quoted string on a JS/TS ``import`` line. The ``import ... from``
+    # prefix is optional, so bare imports such as ``import "x"`` match too.
+    _JS_IMPORT_RE = re.compile(
+        r"(?:import\s+(?:\{[^}]*\}|\*|[\w$]+)(?:\s+as\s+[\w$]+)?\s+from\s+)?['\"]([^'\"]+)['\"]"
+    )
+    # ``require("module")`` call anywhere on a line.
+    _JS_REQUIRE_RE = re.compile(r"\brequire\(\s*['\"]([^'\"]+)['\"]")
+    # Opening line of a Go import block: ``import (``.
+    _GO_BLOCK_OPEN_RE = re.compile(r"import\s*\(")
+    # One entry inside a Go import block: optional alias, then a quoted path.
+    _GO_IMPORT_SPEC_RE = re.compile(r'(?:([\w.]+)\s+)?"([^"]+)"')
+    # Single-line Go import: ``import "path"`` or ``import alias "path"``.
+    _GO_SINGLE_IMPORT_RE = re.compile(r'import\s+(?:([\w.]+)\s+)?"([^"]+)"')
+
+    def __init__(self) -> None:
+        self.language_detector = LanguageDetector()
+
+    def parse_file(self, path: Path, content: Optional[str] = None) -> ParsedFile:
+        """Parse a single file.
+
+        Args:
+            path: Location of the file; read as UTF-8 when ``content`` is None.
+            content: Text to parse instead of reading ``path``.
+
+        Returns:
+            A ``ParsedFile``. Read failures give language ``"unknown"``, empty
+            content and ``error`` set. An undetected language gives
+            ``"unknown"`` with no AST and no error. Parser failures keep the
+            detected language and content and set ``error``.
+        """
+        if content is None:
+            try:
+                content = path.read_text(encoding="utf-8")
+            except (OSError, UnicodeDecodeError) as e:
+                return ParsedFile(
+                    path=path,
+                    language="unknown",
+                    content="",
+                    error=str(e),
+                )
+
+        language = self.language_detector.detect(path, content)
+        if language is None:
+            return ParsedFile(
+                path=path,
+                language="unknown",
+                content=content,
+            )
+
+        try:
+            parser = get_parser(language)
+            tree = parser.parse(content.encode("utf-8"))
+        except Exception as e:
+            # Deliberately broad: one unparsable file must not abort a whole
+            # directory scan, so the failure is recorded on the result instead.
+            return ParsedFile(
+                path=path,
+                language=language,
+                content=content,
+                error=str(e),
+            )
+
+        return ParsedFile(
+            path=path,
+            language=language,
+            content=content,
+            ast=tree,
+        )
+
+    def parse_directory(
+        self, directory: Path, max_files: int = 1000
+    ) -> list[ParsedFile]:
+        """Parse the files found under ``directory``, recursively.
+
+        Every file encountered is passed to ``parse_file`` and counts toward
+        the limit, whether or not it parses. Directories and file names are
+        visited in sorted order so the selection is reproducible when the
+        limit cuts the walk short.
+
+        Args:
+            directory: Root of the tree to walk.
+            max_files: Maximum number of files to process; values <= 0 yield
+                an empty list.
+
+        Returns:
+            One ``ParsedFile`` per processed file, in visiting order.
+        """
+        parsed_files: list[ParsedFile] = []
+        if max_files <= 0:
+            return parsed_files
+
+        for root, dirnames, filenames in os.walk(directory):
+            # Sorting in place also fixes the order os.walk descends in.
+            dirnames.sort()
+            for filename in sorted(filenames):
+                parsed_files.append(self.parse_file(Path(root) / filename))
+                if len(parsed_files) >= max_files:
+                    return parsed_files
+
+        return parsed_files
+
+    def extract_imports(self, parsed_file: ParsedFile) -> list[ImportStatement]:
+        """Extract import statements from a parsed file.
+
+        Args:
+            parsed_file: Result of ``parse_file``.
+
+        Returns:
+            The imports found for Python, JavaScript/TypeScript and Go files.
+            An empty list when the file has no AST or the language is not one
+            of those.
+        """
+        if parsed_file.ast is None:
+            return []
+
+        language = parsed_file.language
+        content = parsed_file.content
+
+        if language == "python":
+            return self._extract_python_imports(content, parsed_file.path)
+        if language in ("javascript", "typescript"):
+            return self._extract_js_imports(content, parsed_file.path)
+        if language == "go":
+            return self._extract_go_imports(content, parsed_file.path)
+        return []
+
+    def _extract_python_imports(
+        self, content: str, path: Path
+    ) -> list[ImportStatement]:
+        """Extract Python import statements.
+
+        ``import a.b as c, d`` yields one entry per comma-separated module,
+        each reduced to its top-level package (``a``, ``d``) with its alias if
+        present. ``from m import x as y, z`` yields one entry with module
+        ``m`` and names ``["x", "z"]``. Trailing ``#`` comments are ignored.
+        Names on continuation lines of a multi-line ``from`` import are not
+        collected.
+
+        Args:
+            content: Source text to scan.
+            path: Unused; kept for a uniform extractor signature.
+
+        Returns:
+            The imports in file order, with 1-based line numbers.
+        """
+        imports: list[ImportStatement] = []
+
+        for line_number, raw in enumerate(content.split("\n"), start=1):
+            line = raw.split("#", 1)[0].strip()
+
+            if line.startswith("import "):
+                for clause in line[7:].split(","):
+                    tokens = clause.split()
+                    if not tokens:
+                        continue
+                    has_alias = len(tokens) >= 3 and tokens[1] == "as"
+                    imports.append(
+                        ImportStatement(
+                            module=tokens[0].split(".")[0],
+                            alias=tokens[2] if has_alias else None,
+                            line_number=line_number,
+                            is_from=False,
+                        )
+                    )
+            elif line.startswith("from "):
+                tokens = line[5:].split()
+                if not tokens:
+                    continue
+                names: list[str] = []
+                if "import" in tokens:
+                    # Re-join what follows ``import`` so names separated by
+                    # ", " are all seen, then drop parentheses/backslashes.
+                    remainder = " ".join(tokens[tokens.index("import") + 1:])
+                    for clause in remainder.strip("()\\ ").split(","):
+                        name_tokens = clause.split()
+                        if name_tokens:
+                            names.append(name_tokens[0])
+                imports.append(
+                    ImportStatement(
+                        module=tokens[0],
+                        line_number=line_number,
+                        is_from=True,
+                        names=names,
+                    )
+                )
+
+        return imports
+
+    def _extract_js_imports(
+        self, content: str, path: Path
+    ) -> list[ImportStatement]:
+        """Extract JavaScript/TypeScript import statements.
+
+        Lines starting with ``import `` contribute the first quoted string on
+        the line (``is_from=True``). On any other non-comment line, each
+        ``require("...")`` call contributes one entry (``is_from=False``), so
+        ``const x = require("y")`` is recognised as well as a bare call.
+
+        Args:
+            content: Source text to scan.
+            path: Unused; kept for a uniform extractor signature.
+
+        Returns:
+            The imports in file order, with 1-based line numbers.
+        """
+        imports: list[ImportStatement] = []
+
+        for line_number, raw in enumerate(content.split("\n"), start=1):
+            line = raw.strip()
+            # Skip comment lines so a commented-out require() is not reported.
+            if line.startswith(("//", "/*", "*")):
+                continue
+
+            if line.startswith("import "):
+                match = self._JS_IMPORT_RE.search(line)
+                if match:
+                    imports.append(
+                        ImportStatement(
+                            module=match.group(1),
+                            line_number=line_number,
+                            is_from=True,
+                        )
+                    )
+                continue
+
+            for match in self._JS_REQUIRE_RE.finditer(line):
+                imports.append(
+                    ImportStatement(
+                        module=match.group(1),
+                        line_number=line_number,
+                        is_from=False,
+                    )
+                )
+
+        return imports
+
+    def _extract_go_imports(
+        self, content: str, path: Path
+    ) -> list[ImportStatement]:
+        """Extract Go import statements.
+
+        Handles both the block form (``import ( ... )``) and the single-line
+        form, each with an optional alias (``f "fmt"``, ``_ "pkg"``,
+        ``. "pkg"``) that is stored in ``alias``. Lines inside a block that do
+        not start with an import spec (blank or comment lines) are skipped.
+
+        Args:
+            content: Source text to scan.
+            path: Unused; kept for a uniform extractor signature.
+
+        Returns:
+            The imports in file order, with 1-based line numbers and
+            ``is_from=False``.
+        """
+        imports: list[ImportStatement] = []
+        in_import_block = False
+
+        for line_number, raw in enumerate(content.split("\n"), start=1):
+            line = raw.strip()
+
+            if self._GO_BLOCK_OPEN_RE.fullmatch(line):
+                in_import_block = True
+                continue
+
+            if in_import_block:
+                if line.startswith(")"):
+                    in_import_block = False
+                    continue
+                match = self._GO_IMPORT_SPEC_RE.match(line)
+            else:
+                match = self._GO_SINGLE_IMPORT_RE.match(line)
+
+            if match:
+                imports.append(
+                    ImportStatement(
+                        module=match.group(2),
+                        alias=match.group(1),
+                        line_number=line_number,
+                        is_from=False,
+                    )
+                )
+
+        return imports