"""Code parsing module using tree-sitter."""

import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from tree_sitter import Language, Parser

from .language_detection import LanguageDetector

# The import extractors below are line-oriented text scanners. Their patterns
# are compiled once here instead of being rebuilt for every scanned line.

# Python: ``import a, b as c`` and ``from pkg import x, y as z``.
_PY_IMPORT_RE = re.compile(r"import\s+(.+)")
_PY_FROM_IMPORT_RE = re.compile(r"from\s+(\S+)\s+import\b\s*(.*)")

# JavaScript/TypeScript: the first quoted string on an ``import`` line, and
# every ``require("...")`` call on any other line.
_JS_IMPORT_RE = re.compile(
    r"(?:import\s+(?:\{[^}]*\}|\*|[\w$]+)(?:\s+as\s+[\w$]+)?\s+from\s+)?['\"]([^'\"]+)['\"]"
)
_JS_REQUIRE_RE = re.compile(r"\brequire\(\s*['\"]([^'\"]+)['\"]")

# Go: ``import (`` opens a block; an import spec is ``[alias] "path"`` where
# the alias may also be ``_`` or ``.``.
_GO_BLOCK_OPEN_RE = re.compile(r"import\s*\(")
_GO_IMPORT_SPEC_RE = re.compile(r'(?:([A-Za-z_]\w*|\.)\s+)?"([^"]+)"')
_GO_SINGLE_IMPORT_RE = re.compile(r'import\s+(?:([A-Za-z_]\w*|\.)\s+)?"([^"]+)"')


@dataclass
class ParsedFile:
    """Represents a parsed source file.

    Attributes:
        path: Location of the file that was parsed.
        language: Detected language name, or ``"unknown"`` when the file could
            not be read or no language was detected.
        content: Text of the file (empty when reading failed).
        ast: Tree returned by the tree-sitter parser, or ``None`` when the
            file was not parsed.
        error: String form of the exception raised while reading or parsing,
            or ``None`` when no error occurred.
    """

    path: Path
    language: str
    content: str
    ast: Optional[object] = None
    error: Optional[str] = None


@dataclass
class ImportStatement:
    """Represents an import/require statement.

    Attributes:
        module: Name or path of the imported module.
        alias: Local name the module is bound to, when one is given.
        line_number: 1-based line on which the statement was found.
        is_from: ``True`` for ``from ... import`` / ES-module style imports,
            ``False`` for plain ``import`` / ``require`` style imports.
        names: Names imported from the module (Python ``from`` imports only).
    """

    module: str
    alias: Optional[str] = None
    line_number: int = 0
    is_from: bool = False
    names: list[str] = field(default_factory=list)


# Cache of parsers keyed by language name; filled lazily by ``get_parser``.
LANGUAGE_PARSERS: dict[str, Parser] = {}


def get_parser(language: str) -> Parser:
    """Get or create a tree-sitter parser for a language.

    Args:
        language: Language name to load a grammar for.

    Returns:
        The cached parser for ``language``, created on first use.

    Raises:
        ValueError: If the grammar or parser cannot be created. The original
            exception is attached as ``__cause__``.
    """
    cached = LANGUAGE_PARSERS.get(language)
    if cached is not None:
        return cached

    try:
        # NOTE(review): confirm that `Language.for_language` and
        # `Parser.set_language` exist in the installed py-tree-sitter
        # release; if they do not, every call ends in the ValueError below.
        lang = Language.for_language(language)
        parser = Parser()
        parser.set_language(lang)
    except Exception as e:
        raise ValueError(f"Failed to load parser for {language}: {e}") from e

    LANGUAGE_PARSERS[language] = parser
    return parser


def _split_alias(clause: str) -> tuple[str, Optional[str]]:
    """Split ``name as alias`` into ``(name, alias)``.

    Returns ``("", None)`` for a blank clause and ``(name, None)`` when no
    ``as`` part is present.
    """
    tokens = clause.split()
    if not tokens:
        return "", None
    if len(tokens) >= 3 and tokens[1] == "as":
        return tokens[0], tokens[2]
    return tokens[0], None


class CodeParser:
    """Parses source code files using tree-sitter."""

    def __init__(self) -> None:
        self.language_detector = LanguageDetector()

    def parse_file(self, path: Path, content: Optional[str] = None) -> ParsedFile:
        """Parse a single file.

        Args:
            path: File to parse.
            content: Text to parse instead of reading ``path`` from disk.

        Returns:
            A ``ParsedFile``. Read and parse failures are not raised; they are
            reported through the ``error`` field with ``ast`` left as ``None``.
        """
        try:
            if content is None:
                content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            return ParsedFile(
                path=path,
                language="unknown",
                content="",
                error=str(e),
            )

        language = self.language_detector.detect(path, content)
        if language is None:
            return ParsedFile(
                path=path,
                language="unknown",
                content=content,
            )

        # Broad catch is deliberate: any grammar/parser failure is recorded on
        # the result so one bad file does not abort a directory scan.
        try:
            parser = get_parser(language)
            tree = parser.parse(content.encode("utf-8"))
        except Exception as e:
            return ParsedFile(
                path=path,
                language=language,
                content=content,
                error=str(e),
            )

        return ParsedFile(
            path=path,
            language=language,
            content=content,
            ast=tree,
        )

    def parse_directory(
        self, directory: Path, max_files: int = 1000
    ) -> list[ParsedFile]:
        """Parse all source files in a directory.

        Args:
            directory: Root of the tree to walk.
            max_files: Upper bound on the number of files to parse.

        Returns:
            One ``ParsedFile`` per visited file, at most ``max_files`` entries.
        """
        parsed_files: list[ParsedFile] = []

        for root, dirnames, filenames in os.walk(directory):
            if len(parsed_files) >= max_files:
                break

            # Sort in place so traversal order, and therefore which files
            # survive the ``max_files`` cut, does not depend on the order in
            # which the OS lists directory entries.
            dirnames.sort()

            for filename in sorted(filenames):
                if len(parsed_files) >= max_files:
                    break
                parsed_files.append(self.parse_file(Path(root) / filename))

        return parsed_files

    def extract_imports(self, parsed_file: ParsedFile) -> list[ImportStatement]:
        """Extract import statements from a parsed file.

        Args:
            parsed_file: Result of ``parse_file``.

        Returns:
            The imports found, or an empty list when the file has no AST or
            its language has no extractor.
        """
        # NOTE(review): the extractors only read ``content``, yet files whose
        # tree-sitter parse failed are skipped here; confirm that is intended.
        if parsed_file.ast is None:
            return []

        language = parsed_file.language
        content = parsed_file.content

        if language == "python":
            return self._extract_python_imports(content, parsed_file.path)
        if language in ("javascript", "typescript"):
            return self._extract_js_imports(content, parsed_file.path)
        if language == "go":
            return self._extract_go_imports(content, parsed_file.path)
        return []

    def _extract_python_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract Python import statements.

        Scans line by line. ``import a.b, c as d`` yields one entry per module,
        reduced to its top-level package, with the alias recorded.
        ``from pkg import x, y as z`` yields one entry for ``pkg`` whose
        ``names`` are the imported names without aliases. Names that continue
        on following lines of a parenthesised import are not collected.

        Args:
            content: Source text to scan.
            path: Unused; kept for a uniform extractor signature.

        Returns:
            Imports in source order with 1-based line numbers.
        """
        imports: list[ImportStatement] = []

        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            # Import statements contain no string literals, so cutting at the
            # first '#' safely removes a trailing comment.
            code = raw_line.split("#", 1)[0]

            # ';' may separate several simple statements on one line.
            for statement in code.split(";"):
                statement = statement.strip()

                plain = _PY_IMPORT_RE.match(statement)
                if plain:
                    for clause in plain.group(1).split(","):
                        target, alias = _split_alias(clause)
                        if not target:
                            continue
                        imports.append(
                            ImportStatement(
                                module=target.split(".")[0],
                                alias=alias,
                                line_number=line_number,
                                is_from=False,
                            )
                        )
                    continue

                # Requiring the ``import`` keyword keeps prose such as
                # "from the cache ..." in docstrings from being reported.
                from_import = _PY_FROM_IMPORT_RE.match(statement)
                if from_import:
                    spec = from_import.group(2).strip().strip("()\\")
                    names = [
                        name
                        for name, _ in map(_split_alias, spec.split(","))
                        if name
                    ]
                    imports.append(
                        ImportStatement(
                            module=from_import.group(1),
                            line_number=line_number,
                            is_from=True,
                            names=names,
                        )
                    )

        return imports

    def _extract_js_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract JavaScript/TypeScript import statements.

        Lines starting with ``import`` contribute the first quoted module
        specifier (``is_from=True``). On every other non-comment line each
        ``require("...")`` call contributes one entry (``is_from=False``),
        wherever it appears in the line.

        Args:
            content: Source text to scan.
            path: Unused; kept for a uniform extractor signature.

        Returns:
            Imports in source order with 1-based line numbers.
        """
        imports: list[ImportStatement] = []

        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()

            if line.startswith("import "):
                match = _JS_IMPORT_RE.search(line)
                if match:
                    imports.append(
                        ImportStatement(
                            module=match.group(1),
                            line_number=line_number,
                            is_from=True,
                        )
                    )
                continue

            # Do not report requires that are commented out.
            if line.startswith("//"):
                continue

            for match in _JS_REQUIRE_RE.finditer(line):
                imports.append(
                    ImportStatement(
                        module=match.group(1),
                        line_number=line_number,
                        is_from=False,
                    )
                )

        return imports

    def _extract_go_imports(
        self, content: str, path: Path
    ) -> list[ImportStatement]:
        """Extract Go import statements.

        Handles single ``import [alias] "path"`` lines and parenthesised
        ``import ( ... )`` blocks, recording the alias when one is present.

        Args:
            content: Source text to scan.
            path: Unused; kept for a uniform extractor signature.

        Returns:
            Imports in source order with 1-based line numbers.
        """
        imports: list[ImportStatement] = []
        in_import_block = False

        for line_number, raw_line in enumerate(content.split("\n"), start=1):
            line = raw_line.strip()

            if in_import_block:
                if line == ")":
                    in_import_block = False
                    continue
                # A commented-out spec inside the block is not an import.
                if line.startswith("//"):
                    continue
                match = _GO_IMPORT_SPEC_RE.search(line)
            elif _GO_BLOCK_OPEN_RE.fullmatch(line):
                in_import_block = True
                continue
            else:
                match = _GO_SINGLE_IMPORT_RE.match(line)

            if match:
                imports.append(
                    ImportStatement(
                        module=match.group(2),
                        alias=match.group(1),
                        line_number=line_number,
                        is_from=False,
                    )
                )

        return imports