"""Language-specific parsers for extracting dependencies from source files.

Each parser prefers a tree-sitter grammar when the optional ``tree_sitter``
packages can be imported and falls back to regular expressions otherwise.
All parsers return a sorted, de-duplicated list of top-level package names.
"""

import logging
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Iterator, Optional

try:
    import tree_sitter
    from tree_sitter import Language
except ImportError:
    tree_sitter = None
    Language = None

try:
    import tree_sitter_python
except ImportError:
    tree_sitter_python = None

try:
    import tree_sitter_javascript
except ImportError:
    tree_sitter_javascript = None

try:
    import tree_sitter_go
except ImportError:
    tree_sitter_go = None


logger = logging.getLogger(__name__)

# NOTE: every annotation that mentions ``tree_sitter`` is a string on purpose.
# ``tree_sitter`` is ``None`` when the package is missing, and an unquoted
# ``tree_sitter.Parser`` annotation would raise AttributeError at import time,
# making the regex fallback unreachable.


class DependencyParser(ABC):
    """Abstract base class for language parsers."""

    @abstractmethod
    def parse_file(self, file_path: Path) -> list[str]:
        """Extract dependencies from a file."""

    @abstractmethod
    def get_language(self) -> str:
        """Return the language identifier."""


def get_language_library(lang: str):
    """Return the tree-sitter grammar module for ``lang``.

    Returns ``None`` when the language is unknown or its grammar package is
    not installed. TypeScript is mapped to the JavaScript grammar.
    """
    lang_map = {
        "python": tree_sitter_python,
        "javascript": tree_sitter_javascript,
        "typescript": tree_sitter_javascript,
        "go": tree_sitter_go,
    }
    return lang_map.get(lang)


def _iter_nodes(root) -> Iterator:
    """Yield ``root`` and all its descendants in document (pre-)order.

    Iterative on purpose so deeply nested syntax trees cannot hit Python's
    recursion limit.
    """
    pending = [root]
    while pending:
        node = pending.pop()
        yield node
        pending.extend(reversed(node.children))


def _node_text(node, source: bytes) -> str:
    """Return the source text covered by ``node``.

    ``start_byte``/``end_byte`` are offsets into the UTF-8 *bytes* handed to
    the parser, so the slice must be taken on bytes, not on the ``str``.
    """
    return source[node.start_byte:node.end_byte].decode("utf-8", errors="replace")


class _TreeSitterParser(DependencyParser):
    """Shared plumbing: lazy parser construction and regex fallback."""

    # Distribution name used in the "not installed" error message.
    _grammar_package = ""

    def __init__(self):
        self._parser: Optional["tree_sitter.Parser"] = None

    def _grammar(self):
        """Return the grammar module for this parser (``None`` if missing)."""
        raise NotImplementedError

    @abstractmethod
    def _regex_parse(self, content: str) -> list[str]:
        """Extract dependencies from ``content`` using regular expressions."""

    @abstractmethod
    def _extract_imports(self, node: "tree_sitter.Node", content: str) -> list[str]:
        """Extract dependencies from the syntax tree rooted at ``node``."""

    def _get_parser(self) -> "tree_sitter.Parser":
        """Build (once) and return the tree-sitter parser.

        Raises:
            ImportError: if tree-sitter or the grammar package is missing.
        """
        if self._parser is None:
            grammar = self._grammar()
            if grammar is None:
                raise ImportError(f"{self._grammar_package} is not installed")
            if tree_sitter is None:
                raise ImportError("tree-sitter is not installed")
            lang = Language(grammar.language())
            try:
                # Newer py-tree-sitter takes the language in the constructor.
                parser = tree_sitter.Parser(lang)
            except TypeError:
                # Older releases only offer the set_language() setter.
                parser = tree_sitter.Parser()
                parser.set_language(lang)
            self._parser = parser
        return self._parser

    def parse_file(self, file_path: Path) -> list[str]:
        """Return the dependencies imported by ``file_path``.

        Unreadable or non-UTF-8 files yield ``[]``. Any failure on the
        tree-sitter path is logged at debug level and the regex fallback is
        used instead, so this method is best-effort and does not raise for
        parse problems.
        """
        try:
            content = file_path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            return []

        if tree_sitter is None or self._grammar() is None:
            return self._regex_parse(content)

        try:
            tree = self._get_parser().parse(content.encode("utf-8"))
            return self._extract_imports(tree.root_node, content)
        except Exception:  # deliberate best-effort boundary
            logger.debug(
                "tree-sitter parsing failed for %s; using regex fallback",
                file_path,
                exc_info=True,
            )
            return self._regex_parse(content)


# ``from pkg.mod import ...`` or ``import a.b as c, d``. ``[ \t]`` instead of
# ``\s`` keeps a match on one line; ``[^#;\n]`` stops before a trailing comment.
_PY_IMPORT_RE = re.compile(
    r"^[ \t]*(?:from[ \t]+(?P<module>[.\w]+)[ \t]+import\b"
    r"|import[ \t]+(?P<names>[^#;\n]+))",
    re.MULTILINE,
)

# tree-sitter-python calls the node ``import_from_statement``; the older
# spelling is kept so behaviour cannot regress if a grammar ever used it.
_PY_IMPORT_NODE_TYPES = (
    "import_statement",
    "import_from_statement",
    "from_import_statement",
)


class PythonParser(_TreeSitterParser):
    """Parser for Python files using tree-sitter."""

    _grammar_package = "tree-sitter-python"

    def _grammar(self):
        return tree_sitter_python

    def get_language(self) -> str:
        return "python"

    def _regex_parse(self, content: str) -> list[str]:
        """Fallback regex-based parsing for Python.

        Returns the top-level package of every absolute import; relative
        imports (``from . import x``) are skipped.
        """
        found = set()
        for match in _PY_IMPORT_RE.finditer(content):
            module = match.group("module")
            if module is not None:
                candidates = [module]
            else:
                # "a.b as c, d" -> ["a.b", "d"]
                candidates = [
                    part.split()[0]
                    for part in match.group("names").split(",")
                    if part.strip()
                ]
            for dotted in candidates:
                top_level = dotted.split(".")[0]
                # Rejects "" (relative import) and stray tokens such as "(".
                if top_level.isidentifier():
                    found.add(top_level)
        return sorted(found)

    def _extract_imports(self, node: "tree_sitter.Node", content: str) -> list[str]:
        """Extract top-level imported packages from a tree-sitter parse tree."""
        source = content.encode("utf-8")
        found = set()
        for current in _iter_nodes(node):
            if current.type not in _PY_IMPORT_NODE_TYPES:
                continue
            for dotted in self._module_names(current, source):
                top_level = dotted.split(".")[0]
                if top_level:
                    found.add(top_level)
        return sorted(found)

    def _module_names(self, node: "tree_sitter.Node", source: bytes) -> list[str]:
        """Return every module named by one import node.

        ``import a, b as c`` yields ``["a", "b"]``. For a from-import only
        the module after ``from`` is returned, and relative from-imports
        yield ``[]`` to match the regex fallback.
        """
        if node.type == "import_statement":
            names = []
            for child in node.children:
                target = child
                if child.type == "aliased_import":
                    # ``a.b as c``: the module is the nested dotted_name.
                    target = next(
                        (sub for sub in child.children if sub.type == "dotted_name"),
                        None,
                    )
                if target is not None and target.type in ("dotted_name", "module"):
                    names.append(_node_text(target, source))
            return names

        for child in node.children:
            if child.type == "relative_import":
                return []
            if child.type in ("dotted_name", "module"):
                # The first dotted name of a from-import is the module; the
                # ones that follow are the imported names.
                return [_node_text(child, source)]
        return []

    def _get_module_name(self, node: "tree_sitter.Node", content: str) -> str:
        """Return the first module named by ``node``, or ``""`` if none."""
        names = self._module_names(node, content.encode("utf-8"))
        return names[0] if names else ""


_JS_REQUIRE_RE = re.compile(r'require\s*\(\s*["\']([^"\']+)["\']\s*\)')
# Covers default, named, namespace and combined clauses
# (``import a, { b } from "x"``); the class excludes quotes so a match can
# never run across another string literal.
_JS_IMPORT_FROM_RE = re.compile(
    r'import\s+[\w$*{},\s]+?\s+from\s+["\']([^"\']+)["\']'
)
_JS_BARE_IMPORT_RE = re.compile(r'import\s+["\']([^"\']+)["\']')
_JS_FROM_RE = re.compile(r'from\s+["\']([^"\']+)["\']')


class JavaScriptParser(_TreeSitterParser):
    """Parser for JavaScript/TypeScript files using tree-sitter.

    TypeScript sources are parsed with the JavaScript grammar.
    """

    _grammar_package = "tree-sitter-javascript"

    def __init__(self, typescript: bool = False):
        super().__init__()
        self._typescript = typescript

    def _grammar(self):
        return tree_sitter_javascript

    def get_language(self) -> str:
        return "typescript" if self._typescript else "javascript"

    @staticmethod
    def _package_name(specifier: str) -> str:
        """Reduce a module specifier to its package name.

        Relative (``./x``) and absolute (``/x``) specifiers give ``""``.
        Scoped packages keep two segments (``@scope/pkg/sub`` ->
        ``@scope/pkg``); everything else keeps the first segment.
        """
        if not specifier or specifier.startswith((".", "/")):
            return ""
        parts = specifier.split("/")
        if specifier.startswith("@") and len(parts) >= 2:
            return "/".join(parts[:2])
        return parts[0]

    def _regex_parse(self, content: str) -> list[str]:
        """Fallback regex-based parsing for JavaScript/TypeScript."""
        found = set()
        for pattern in (_JS_REQUIRE_RE, _JS_IMPORT_FROM_RE, _JS_BARE_IMPORT_RE):
            for match in pattern.finditer(content):
                package = self._package_name(match.group(1))
                if package:
                    found.add(package)
        return sorted(found)

    def _extract_imports(self, node: "tree_sitter.Node", content: str) -> list[str]:
        """Extract imported packages from a tree-sitter parse tree."""
        source = content.encode("utf-8")
        found = set()
        for current in _iter_nodes(node):
            if current.type == "import_statement":
                text = _node_text(current, source)
                match = _JS_FROM_RE.search(text) or _JS_BARE_IMPORT_RE.search(text)
            elif current.type == "call_expression":
                # Anchored: only a call whose callee is literally ``require``.
                # Nested requires are reached when the walk visits them.
                match = _JS_REQUIRE_RE.match(_node_text(current, source))
            else:
                continue
            if match:
                package = self._package_name(match.group(1))
                if package:
                    found.add(package)
        return sorted(found)


# One import spec: optional alias (identifier, ``_`` or ``.``) then a quoted
# or back-quoted path at the start of a line, so ``//`` comment lines are
# never matched.
_GO_IMPORT_SPEC_RE = re.compile(
    r'^[ \t]*(?:[\w.]+[ \t]+)?(["`])([^"`\n]+)\1', re.MULTILINE
)
_GO_SINGLE_IMPORT_RE = re.compile(
    r'^[ \t]*import[ \t]+(?:[\w.]+[ \t]+)?(["`])([^"`\n]+)\1', re.MULTILINE
)
_GO_IMPORT_BLOCK_RE = re.compile(r"^[ \t]*import[ \t]*\(([^)]*)\)", re.MULTILINE)


class GoParser(_TreeSitterParser):
    """Parser for Go files using tree-sitter."""

    _grammar_package = "tree-sitter-go"

    def _grammar(self):
        return tree_sitter_go

    def get_language(self) -> str:
        return "go"

    @staticmethod
    def _import_paths(text: str) -> list[str]:
        """Return every import path declared in ``text``.

        Handles ``import "x"``, aliased imports and grouped ``import (...)``
        blocks.
        """
        paths = [match.group(2) for match in _GO_SINGLE_IMPORT_RE.finditer(text)]
        for block in _GO_IMPORT_BLOCK_RE.finditer(text):
            paths.extend(
                match.group(2)
                for match in _GO_IMPORT_SPEC_RE.finditer(block.group(1))
            )
        return paths

    @staticmethod
    def _module_roots(paths) -> list[str]:
        """Reduce import paths to at most their first two segments.

        Paths starting with ``.`` (relative imports) are dropped.
        """
        roots = set()
        for path in paths:
            if not path or path.startswith("."):
                continue
            roots.add("/".join(path.split("/")[:2]))
        return sorted(roots)

    def _regex_parse(self, content: str) -> list[str]:
        """Fallback regex-based parsing for Go."""
        return self._module_roots(self._import_paths(content))

    def _extract_imports(self, node: "tree_sitter.Node", content: str) -> list[str]:
        """Extract imported modules from a tree-sitter parse tree."""
        source = content.encode("utf-8")
        paths = []
        for current in _iter_nodes(node):
            if current.type == "import_declaration":
                # A declaration may group several specs; collect all of them.
                paths.extend(self._import_paths(_node_text(current, source)))
        return self._module_roots(paths)


_PARSER_FACTORIES = {
    "python": PythonParser,
    "javascript": JavaScriptParser,
    "typescript": lambda: JavaScriptParser(typescript=True),
    "go": GoParser,
}

_EXTENSION_LANGUAGES = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".go": "go",
}


def get_parser(language: str) -> DependencyParser:
    """Return a new parser for ``language`` (case-insensitive).

    Raises:
        ValueError: if the language is not supported.
    """
    factory = _PARSER_FACTORIES.get(language.lower())
    if factory is None:
        raise ValueError(f"Unsupported language: {language}")
    return factory()


def detect_language(file_path: Path) -> Optional[str]:
    """Detect a file's language from its extension; ``None`` if unknown."""
    return _EXTENSION_LANGUAGES.get(file_path.suffix.lower())


def parse_dependencies(
    file_path: Path, language: Optional[str] = None
) -> list[str]:
    """Parse dependencies from a file.

    When ``language`` is omitted it is detected from the file extension; an
    undetectable language yields ``[]``. An explicit but unsupported
    ``language`` raises ``ValueError`` (from ``get_parser``).
    """
    if language is None:
        language = detect_language(file_path)
    if language is None:
        return []
    return get_parser(language).parse_file(file_path)