fix: resolve CI/CD issues with proper package structure and imports
This commit is contained in:
544
src/local_api_docs_search/indexer/code.py
Normal file
544
src/local_api_docs_search/indexer/code.py
Normal file
@@ -0,0 +1,544 @@
|
||||
"""Code comment indexer for Python, JavaScript, and TypeScript files."""
|
||||
|
||||
import ast
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from local_api_docs_search.indexer.base import BaseIndexer
|
||||
from local_api_docs_search.models.document import Document, SourceType
|
||||
|
||||
|
||||
class CodeIndexer(BaseIndexer):
    """Indexer for code comments and docstrings.

    Extracts module docstrings, function/class docstrings (Python) and
    JSDoc comments (JavaScript/TypeScript), turning each documented item
    into a searchable Document.
    """

    # Source category stamped onto every Document this indexer emits.
    source_type = SourceType.CODE

    # Lowercased file extension -> parser family used for that file.
    SUPPORTED_EXTENSIONS: Dict[str, str] = {
        ".py": "python",
        ".js": "javascript",
        ".jsx": "javascript",
        ".ts": "typescript",
        ".tsx": "typescript",
    }
|
||||
|
||||
def __init__(self):
|
||||
self._documents: List[Document] = []
|
||||
self._parsed_files: Dict[str, Any] = {}
|
||||
|
||||
def index(
|
||||
self, path: Path, recursive: bool = False, batch_size: int = 32
|
||||
) -> List[Document]:
|
||||
"""Index code files from the given path.
|
||||
|
||||
Args:
|
||||
path: Path to file or directory
|
||||
recursive: Whether to search recursively
|
||||
batch_size: Documents per batch (for progress tracking)
|
||||
|
||||
Returns:
|
||||
List of indexed Document objects
|
||||
"""
|
||||
self._documents = []
|
||||
self._parsed_files = {}
|
||||
|
||||
for file_path in self._find_files(path, recursive):
|
||||
try:
|
||||
docs = self._parse_file(file_path)
|
||||
self._documents.extend(docs)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to parse {file_path}: {e}")
|
||||
|
||||
return self._documents
|
||||
|
||||
def _parse_file(self, file_path: Path) -> List[Document]:
|
||||
"""Parse a single code file.
|
||||
|
||||
Args:
|
||||
file_path: Path to the code file
|
||||
|
||||
Returns:
|
||||
List of Document objects
|
||||
"""
|
||||
ext = file_path.suffix.lower()
|
||||
language = self.SUPPORTED_EXTENSIONS.get(ext)
|
||||
|
||||
if language is None:
|
||||
return []
|
||||
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
self._parsed_files[str(file_path)] = content
|
||||
|
||||
if language == "python":
|
||||
return self._parse_python(content, file_path)
|
||||
elif language in ("javascript", "typescript"):
|
||||
return self._parse_js_ts(content, file_path, language)
|
||||
|
||||
return []
|
||||
|
||||
    def _parse_python(self, content: str, file_path: Path) -> List[Document]:
        """Parse Python file for docstrings.

        Emits one Document for the module docstring (if present), one per
        documented function/class anywhere in the file, and a trailing
        index Document summarizing what was documented.

        Args:
            content: Python file content
            file_path: Path to the file

        Returns:
            List of Document objects (empty when the source does not parse)
        """
        documents = []
        doc_id_base = self._generate_id(file_path)

        try:
            tree = ast.parse(content)
        except SyntaxError:
            # Unparseable source yields no documents instead of failing the run.
            return []

        module_doc = self._get_module_docstring(content)
        if module_doc:
            doc = Document(
                id=f"{doc_id_base}_module",
                content=module_doc,
                source_type=self.source_type,
                title=f"Module: {file_path.stem}",
                file_path=str(file_path),
                metadata={"doc_type": "module"},
            )
            documents.append(doc)

        # ast.walk visits nested definitions too, so methods and inner
        # functions are indexed alongside top-level ones.
        for node in ast.walk(tree):
            if isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
                doc = self._parse_python_function(node, file_path, doc_id_base)
                if doc:
                    documents.append(doc)
            elif isinstance(node, ast.ClassDef):
                doc = self._parse_python_class(node, file_path, doc_id_base)
                if doc:
                    documents.append(doc)

        # Only append the index when at least one item was documented.
        if documents:
            index_doc = Document(
                id=f"{doc_id_base}_index",
                content=self._generate_python_index(tree, file_path),
                source_type=self.source_type,
                title=f"Index: {file_path.stem}",
                file_path=str(file_path),
                metadata={"doc_type": "index"},
            )
            documents.append(index_doc)

        return documents
|
||||
|
||||
def _get_module_docstring(self, content: str) -> Optional[str]:
|
||||
"""Extract module docstring.
|
||||
|
||||
Args:
|
||||
content: Python file content
|
||||
|
||||
Returns:
|
||||
Module docstring or None
|
||||
"""
|
||||
tree = ast.parse(content)
|
||||
if tree.body and isinstance(tree.body[0], ast.Expr):
|
||||
docstring = tree.body[0].value
|
||||
if isinstance(docstring, ast.Constant) and isinstance(
|
||||
docstring.value, str
|
||||
):
|
||||
return docstring.value
|
||||
return None
|
||||
|
||||
    def _parse_python_function(
        self, node: ast.FunctionDef, file_path: Path, doc_id_base: str
    ) -> Optional[Document]:
        """Parse a Python function for docstring.

        Args:
            node: AST function node (sync or async)
            file_path: Path to the file
            doc_id_base: Base ID for document generation

        Returns:
            Document describing the function, or None when the function
            has no docstring (undocumented functions are skipped).
        """
        docstring = self._get_docstring(node)
        if not docstring:
            return None

        func_info = self._extract_python_function_info(node)

        # Flatten signature details into plain searchable text.
        content = f"Function: {node.name}\n"
        content += f"Docstring:\n{docstring}\n"
        content += f"Parameters: {', '.join(func_info['args'])}\n"
        content += f"Returns: {func_info['returns']}\n"
        content += f"Line: {node.lineno}"

        return Document(
            id=f"{doc_id_base}_func_{node.name}",
            content=content,
            source_type=self.source_type,
            title=f"Function: {node.name}",
            file_path=str(file_path),
            metadata={
                "doc_type": "function",
                "function_name": node.name,
                "line": node.lineno,
            },
        )
|
||||
|
||||
def _parse_python_class(
|
||||
self, node: ast.ClassDef, file_path: Path, doc_id_base: str
|
||||
) -> Optional[Document]:
|
||||
"""Parse a Python class for docstring.
|
||||
|
||||
Args:
|
||||
node: AST class node
|
||||
file_path: Path to the file
|
||||
doc_id_base: Base ID for document generation
|
||||
|
||||
Returns:
|
||||
Document or None
|
||||
"""
|
||||
docstring = self._get_docstring(node)
|
||||
if not docstring:
|
||||
return None
|
||||
|
||||
methods = []
|
||||
attributes = []
|
||||
|
||||
for item in node.body:
|
||||
if isinstance(item, ast.FunctionDef) or isinstance(
|
||||
item, ast.AsyncFunctionDef
|
||||
):
|
||||
if not item.name.startswith("_"):
|
||||
methods.append(item.name)
|
||||
elif isinstance(item, ast.AnnAssign) and isinstance(
|
||||
item.target, ast.Name
|
||||
):
|
||||
attributes.append(item.target.name)
|
||||
|
||||
content = f"Class: {node.name}\n"
|
||||
content += f"Docstring:\n{docstring}\n"
|
||||
if attributes:
|
||||
content += f"Attributes: {', '.join(attributes)}\n"
|
||||
if methods:
|
||||
content += f"Methods: {', '.join(methods)}\n"
|
||||
content += f"Line: {node.lineno}"
|
||||
|
||||
return Document(
|
||||
id=f"{doc_id_base}_class_{node.name}",
|
||||
content=content,
|
||||
source_type=self.source_type,
|
||||
title=f"Class: {node.name}",
|
||||
file_path=str(file_path),
|
||||
metadata={
|
||||
"doc_type": "class",
|
||||
"class_name": node.name,
|
||||
"line": node.lineno,
|
||||
},
|
||||
)
|
||||
|
||||
def _get_docstring(self, node: ast.AST) -> Optional[str]:
|
||||
"""Extract docstring from an AST node.
|
||||
|
||||
Args:
|
||||
node: AST node
|
||||
|
||||
Returns:
|
||||
Docstring or None
|
||||
"""
|
||||
if hasattr(node, "body") and node.body:
|
||||
first = node.body[0]
|
||||
if isinstance(first, ast.Expr) and isinstance(first.value, ast.Constant):
|
||||
value = first.value.value
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
return None
|
||||
|
||||
def _extract_python_function_info(
|
||||
self, node: ast.FunctionDef
|
||||
) -> Dict[str, Any]:
|
||||
"""Extract function information.
|
||||
|
||||
Args:
|
||||
node: AST function node
|
||||
|
||||
Returns:
|
||||
Dictionary with function information
|
||||
"""
|
||||
args = []
|
||||
defaults = []
|
||||
|
||||
for arg in node.args.args:
|
||||
if arg.arg != "self" and arg.arg != "cls":
|
||||
args.append(arg.arg)
|
||||
|
||||
for default in node.args.defaults:
|
||||
if isinstance(default, ast.Constant):
|
||||
defaults.append(str(default.value))
|
||||
|
||||
returns = "unknown"
|
||||
if node.returns:
|
||||
if isinstance(node.returns, ast.Name):
|
||||
returns = node.returns.id
|
||||
elif isinstance(node.returns, ast.Constant):
|
||||
returns = str(node.returns.value)
|
||||
|
||||
return {"args": args, "defaults": defaults, "returns": returns}
|
||||
|
||||
def _generate_python_index(
|
||||
self, tree: ast.AST, file_path: Path
|
||||
) -> str:
|
||||
"""Generate an index of all documented items.
|
||||
|
||||
Args:
|
||||
tree: Parsed AST tree
|
||||
file_path: Path to the file
|
||||
|
||||
Returns:
|
||||
Index content
|
||||
"""
|
||||
functions = []
|
||||
classes = []
|
||||
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.FunctionDef) or isinstance(
|
||||
node, ast.AsyncFunctionDef
|
||||
):
|
||||
if self._get_docstring(node) and not node.name.startswith("_"):
|
||||
functions.append(node.name)
|
||||
elif isinstance(node, ast.ClassDef):
|
||||
if self._get_docstring(node):
|
||||
classes.append(node.name)
|
||||
|
||||
content = f"File: {file_path.name}\n\n"
|
||||
if classes:
|
||||
content += "Classes:\n" + "\n".join(f" - {c}" for c in classes) + "\n\n"
|
||||
if functions:
|
||||
content += "Functions:\n" + "\n".join(f" - {f}" for f in functions)
|
||||
|
||||
return content
|
||||
|
||||
    def _parse_js_ts(
        self, content: str, file_path: Path, language: str
    ) -> List[Document]:
        """Parse JavaScript/TypeScript file for JSDoc comments.

        Args:
            content: File content
            file_path: Path to the file
            language: Language identifier ("javascript" or "typescript")

        Returns:
            List of Document objects; empty when the file contains no
            JSDoc blocks attached to declarations.
        """
        documents = []
        doc_id_base = self._generate_id(file_path)

        jsdocs = self._extract_jsdocs(content)

        # No declaration-attached JSDoc means nothing to index — the
        # module doc is only emitted alongside at least one JSDoc item.
        if not jsdocs:
            return documents

        module_doc = self._extract_js_module_doc(content)
        if module_doc:
            doc = Document(
                id=f"{doc_id_base}_module",
                content=module_doc,
                source_type=self.source_type,
                title=f"Module: {file_path.stem}",
                file_path=str(file_path),
                metadata={"doc_type": "module"},
            )
            documents.append(doc)

        # The positional index keeps generated document IDs unique even
        # when two declarations share a name.
        for i, jsdoc in enumerate(jsdocs):
            doc = self._create_jsdoc_document(jsdoc, file_path, doc_id_base, i)
            documents.append(doc)

        return documents
|
||||
|
||||
def _extract_jsdocs(self, content: str) -> List[Dict[str, Any]]:
|
||||
"""Extract JSDoc comments from content.
|
||||
|
||||
Args:
|
||||
content: File content
|
||||
|
||||
Returns:
|
||||
List of JSDoc dictionaries
|
||||
"""
|
||||
jsdocs = []
|
||||
pattern = r"/\*\*([\s\S]*?)\*/\s*(export\s+)?(async\s+)?(function|const|let|var|class|interface|type|enum)\s+(\w+)"
|
||||
matches = re.findall(pattern, content, re.MULTILINE)
|
||||
|
||||
for match in matches:
|
||||
full_comment = f"/**{match[0]}*/"
|
||||
exported = bool(match[1])
|
||||
async_kw = bool(match[2])
|
||||
decl_type = match[3]
|
||||
name = match[4]
|
||||
|
||||
parsed = self._parse_jsdoc_comment(full_comment)
|
||||
parsed.update({
|
||||
"name": name,
|
||||
"type": decl_type,
|
||||
"exported": exported,
|
||||
"async": async_kw,
|
||||
})
|
||||
jsdocs.append(parsed)
|
||||
|
||||
return jsdocs
|
||||
|
||||
def _parse_jsdoc_comment(self, comment: str) -> Dict[str, Any]:
|
||||
"""Parse a JSDoc comment.
|
||||
|
||||
Args:
|
||||
comment: JSDoc comment string
|
||||
|
||||
Returns:
|
||||
Parsed JSDoc dictionary
|
||||
"""
|
||||
result = {
|
||||
"description": "",
|
||||
"params": [],
|
||||
"returns": None,
|
||||
"examples": [],
|
||||
"throws": [],
|
||||
"see": [],
|
||||
}
|
||||
|
||||
lines = comment.strip("/**").strip("*/").split("\n")
|
||||
current_description = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip().lstrip("*").strip()
|
||||
|
||||
if line.startswith("@param"):
|
||||
param_match = re.match(r"@param\s+\{([^}]+)\}\s+(\w+)(?:\s+-)?\s*(.*)", line)
|
||||
if param_match:
|
||||
result["params"].append({
|
||||
"type": param_match.group(1),
|
||||
"name": param_match.group(2),
|
||||
"description": param_match.group(3),
|
||||
})
|
||||
elif line.startswith("@returns") or line.startswith("@return"):
|
||||
return_match = re.match(r"@returns?\{([^}]+)\}\s*(.*)", line)
|
||||
if return_match:
|
||||
result["returns"] = {
|
||||
"type": return_match.group(1),
|
||||
"description": return_match.group(2),
|
||||
}
|
||||
elif line.startswith("@example"):
|
||||
result["examples"].append(line[8:].strip())
|
||||
elif line.startswith("@throws"):
|
||||
throw_match = re.match(r"@throws\{([^}]+)\}\s*(.*)", line)
|
||||
if throw_match:
|
||||
result["throws"].append({
|
||||
"type": throw_match.group(1),
|
||||
"description": throw_match.group(2),
|
||||
})
|
||||
elif line.startswith("@see"):
|
||||
result["see"].append(line[4:].strip())
|
||||
elif line and not line.startswith("@"):
|
||||
current_description.append(line)
|
||||
|
||||
result["description"] = " ".join(current_description)
|
||||
return result
|
||||
|
||||
def _extract_js_module_doc(self, content: str) -> Optional[str]:
|
||||
"""Extract module-level documentation.
|
||||
|
||||
Args:
|
||||
content: File content
|
||||
|
||||
Returns:
|
||||
Module docstring or None
|
||||
"""
|
||||
file_doc_pattern = r"/\*\*([\s\S]*?)\*/\s*@module\s+(\w+)"
|
||||
match = re.search(file_doc_pattern, content)
|
||||
if match:
|
||||
return f"Module: {match.group(2)}\n\n{match.group(1).strip()}"
|
||||
return None
|
||||
|
||||
    def _create_jsdoc_document(
        self,
        jsdoc: Dict[str, Any],
        file_path: Path,
        doc_id_base: str,
        index: int,
    ) -> Document:
        """Create a Document from parsed JSDoc.

        Args:
            jsdoc: Parsed JSDoc dictionary (as produced by _extract_jsdocs)
            file_path: Path to the source file
            doc_id_base: Base ID for document generation
            index: Positional index, used to keep generated IDs unique

        Returns:
            Document object
        """
        content_parts = []

        decl_type = jsdoc.get("type", "unknown")
        name = jsdoc.get("name", "unknown")
        # Reconstruct the declaration header, e.g. "export async function foo".
        is_async = "async " if jsdoc.get("async") else ""
        is_exported = "export " if jsdoc.get("exported") else ""

        content_parts.append(f"{is_exported}{is_async}{decl_type} {name}")

        if jsdoc.get("description"):
            content_parts.append(f"\nDescription: {jsdoc['description']}")

        if jsdoc.get("params"):
            param_lines = ["\nParameters:"]
            for param in jsdoc["params"]:
                param_lines.append(
                    f" - {param['name']} ({param['type']}): {param['description']}"
                )
            content_parts.append("\n".join(param_lines))

        if jsdoc.get("returns"):
            ret = jsdoc["returns"]
            content_parts.append(f"\nReturns ({ret['type']}): {ret['description']}")

        if jsdoc.get("examples"):
            # Examples are numbered 1-based for readability.
            examples = "\nExamples:\n" + "\n".join(
                f" {i+1}. {ex}" for i, ex in enumerate(jsdoc["examples"])
            )
            content_parts.append(examples)

        content = "\n".join(content_parts)

        return Document(
            id=f"{doc_id_base}_jsdoc_{index}",
            content=content,
            source_type=self.source_type,
            title=f"{decl_type.capitalize()}: {name}",
            file_path=str(file_path),
            metadata={
                "doc_type": "jsdoc",
                "name": name,
                "jsdoc_type": decl_type,
            },
        )
|
||||
|
||||
def _is_supported_file(self, path: Path) -> bool:
|
||||
"""Check if the file is a supported code file.
|
||||
|
||||
Args:
|
||||
path: Path to the file
|
||||
|
||||
Returns:
|
||||
True if the file extension is supported
|
||||
"""
|
||||
return path.suffix.lower() in self.SUPPORTED_EXTENSIONS
|
||||
|
||||
def get_documents(self) -> List[Document]:
|
||||
"""Get all indexed documents.
|
||||
|
||||
Returns:
|
||||
List of Document objects
|
||||
"""
|
||||
return self._documents
|
||||
Reference in New Issue
Block a user