"""OpenAPI/Swagger specification indexer."""
|
|
|
|
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from openapi_spec_validator import validate
|
|
from yaml import safe_load
|
|
|
|
from src.indexer.base import BaseIndexer
|
|
from src.models.document import Document, SourceType
|
|
|
|
|
|
class OpenAPIIndexer(BaseIndexer):
    """Indexer for OpenAPI/Swagger specifications.

    Walks a file or directory, parses every supported spec file
    (YAML or JSON), validates it, and flattens each spec into
    searchable :class:`Document` objects: one for the API info
    section, one per operation, one per documented path, one per
    tag, and one per component schema.
    """

    source_type = SourceType.OPENAPI

    # File extensions this indexer will attempt to parse.
    SUPPORTED_EXTENSIONS = {".yaml", ".yml", ".json"}

    def __init__(self):
        # Documents collected by the most recent index() run.
        self._documents: List[Document] = []

    def index(
        self, path: Path, recursive: bool = False, batch_size: int = 32
    ) -> List[Document]:
        """Index OpenAPI specifications from the given path.

        Args:
            path: Path to file or directory
            recursive: Whether to search recursively
            batch_size: Documents per batch (for progress tracking);
                accepted for interface compatibility, unused here

        Returns:
            List of indexed Document objects
        """
        self._documents = []

        for file_path in self._find_files(path, recursive):
            try:
                self._documents.extend(self._parse_file(file_path))
            except Exception as e:
                # Best-effort indexing: one malformed file must not
                # abort the whole run.
                print(f"Warning: Failed to parse {file_path}: {e}")

        return self._documents

    def _parse_file(self, file_path: Path) -> List[Document]:
        """Parse a single OpenAPI file into documents.

        Args:
            file_path: Path to the OpenAPI file

        Returns:
            List of Document objects (empty when the file is blank)
        """
        content = file_path.read_text(encoding="utf-8")

        # JSON gets the strict parser; everything else is YAML.
        # suffix is lowercased for consistency with _is_supported_file.
        if file_path.suffix.lower() == ".json":
            spec = json.loads(content)
        else:
            spec = safe_load(content)

        if spec is None:
            return []

        validation_errors = self._validate_spec(spec, file_path)
        if validation_errors:
            # Validation failures are reported but do not block
            # extraction -- partially valid specs are still indexed.
            print(f"Warning: Validation errors in {file_path}: {validation_errors}")

        return self._extract_documents(spec, file_path)

    def _validate_spec(
        self, spec: Dict[str, Any], file_path: Path
    ) -> Optional[str]:
        """Validate an OpenAPI specification.

        Args:
            spec: The parsed specification
            file_path: Path to the source file (unused; kept for
                interface stability)

        Returns:
            None if valid, error message otherwise
        """
        try:
            validate(spec)
            return None
        except Exception as e:
            return str(e)

    def _extract_documents(
        self, spec: Dict[str, Any], file_path: Path
    ) -> List[Document]:
        """Extract searchable documents from an OpenAPI spec.

        Args:
            spec: The parsed OpenAPI specification
            file_path: Path to the source file

        Returns:
            List of Document objects
        """
        documents = []
        spec_info = spec.get("info", {})
        title = spec_info.get("title", file_path.stem)
        version = spec_info.get("version", "unknown")

        doc_id_base = self._generate_id(file_path)

        # One document for the top-level API info section.
        documents.append(
            Document(
                id=f"{doc_id_base}_info",
                content=self._format_info_content(spec_info),
                source_type=self.source_type,
                title=f"{title} - API Info",
                file_path=str(file_path),
                metadata={"version": version, "section": "info"},
            )
        )

        # One or more documents per path (operations + path summary).
        for path, path_item in spec.get("paths", {}).items():
            documents.extend(
                self._extract_path_documents(
                    path, path_item, spec, file_path, doc_id_base
                )
            )

        # BUGFIX: OpenAPI "tags" is a *list* of Tag Objects
        # ({"name": ..., "description": ...}), not a mapping.  The
        # previous tuple-unpacking loop unpacked each dict's keys
        # rather than (name, spec).
        for tag_spec in spec.get("tags", []):
            tag = tag_spec.get("name", "")
            if not tag:
                # A tag object without a name cannot be indexed.
                continue
            documents.append(
                Document(
                    id=f"{doc_id_base}_tag_{tag}",
                    content=self._format_tag_content(tag, tag_spec),
                    source_type=self.source_type,
                    title=f"Tag: {tag}",
                    file_path=str(file_path),
                    metadata={"section": "tags", "tag": tag},
                )
            )

        # One document per reusable component schema.
        for schema_name, schema in spec.get("components", {}).get("schemas", {}).items():
            schema_doc = self._extract_schema_document(
                schema_name, schema, file_path, doc_id_base
            )
            if schema_doc:
                documents.append(schema_doc)

        return documents

    def _extract_path_documents(
        self,
        path: str,
        path_item: Dict[str, Any],
        spec: Dict[str, Any],
        file_path: Path,
        doc_id_base: str,
    ) -> List[Document]:
        """Extract documents from a path item.

        Args:
            path: The path string
            path_item: The path item specification
            spec: The full OpenAPI specification
            file_path: Path to the source file
            doc_id_base: Base ID for document generation

        Returns:
            List of Document objects
        """
        documents = []
        # Short stable hash of the path for use in document IDs
        # (paths contain characters unsuitable for IDs).
        path_hash = hashlib.md5(path.encode()).hexdigest()[:8]

        methods = ["get", "post", "put", "patch", "delete", "options", "head", "trace"]

        for method in methods:
            if method in path_item:
                documents.append(
                    self._extract_operation_document(
                        method, path, path_item[method], spec, file_path,
                        doc_id_base, path_hash,
                    )
                )

        # Path-level summary/description gets its own document so it
        # is searchable independently of any single operation.
        summary = path_item.get("summary", "")
        description = path_item.get("description", "")
        if summary or description:
            documents.append(
                Document(
                    id=f"{doc_id_base}_path_{path_hash}",
                    content=f"Path: {path}\nSummary: {summary}\nDescription: {description}",
                    source_type=self.source_type,
                    title=f"Path: {path}",
                    file_path=str(file_path),
                    metadata={"section": "path", "path": path},
                )
            )

        return documents

    def _extract_operation_document(
        self,
        method: str,
        path: str,
        operation: Dict[str, Any],
        spec: Dict[str, Any],
        file_path: Path,
        doc_id_base: str,
        path_hash: str,
    ) -> Document:
        """Extract a document from an operation.

        Args:
            method: HTTP method
            path: API path
            operation: The operation specification
            spec: The full OpenAPI specification
            file_path: Path to the source file
            doc_id_base: Base ID for document generation
            path_hash: Hash of the path for ID generation

        Returns:
            Document object
        """
        # Fall back to a method+path-derived ID when the spec omits
        # operationId (it is optional in OpenAPI).
        op_id = operation.get("operationId", f"{method}_{path_hash}")
        summary = operation.get("summary", "")
        description = operation.get("description", "")
        deprecated = operation.get("deprecated", False)

        content_parts = [
            f"Method: {method.upper()}",
            f"Path: {path}",
            f"Operation ID: {op_id}",
            f"Summary: {summary}",
            f"Description: {description}",
        ]

        if deprecated:
            content_parts.append("Status: DEPRECATED")

        tags = operation.get("tags", [])
        if tags:
            content_parts.append(f"Tags: {', '.join(tags)}")

        parameters = operation.get("parameters", [])
        if parameters:
            param_content = self._format_parameters(parameters)
            content_parts.append(f"Parameters:\n{param_content}")

        request_body = operation.get("requestBody", {})
        if request_body:
            rb_content = self._format_request_body(request_body, spec)
            content_parts.append(f"Request Body:\n{rb_content}")

        # Responses are always rendered, even when empty, so every
        # operation document has a Responses section.
        resp_content = self._format_responses(operation.get("responses", {}))
        content_parts.append(f"Responses:\n{resp_content}")

        return Document(
            id=f"{doc_id_base}_{op_id}",
            content="\n".join(content_parts),
            source_type=self.source_type,
            title=f"{method.upper()} {path}",
            file_path=str(file_path),
            metadata={
                "section": "operation",
                "method": method,
                "path": path,
                "operation_id": op_id,
                "deprecated": deprecated,
            },
        )

    def _format_parameters(self, parameters: List[Dict[str, Any]]) -> str:
        """Format parameters for display.

        Args:
            parameters: List of parameter specifications

        Returns:
            Formatted parameter string
        """
        lines = []
        for param in parameters:
            name = param.get("name", "unknown")
            in_loc = param.get("in", "unknown")
            required = param.get("required", False)
            description = param.get("description", "")
            param_type = param.get("schema", {}).get("type", "any")

            lines.append(
                f"  - {name} ({in_loc}, {'required' if required else 'optional'}): {param_type}"
            )
            if description:
                lines.append(f"    Description: {description}")

        return "\n".join(lines) if lines else "  No parameters"

    def _format_request_body(
        self, request_body: Dict[str, Any], spec: Dict[str, Any]
    ) -> str:
        """Format request body for display.

        Args:
            request_body: Request body specification
            spec: The full OpenAPI specification

        Returns:
            Formatted request body string
        """
        lines = []
        description = request_body.get("description", "")
        if description:
            lines.append(f"Description: {description}")

        required = request_body.get("required", False)
        lines.append(f"Required: {required}")

        for content_type, content_spec in request_body.get("content", {}).items():
            schema = content_spec.get("schema", {})
            # Inline $ref schemas so the document text contains the
            # actual definition, not just a pointer.
            schema_ref = schema.get("$ref", "")
            if schema_ref:
                resolved = self._resolve_ref(schema_ref, spec)
                if resolved:
                    schema = resolved
            lines.append(f"Content-Type: {content_type}")
            lines.append(f"Schema: {json.dumps(schema, indent=4)}")

        return "\n".join(lines)

    def _format_responses(self, responses: Dict[str, Any]) -> str:
        """Format responses for display.

        Args:
            responses: Response specifications

        Returns:
            Formatted response string
        """
        lines = []
        for status_code, response in responses.items():
            description = response.get("description", "")
            lines.append(f"  {status_code}: {description}")

            for content_type, content_spec in response.get("content", {}).items():
                schema = content_spec.get("schema", {})
                if schema:
                    schema_type = schema.get("type", "unknown")
                    lines.append(f"    Content-Type: {content_type}")
                    lines.append(f"    Schema Type: {schema_type}")

        return "\n".join(lines) if lines else "  No responses defined"

    def _resolve_ref(self, ref: str, spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Resolve a local ``$ref`` reference.

        Only document-local references (``#/...``) are supported;
        external file/URL refs return None.

        Args:
            ref: The reference string
            spec: The full OpenAPI specification

        Returns:
            Resolved schema or None
        """
        if not ref.startswith("#/"):
            return None

        # Walk the spec dict segment by segment, e.g.
        # "#/components/schemas/Pet" -> spec["components"]["schemas"]["Pet"].
        current = spec
        for part in ref[2:].split("/"):
            if isinstance(current, dict):
                current = current.get(part)
            else:
                return None

        return current

    def _extract_schema_document(
        self,
        schema_name: str,
        schema: Dict[str, Any],
        file_path: Path,
        doc_id_base: str,
    ) -> Document:
        """Extract a document from a component schema.

        Args:
            schema_name: Name of the schema
            schema: Schema specification
            file_path: Path to the source file
            doc_id_base: Base ID for document generation

        Returns:
            Document object
        """
        content_parts = [
            f"Schema: {schema_name}",
        ]

        # "type" is optional in OpenAPI schemas; "object" is the
        # conventional default.
        content_parts.append(f"Type: {schema.get('type', 'object')}")

        description = schema.get("description", "")
        if description:
            content_parts.append(f"Description: {description}")

        required_fields = schema.get("required", [])
        if required_fields:
            content_parts.append(f"Required Fields: {', '.join(required_fields)}")

        properties = schema.get("properties", {})
        if properties:
            prop_lines = ["Properties:"]
            for prop_name, prop_spec in properties.items():
                prop_type = prop_spec.get("type", "unknown")
                prop_desc = prop_spec.get("description", "")
                prop_required = prop_name in required_fields
                prop_lines.append(
                    f"  - {prop_name} ({prop_type}, {'required' if prop_required else 'optional'})"
                )
                if prop_desc:
                    prop_lines.append(f"    Description: {prop_desc}")
            content_parts.append("\n".join(prop_lines))

        return Document(
            id=f"{doc_id_base}_schema_{schema_name}",
            content="\n".join(content_parts),
            source_type=self.source_type,
            title=f"Schema: {schema_name}",
            file_path=str(file_path),
            metadata={"section": "schema", "schema_name": schema_name},
        )

    def _format_info_content(self, info: Dict[str, Any]) -> str:
        """Format the API info section.

        Args:
            info: Info object from specification

        Returns:
            Formatted info content
        """
        parts = []
        for key in ["title", "version", "description", "termsOfService", "contact", "license"]:
            if key in info:
                value = info[key]
                if isinstance(value, dict):
                    # contact/license are objects; surface their
                    # name and url fields only.
                    if "name" in value:
                        parts.append(f"{key}: {value['name']}")
                    if "url" in value:
                        parts.append(f"{key} URL: {value['url']}")
                else:
                    parts.append(f"{key}: {value}")
        return "\n".join(parts)

    def _format_tag_content(self, tag: str, tag_spec: Dict[str, Any]) -> str:
        """Format tag content.

        Args:
            tag: Tag name
            tag_spec: Tag specification

        Returns:
            Formatted tag content
        """
        parts = [f"Tag: {tag}"]
        description = tag_spec.get("description", "")
        if description:
            parts.append(f"Description: {description}")
        external_docs = tag_spec.get("externalDocs", {})
        if external_docs:
            docs_url = external_docs.get("url", "")
            if docs_url:
                parts.append(f"External Docs: {docs_url}")
        return "\n".join(parts)

    def _is_supported_file(self, path: Path) -> bool:
        """Check if the file is a supported OpenAPI file.

        Args:
            path: Path to the file

        Returns:
            True if the file extension is supported
        """
        return path.suffix.lower() in self.SUPPORTED_EXTENSIONS

    def get_documents(self) -> List[Document]:
        """Get all indexed documents.

        Returns:
            List of Document objects
        """
        return self._documents
|