"""Base indexer interface for documentation parsing."""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Generator, List

from src.models.document import Document, SourceType


class BaseIndexer(ABC):
    """Abstract base class for document indexers.

    Concrete indexers must implement ``index``, ``get_documents`` and
    ``_is_supported_file``. File discovery (``_find_files``) and ID
    generation (``_generate_id``) are provided here as shared helpers.
    """

    # Declared without a value here; presumably set by each concrete indexer.
    source_type: SourceType

    @abstractmethod
    def index(self, path: Path, recursive: bool = False) -> List[Document]:
        """Index documents from the given path.

        Args:
            path: Path to file or directory to index
            recursive: Whether to search directories recursively

        Returns:
            List of indexed Document objects
        """

    @abstractmethod
    def get_documents(self) -> List[Document]:
        """Get all indexed documents.

        Returns:
            List of Document objects
        """

    def _find_files(self, path: Path, recursive: bool = False) -> Generator[Path, None, None]:
        """Find files to index in the given path.

        A single file is yielded only if ``_is_supported_file`` accepts it.
        A directory is scanned (one level, or the whole tree when
        ``recursive`` is true) and its supported files are yielded in sorted
        path order. A path that is neither a file nor a directory yields
        nothing.

        Args:
            path: Path to file or directory
            recursive: Whether to search recursively

        Yields:
            Path objects for each supported file found
        """
        if path.is_file():
            if self._is_supported_file(path):
                yield path
            return

        if not path.is_dir():
            # Missing path, or something that is neither file nor directory:
            # nothing to index.
            return

        pattern = "**/*" if recursive else "*"
        # Raw glob order follows the underlying filesystem, so sort the
        # candidates to make the indexing order reproducible.
        for file_path in sorted(path.glob(pattern)):
            if file_path.is_file() and self._is_supported_file(file_path):
                yield file_path

    @abstractmethod
    def _is_supported_file(self, path: Path) -> bool:
        """Check if the file is supported by this indexer.

        Args:
            path: Path to the file

        Returns:
            True if the file is supported
        """

    def _generate_id(self, file_path: Path, suffix: str = "") -> str:
        """Generate a document ID from the file name.

        The ID is the file stem (name without its final extension) with
        spaces replaced by underscores and lowercased, followed by
        ``_<suffix>`` when a non-empty suffix is given.

        Note: only the stem is used, so files that share a stem (in different
        directories, or with different extensions) produce the same ID. Pass
        a distinguishing ``suffix`` when that matters.

        Args:
            file_path: Path to the source file
            suffix: Optional suffix to add to the ID

        Returns:
            Document ID string
        """
        stem = file_path.stem.replace(" ", "_").lower()
        if suffix:
            return f"{stem}_{suffix}"
        return stem