Add indexer modules (base, openapi, readme, code)

2026-02-03 01:21:36 +00:00
parent 42ff56b5d8
commit 8dcf6cc0be

src/indexer/readme.py (new file, 257 lines)

@@ -0,0 +1,257 @@
"""README/Markdown file indexer."""
from pathlib import Path
from typing import List, Tuple

from src.indexer.base import BaseIndexer
from src.models.document import Document, SourceType
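
# NOTE: BaseIndexer (see src/indexer/base.py, added in this commit) is assumed
# to provide the _find_files() and _generate_id() helpers called below; they
# are not defined in this file.
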
class READMEIndexer(BaseIndexer):
"""Indexer for README and Markdown files."""
source_type = SourceType.README
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt"}
def __init__(self):
self._documents: List[Document] = []
def index(
self, path: Path, recursive: bool = False, chunk_size: int = 1000
) -> List[Document]:
"""Index README/Markdown files from the given path.
Args:
path: Path to file or directory
recursive: Whether to search recursively
chunk_size: Maximum chunk size in characters
Returns:
List of indexed Document objects
"""
self._documents = []
for file_path in self._find_files(path, recursive):
try:
docs = self._parse_file(file_path, chunk_size)
self._documents.extend(docs)
except Exception as e:
print(f"Warning: Failed to parse {file_path}: {e}")
return self._documents
def _parse_file(
self, file_path: Path, chunk_size: int = 1000
) -> List[Document]:
"""Parse a single Markdown file.
Args:
file_path: Path to the Markdown file
chunk_size: Maximum chunk size
Returns:
List of Document objects
"""
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
title = self._extract_title(content, file_path.stem)
sections = self._parse_sections(content)
documents = []
doc_id_base = self._generate_id(file_path)
if not sections:
doc = Document(
id=doc_id_base,
content=content.strip(),
source_type=self.source_type,
title=title,
file_path=str(file_path),
metadata={"section": "root"},
)
documents.append(doc)
else:
for i, (section_title, section_content, level) in enumerate(sections):
chunks = self._chunk_content(
section_content, section_title, chunk_size
)
for j, chunk in enumerate(chunks):
doc_id = f"{doc_id_base}_section_{i}_{j}" if len(chunks) > 1 else f"{doc_id_base}_section_{i}"
doc = Document(
id=doc_id,
content=chunk,
source_type=self.source_type,
title=f"{title} - {section_title}",
file_path=str(file_path),
metadata={
"section": section_title,
"section_level": level,
"chunk_index": j,
"total_chunks": len(chunks),
},
)
documents.append(doc)
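            # For files that collapse to a single section, also keep a
            # full-document copy: section content excludes the raw heading
            # line, so the full copy keeps it searchable.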
if len(sections) == 1:
full_doc = Document(
id=f"{doc_id_base}_full",
content=content.strip(),
source_type=self.source_type,
title=f"{title} (Full)",
file_path=str(file_path),
metadata={"section": "full_document"},
)
documents.append(full_doc)
return documents
def _extract_title(self, content: str, default: str) -> str:
"""Extract the title from Markdown content.
Args:
content: Markdown content
default: Default title if none found
Returns:
Extracted title
"""
for line in content.split("\n"):
line = line.strip()
if line.startswith("# "):
return line[2:].strip()
return default
def _parse_sections(
self, content: str
) -> List[Tuple[str, str, int]]:
"""Parse Markdown content into sections.
Args:
content: Markdown content
Returns:
List of (title, content, level) tuples
"""
        sections = []
        lines = content.split("\n")
        current_title = ""
        current_level = 0
        current_lines: List[str] = []
        in_code_block = False
        code_fence = "```"
        for line in lines:
            if line.startswith(code_fence):
                # Toggle fence state; keep both opening and closing fences
                # in the section body.
                in_code_block = not in_code_block
                current_lines.append(line)
                continue
            if not in_code_block and line.startswith("#"):
                # Flush the accumulated section before starting a new one.
                if current_title or any(l.strip() for l in current_lines):
                    sections.append(
                        (current_title, "\n".join(current_lines), current_level)
                    )
                header = line.lstrip("#")
                current_level = len(line) - len(header)
                current_title = header.strip()
                current_lines = []
            else:
                current_lines.append(line)
        if current_title or any(l.strip() for l in current_lines):
            sections.append(
                (current_title, "\n".join(current_lines), current_level)
            )
        return sections
def _chunk_content(
self, content: str, section_title: str, max_size: int
) -> List[str]:
"""Chunk content into smaller pieces.
Args:
content: Section content
section_title: Section title for context
max_size: Maximum chunk size
Returns:
List of content chunks
"""
if len(content) <= max_size:
return [content]
chunks = []
current_chunk = []
current_size = 0
paragraphs = self._split_paragraphs(content)
for para in paragraphs:
para_size = len(para)
if current_size + para_size > max_size and current_chunk:
chunks.append("\n\n".join(current_chunk))
current_chunk = []
current_size = 0
current_chunk.append(para)
current_size += para_size
if current_chunk:
chunks.append("\n\n".join(current_chunk))
return chunks
def _split_paragraphs(self, content: str) -> List[str]:
"""Split content into paragraphs.
Args:
content: Section content
Returns:
List of paragraphs
"""
paragraphs = []
current_lines = []
for line in content.split("\n"):
stripped = line.strip()
if stripped:
current_lines.append(line)
elif current_lines:
paragraphs.append("\n".join(current_lines))
current_lines = []
if current_lines:
paragraphs.append("\n".join(current_lines))
return paragraphs
def _is_supported_file(self, path: Path) -> bool:
"""Check if the file is a supported Markdown file.
Args:
path: Path to the file
Returns:
True if the file extension is supported
"""
return path.suffix.lower() in self.SUPPORTED_EXTENSIONS
def get_documents(self) -> List[Document]:
"""Get all indexed documents.
Returns:
List of Document objects
"""
return self._documents
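
# Example usage (a minimal sketch; the docs/ path and chunk size are
# placeholders, not part of this commit):
#
#     from pathlib import Path
#     from src.indexer.readme import READMEIndexer
#
#     indexer = READMEIndexer()
#     documents = indexer.index(Path("docs"), recursive=True, chunk_size=800)
#     for doc in documents:
#         print(doc.id, doc.title)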