fix: resolve CI/CD issues with proper package structure and imports
This commit is contained in:
254
src/local_api_docs_search/indexer/readme.py
Normal file
254
src/local_api_docs_search/indexer/readme.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
"""README/Markdown file indexer."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
from local_api_docs_search.indexer.base import BaseIndexer
|
||||||
|
from local_api_docs_search.models.document import Document, SourceType
|
||||||
|
|
||||||
|
|
||||||
|
class READMEIndexer(BaseIndexer):
    """Indexer for README and Markdown files."""

    source_type = SourceType.README

    # File extensions this indexer accepts (checked case-insensitively).
    SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt"}

    def __init__(self):
        # Documents produced by the most recent index() call.
        self._documents: List[Document] = []

    def index(
        self, path: Path, recursive: bool = False, chunk_size: int = 1000
    ) -> List[Document]:
        """Index README/Markdown files from the given path.

        Args:
            path: Path to file or directory
            recursive: Whether to search recursively
            chunk_size: Maximum chunk size in characters

        Returns:
            List of indexed Document objects
        """
        self._documents = []

        for file_path in self._find_files(path, recursive):
            try:
                docs = self._parse_file(file_path, chunk_size)
                self._documents.extend(docs)
            except Exception as e:
                # Best effort: one unreadable file must not abort the run.
                print(f"Warning: Failed to parse {file_path}: {e}")

        return self._documents

    def _parse_file(
        self, file_path: Path, chunk_size: int = 1000
    ) -> List[Document]:
        """Parse a single Markdown file.

        Args:
            file_path: Path to the Markdown file
            chunk_size: Maximum chunk size

        Returns:
            List of Document objects
        """
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        title = self._extract_title(content, file_path.stem)
        sections = self._parse_sections(content)

        documents = []
        doc_id_base = self._generate_id(file_path)

        if not sections:
            # No headers and no meaningful body structure: index as one doc.
            documents.append(
                Document(
                    id=doc_id_base,
                    content=content.strip(),
                    source_type=self.source_type,
                    title=title,
                    file_path=str(file_path),
                    metadata={"section": "root"},
                )
            )
        else:
            for i, (section_title, section_content, level) in enumerate(sections):
                chunks = self._chunk_content(
                    section_content, section_title, chunk_size
                )
                for j, chunk in enumerate(chunks):
                    doc_id = (
                        f"{doc_id_base}_section_{i}_{j}"
                        if len(chunks) > 1
                        else f"{doc_id_base}_section_{i}"
                    )
                    # An untitled section (preamble before the first header)
                    # keeps the file title unadorned instead of "Title - ".
                    doc_title = (
                        f"{title} - {section_title}" if section_title else title
                    )
                    documents.append(
                        Document(
                            id=doc_id,
                            content=chunk,
                            source_type=self.source_type,
                            title=doc_title,
                            file_path=str(file_path),
                            metadata={
                                "section": section_title,
                                "section_level": level,
                                "chunk_index": j,
                                "total_chunks": len(chunks),
                            },
                        )
                    )

            if len(sections) == 1:
                # Single-section file: also index the full text for recall.
                documents.append(
                    Document(
                        id=f"{doc_id_base}_full",
                        content=content.strip(),
                        source_type=self.source_type,
                        title=f"{title} (Full)",
                        file_path=str(file_path),
                        metadata={"section": "full_document"},
                    )
                )

        return documents

    def _extract_title(self, content: str, default: str) -> str:
        """Extract the title from Markdown content.

        The first level-1 ATX header ("# ...") outside a fenced code block
        is used; a "# comment" inside a code fence is not a title.

        Args:
            content: Markdown content
            default: Default title if none found

        Returns:
            Extracted title
        """
        in_code_block = False
        for line in content.split("\n"):
            line = line.strip()
            if line.startswith("```"):
                in_code_block = not in_code_block
                continue
            if not in_code_block and line.startswith("# "):
                return line[2:].strip()
        return default

    def _parse_sections(
        self, content: str
    ) -> List[Tuple[str, str, int]]:
        """Parse Markdown content into header-delimited sections.

        Headers inside fenced code blocks are ignored. Content before the
        first header is kept as a section with an empty title.

        Args:
            content: Markdown content

        Returns:
            List of (title, content, level) tuples
        """
        sections: List[Tuple[str, str, int]] = []
        current_title = ""
        current_level = 0
        current_lines: List[str] = []
        # True once at least one header was seen; lets us flush sections
        # whose body happens to be blank.
        saw_header = False

        in_code_block = False
        code_fence = "```"

        for line in content.split("\n"):
            if line.startswith(code_fence):
                in_code_block = not in_code_block
                # Fence lines (opening AND closing) belong to the section body.
                current_lines.append(line)
                continue

            if not in_code_block and line.startswith("#"):
                body = "\n".join(current_lines)
                # Fix: the previous guard tested an always-empty tuple slot,
                # so no section was ever emitted. Flush on accumulated state.
                if saw_header or body.strip():
                    sections.append((current_title, body, current_level))
                header = line.lstrip("#")
                current_level = len(line) - len(header)
                current_title = header.strip()
                current_lines = []
                saw_header = True
            else:
                current_lines.append(line)

        body = "\n".join(current_lines)
        if saw_header or body.strip():
            sections.append((current_title, body, current_level))

        return sections

    def _chunk_content(
        self, content: str, section_title: str, max_size: int
    ) -> List[str]:
        """Chunk content into smaller pieces at paragraph boundaries.

        Args:
            content: Section content
            section_title: Section title for context (currently unused,
                reserved for future chunk prefixing)
            max_size: Maximum chunk size

        Returns:
            List of content chunks. A single paragraph longer than
            max_size is kept intact rather than split mid-paragraph.
        """
        if len(content) <= max_size:
            return [content]

        chunks = []
        current_chunk = []
        current_size = 0

        for para in self._split_paragraphs(content):
            para_size = len(para)

            if current_size + para_size > max_size and current_chunk:
                chunks.append("\n\n".join(current_chunk))
                current_chunk = []
                current_size = 0

            current_chunk.append(para)
            current_size += para_size

        if current_chunk:
            chunks.append("\n\n".join(current_chunk))

        return chunks

    def _split_paragraphs(self, content: str) -> List[str]:
        """Split content into paragraphs on blank lines.

        Args:
            content: Section content

        Returns:
            List of paragraphs
        """
        paragraphs = []
        current_lines = []

        for line in content.split("\n"):
            if line.strip():
                current_lines.append(line)
            elif current_lines:
                paragraphs.append("\n".join(current_lines))
                current_lines = []

        if current_lines:
            paragraphs.append("\n".join(current_lines))

        return paragraphs

    def _is_supported_file(self, path: Path) -> bool:
        """Check if the file is a supported Markdown file.

        Args:
            path: Path to the file

        Returns:
            True if the file extension is supported
        """
        return path.suffix.lower() in self.SUPPORTED_EXTENSIONS

    def get_documents(self) -> List[Document]:
        """Get all indexed documents.

        Returns:
            List of Document objects from the last index() call
        """
        return self._documents
|
||||||
Reference in New Issue
Block a user