Initial upload: shell-history-semantic-search v0.1.0
Some checks failed
CI / test (push) Has been cancelled
Some checks failed
CI / test (push) Has been cancelled
This commit is contained in:
97
src/shell_history_search/core/embeddings.py
Normal file
97
src/shell_history_search/core/embeddings.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)


# Default SentenceTransformer model identifier used when callers do not
# specify one.
DEFAULT_MODEL_NAME = "all-MiniLM-L6-v2"
# Default on-disk location for downloaded model weights.  Can be overridden
# at runtime via the SHELL_HISTORY_MODEL_CACHE environment variable
# (see get_cache_dir()).
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "shell_history_search" / "models"
|
def get_cache_dir() -> Path:
    """Return the directory where embedding models are cached.

    The SHELL_HISTORY_MODEL_CACHE environment variable takes precedence
    when set; otherwise DEFAULT_CACHE_DIR is used.
    """
    override = os.environ.get("SHELL_HISTORY_MODEL_CACHE")
    if override is not None:
        return Path(override)
    return Path(str(DEFAULT_CACHE_DIR))
|
class EmbeddingService:
    """Compute and (de)serialize sentence embeddings.

    The SentenceTransformer model is loaded lazily on first use so that
    constructing the service stays cheap (no download or model init).
    All embeddings produced by :meth:`encode` are L2-normalized float32.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL_NAME,
        cache_dir: Optional[Path] = None,
        device: Optional[str] = None,
    ):
        """Create the service without loading the model.

        Args:
            model_name: SentenceTransformer model identifier.
            cache_dir: Directory for cached model weights; defaults to
                get_cache_dir() (env-var override or package default).
            device: Device string passed to SentenceTransformer;
                defaults to "cpu".
        """
        self.model_name = model_name
        self.cache_dir = cache_dir or get_cache_dir()
        # Create the cache directory eagerly so the first model download
        # has somewhere to land.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.device = device or "cpu"
        # Both are populated by _load_model() on first access.
        self._model: Optional[SentenceTransformer] = None
        self._embedding_dim: Optional[int] = None

    def _load_model(self) -> SentenceTransformer:
        """Load and memoize the model; also records its embedding dimension."""
        if self._model is None:
            # Lazy %-style args avoid string formatting when INFO is disabled.
            logger.info("Loading embedding model: %s", self.model_name)
            self._model = SentenceTransformer(
                self.model_name,
                cache_folder=str(self.cache_dir),
                device=self.device,
            )
            self._embedding_dim = self._model.get_sentence_embedding_dimension()
            logger.info("Model loaded. Embedding dimension: %s", self._embedding_dim)
        return self._model

    @property
    def model(self) -> SentenceTransformer:
        """The underlying SentenceTransformer, loaded on first access."""
        return self._load_model()

    @property
    def embedding_dim(self) -> int:
        """Dimensionality of the embeddings (loads the model if needed)."""
        if self._embedding_dim is None:
            self._load_model()
        # _load_model() sets the dimension; this guard is for type-checkers.
        assert self._embedding_dim is not None
        return self._embedding_dim

    def encode(self, texts: list[str], batch_size: int = 32) -> np.ndarray:
        """Embed a batch of texts.

        Args:
            texts: Strings to embed.
            batch_size: Forwarded to SentenceTransformer.encode.

        Returns:
            A float32 array of L2-normalized embeddings.  For an empty
            input an empty float32 array is returned and the model is
            NOT loaded.
        """
        if not texts:
            # Keep the float32 contract even in the empty case (np.array([])
            # would default to float64), and avoid triggering a model load
            # just to learn the embedding dimension.
            return np.array([], dtype=np.float32)

        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )

        return embeddings.astype(np.float32)

    def encode_single(self, text: str) -> np.ndarray:
        """Embed a single text and return its 1-D embedding vector."""
        return self.encode([text])[0]

    @staticmethod
    def embedding_to_blob(embedding: np.ndarray) -> bytes:
        """Serialize an embedding to raw float32 bytes (native byte order)."""
        return embedding.astype(np.float32).tobytes()

    @staticmethod
    def blob_to_embedding(blob: bytes, dim: int) -> np.ndarray:
        """Deserialize float32 bytes into a 2-D ``(n, dim)`` array.

        NOTE(review): this always returns a 2-D array — round-tripping a
        single 1-D embedding through embedding_to_blob yields shape
        ``(1, dim)``, not the original 1-D shape; callers appear to rely
        on this — confirm before changing.
        """
        return np.frombuffer(blob, dtype=np.float32).reshape(-1, dim)

    @staticmethod
    def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
        """Cosine similarity between the first vectors of ``a`` and ``b``.

        1-D inputs are treated as single vectors; for 2-D inputs only the
        similarity between the first rows is returned.
        """
        if a.ndim == 1:
            a = a.reshape(1, -1)
        if b.ndim == 1:
            b = b.reshape(1, -1)

        a_norm = np.linalg.norm(a, axis=1, keepdims=True)
        b_norm = np.linalg.norm(b, axis=1, keepdims=True)

        # Small epsilon guards against division by zero for all-zero vectors.
        a_normalized = a / (a_norm + 1e-8)
        b_normalized = b / (b_norm + 1e-8)

        similarity = np.dot(a_normalized, b_normalized.T)

        return float(similarity[0, 0])
Reference in New Issue
Block a user