Source code for indexers.bm25_indexer

from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from rank_bm25 import BM25Okapi

from .base import BaseIndexer


class BM25Indexer(BaseIndexer):
    """Sparse retrieval indexer using the BM25 (Okapi) ranking function.

    Documents are indexed from the ``"text"`` field of their metadata
    dictionaries. Embeddings are accepted by ``build_index`` only so the
    signature matches other indexers; they are never read.
    """

    def __init__(self, dimension: int = 0):
        """Create an empty indexer.

        Args:
            dimension: Forwarded to ``BaseIndexer``. BM25 itself does not
                need it; the parameter is kept for API consistency.
        """
        super().__init__("BM25", dimension)
        # Built lazily by build_index(); None means "nothing indexed".
        self.bm25: Optional[BM25Okapi] = None
        # metadata[i] describes the document whose tokens are corpus_tokens[i].
        self.metadata: List[Dict[str, Any]] = []
        self.corpus_tokens: List[List[str]] = []

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it on whitespace."""
        return text.lower().split()

    @staticmethod
    def _document_text(entry: Dict[str, Any]) -> str:
        """Return the indexable text of one metadata entry as a string."""
        text = entry.get("text")
        if text is None:
            # Missing or explicitly-null text is indexed as an empty document
            # so that row positions stay aligned with the metadata list.
            return ""
        return text if isinstance(text, str) else str(text)

    def build_index(
        self,
        embeddings: List[List[float]],
        metadata: List[Dict[str, Any]],
    ) -> None:
        """Build the BM25 index from the ``"text"`` field of each entry.

        Any previously built index is replaced. Passing an empty
        ``metadata`` list leaves the indexer empty.

        Args:
            embeddings: Unused; present for interface compatibility.
            metadata: One dictionary per document. The value under
                ``"text"`` is tokenized; a missing or ``None`` value is
                treated as an empty document.
        """
        # Shallow-copy so that later mutation of the caller's list cannot
        # shift row indices relative to the BM25 corpus.
        self.metadata = list(metadata)
        self.corpus_tokens = [
            self._tokenize(self._document_text(entry)) for entry in self.metadata
        ]
        # Reset explicitly on an empty corpus: keeping an index from an
        # earlier build would score against documents that no longer match
        # self.metadata.
        self.bm25 = BM25Okapi(self.corpus_tokens) if self.corpus_tokens else None

    def search(
        self, query_text: Any, top_k: int = 5
    ) -> List[Tuple[Dict[str, Any], float]]:
        """Return the best-scoring documents for a text query.

        Args:
            query_text: The query string. Any non-string value (for example
                an embedding vector) yields no results, because BM25 needs
                the original text.
            top_k: Maximum number of candidates to consider. Values less
                than or equal to zero yield no results.

        Returns:
            ``(metadata, score)`` pairs in descending score order. Only
            candidates with a strictly positive score are included, so the
            list may be shorter than ``top_k``.
        """
        if self.bm25 is None or not isinstance(query_text, str):
            return []
        # A negative top_k would otherwise slice "all but the last k" hits.
        if top_k is not None and top_k <= 0:
            return []

        tokenized_query = self._tokenize(query_text)
        if not tokenized_query:
            # No query terms means every score is zero; skip the scoring pass.
            return []

        scores = self.bm25.get_scores(tokenized_query)
        ranked = np.argsort(scores)[::-1][:top_k]
        return [
            (self.metadata[i], float(scores[i])) for i in ranked if scores[i] > 0
        ]

    def get_size(self) -> int:
        """Return a rough estimate, in bytes, of the memory held by the index.

        The estimate covers the token lists, the token strings and the
        metadata dictionaries themselves. Values stored inside the metadata
        dictionaries and the internal tables of the BM25 model are not
        counted. The cost is linear in the number of stored tokens.
        """
        import sys

        getsizeof = sys.getsizeof
        total = getsizeof(self.corpus_tokens) + getsizeof(self.metadata)
        for tokens in self.corpus_tokens:
            total += getsizeof(tokens)
            total += sum(getsizeof(token) for token in tokens)
        total += sum(getsizeof(entry) for entry in self.metadata)
        return total

    def cleanup(self) -> None:
        """Drop the index and all stored documents."""
        self.bm25 = None
        self.corpus_tokens = []
        self.metadata = []