Source code for indexers.bm25_indexer
from typing import Any, Dict, List, Optional, Tuple
import numpy as np
from rank_bm25 import BM25Okapi
from .base import BaseIndexer
[docs]
class BM25Indexer(BaseIndexer):
    """
    Sparse retrieval indexer using the BM25 algorithm.

    Documents are indexed from the 'text' field of their metadata
    dictionaries. Embeddings are accepted only to keep the signature in line
    with the other indexers; this class never reads them.
    """

    def __init__(self, dimension: int = 0):
        # Dimension is not strictly needed for BM25 but kept for API consistency
        super().__init__("BM25", dimension)
        # None until build_index() has seen at least one non-empty document.
        self.bm25: Optional[BM25Okapi] = None
        # metadata[i] and corpus_tokens[i] describe the same document.
        self.metadata: List[Dict[str, Any]] = []
        self.corpus_tokens: List[List[str]] = []

    def _tokenize(self, text: str) -> List[str]:
        """Lowercase ``text`` and split it on whitespace."""
        return text.lower().split()

    def _tokens_for(self, entry: Dict[str, Any]) -> List[str]:
        """Return the tokens of ``entry['text']``, or ``[]`` if it is missing or not a string."""
        text = entry.get("text")
        return self._tokenize(text) if isinstance(text, str) else []

    def build_index(self, embeddings: List[List[float]], metadata: List[Dict[str, Any]]) -> None:
        """
        Build the BM25 index, replacing any previously built one.

        Args:
            embeddings: Ignored; present for API consistency.
            metadata: One dictionary per document. The 'text' field is
                tokenized and indexed; a missing or non-string 'text' is
                treated as an empty document so positions stay aligned.
        """
        self.metadata = metadata
        self.corpus_tokens = [self._tokens_for(entry) for entry in metadata]
        if any(self.corpus_tokens):
            self.bm25 = BM25Okapi(self.corpus_tokens)
        else:
            # Nothing searchable: drop any earlier model so search() cannot
            # score a stale corpus and then index into the new metadata.
            self.bm25 = None

    def search(self, query_text: Any, top_k: int = 5) -> List[Tuple[Dict[str, Any], float]]:
        """
        Search using BM25.

        Args:
            query_text: The query string. Any non-string value (e.g. a
                vector) is ignored and yields no results.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            ``(metadata, score)`` pairs ordered by descending score,
            restricted to documents whose score is strictly positive.
        """
        if self.bm25 is None or top_k <= 0:
            return []
        if not isinstance(query_text, str):
            # If a vector is passed, we can't do much without the original text.
            # In a hybrid search context, the Collection will pass the text.
            return []
        tokenized_query = self._tokenize(query_text)
        if not tokenized_query:
            return []
        scores = self.bm25.get_scores(tokenized_query)
        # argsort is ascending; reverse it, then keep the best top_k positions.
        ranked = np.argsort(scores)[::-1][:top_k]
        return [
            (self.metadata[i], float(scores[i]))
            for i in ranked
            if scores[i] > 0
        ]

    def get_size(self) -> int:
        """
        Return a very rough estimate of memory usage in bytes.

        Only the two outer list objects are measured (``sys.getsizeof`` is
        shallow), so the contained tokens and metadata dictionaries are not
        included.
        """
        import sys

        return sys.getsizeof(self.corpus_tokens) + sys.getsizeof(self.metadata)

    def cleanup(self) -> None:
        """Drop the BM25 model and all stored tokens and metadata."""
        self.bm25 = None
        self.corpus_tokens = []
        self.metadata = []