Source code for data

import json
import os
from typing import Any, Dict, List

from core import Collection

# --- Retrieval Zoo Mapping ---
# In a real scenario, these would point to Hugging Face datasets or S3 buckets.
ZOO_MAP = {
    "squad-v2": "https://huggingface.co/datasets/adityak74/embenx-zoo/resolve/main/squad-v2.parquet",
    "natural-questions": "https://huggingface.co/datasets/adityak74/embenx-zoo/resolve/main/nq.parquet",
    "ms-marco": "https://huggingface.co/datasets/adityak74/embenx-zoo/resolve/main/msmarco.parquet",
}
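
Because ZOO_MAP is a plain module-level dict, extra collections can be registered before calling load_from_zoo. A minimal sketch; the dataset name and URL below are placeholders, not real zoo entries:

# Register an additional collection at runtime (hypothetical URL, for illustration only).
ZOO_MAP["my-corpus"] = "https://example.com/my-corpus.parquet"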

def load_from_zoo(dataset_name: str, cache_dir: str = ".embenx_cache") -> Collection:
    """
    Download and load a pre-built collection from the Embenx Retrieval Zoo.
    """
    if dataset_name not in ZOO_MAP:
        raise ValueError(
            f"Dataset '{dataset_name}' not found in zoo. "
            f"Available: {list(ZOO_MAP.keys())}"
        )

    url = ZOO_MAP[dataset_name]
    os.makedirs(cache_dir, exist_ok=True)
    local_path = os.path.join(cache_dir, f"{dataset_name}.parquet")

    if not os.path.exists(local_path):
        import requests

        print(f"Downloading {dataset_name} from Embenx Zoo...")
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return Collection.from_parquet(local_path)

def list_zoo() -> List[str]:
    """List all available pre-built collections in the zoo."""
    return list(ZOO_MAP.keys())
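
A minimal usage sketch for the two zoo helpers. The import path is an assumption (the page title only says "data"), so adjust it to your install:

from embenx.data import list_zoo, load_from_zoo  # import path is an assumption

print(list_zoo())  # ['squad-v2', 'natural-questions', 'ms-marco']

# The first call downloads the parquet file into .embenx_cache/;
# later calls reuse the cached copy instead of re-downloading.
collection = load_from_zoo("squad-v2")
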
def load_documents(
    dataset_name: str,
    subset: str = "default",
    split: str = "train",
    max_docs: int = 100,
) -> List[Dict[str, Any]]:
    """
    Load documents from a local JSON/parquet file or a Hugging Face dataset.
    """
    if os.path.exists(dataset_name):
        # Local file path: dispatch on the file extension.
        if dataset_name.endswith(".json"):
            with open(dataset_name, "r") as f:
                data = json.load(f)
            docs = data if isinstance(data, list) else [data]
        elif dataset_name.endswith(".parquet"):
            import pandas as pd

            df = pd.read_parquet(dataset_name)
            docs = df.to_dict(orient="records")
        else:
            raise ValueError(f"Unsupported file format: {dataset_name}")
        return docs[:max_docs]

    # Hugging Face fallback: stream the dataset so only max_docs records are fetched.
    try:
        from datasets import load_dataset

        ds = load_dataset(dataset_name, subset, split=split, streaming=True)
        docs = []
        for i, doc in enumerate(ds):
            if i >= max_docs:
                break
            docs.append(doc)
        return docs
    except Exception as e:
        raise RuntimeError(f"Failed to load dataset {dataset_name}: {e}") from e
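
A usage sketch covering both branches of load_documents. The local filename is a placeholder; the Hugging Face example uses the public wikitext dataset, where `subset` must name a real config (the default value "default" is not a valid config for every dataset):

from embenx.data import load_documents  # import path is an assumption

# Local branch: reads the whole file, then truncates to max_docs.
docs = load_documents("corpus.json", max_docs=50)

# Hugging Face branch: streaming=True, so only max_docs records are pulled.
docs = load_documents("wikitext", subset="wikitext-2-raw-v1", split="train", max_docs=100)
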
def save_collection(collection: Collection, path: str):
    """
    Save a collection's vectors and metadata to disk.
    """
    collection.to_parquet(path)
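
save_collection and Collection.from_parquet round-trip through the same parquet format, so a saved collection can be restored without re-downloading. A sketch, assuming the same import paths as above:

from core import Collection
from embenx.data import load_from_zoo, save_collection  # import paths are assumptions

collection = load_from_zoo("ms-marco")
save_collection(collection, "msmarco_copy.parquet")
restored = Collection.from_parquet("msmarco_copy.parquet")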