# Agents_Course_Assignment / create_vector_database.py
# KPatelis's picture
# Upload 15 files
# 2834b30 verified
"""
Populate the Supabase vector store with GAIA benchmark embeddings.
Prerequisites:
- SUPABASE_URL and SUPABASE_SERVICE_KEY in .env
- pgvector extension enabled in Supabase
- gaia_documents table created with columns: content (text), metadata (jsonb), embedding (vector)
"""
import os
import json
from dotenv import load_dotenv
from supabase.client import Client, create_client
from sentence_transformers import SentenceTransformer
from utils import load_config
# Load variables from the local .env file into the process environment so the
# os.getenv() lookups below can see them.
load_dotenv()

# Project configuration; this script reads its "data", "vector_store" and
# "models" entries.
config = load_config()
data_path = config["data"]

supabase_url = os.getenv("SUPABASE_URL")
supabase_key = os.getenv("SUPABASE_SERVICE_KEY")

# os.getenv() returns None for an unset variable. Stop here with a message
# that names the missing variable(s) instead of handing None to
# create_client().
missing_vars = [
    name
    for name, value in (
        ("SUPABASE_URL", supabase_url),
        ("SUPABASE_SERVICE_KEY", supabase_key),
    )
    if not value
]
if missing_vars:
    raise SystemExit(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

supabase: Client = create_client(supabase_url, supabase_key)

# Sentence-embedding model used to vectorize each document's content.
embeddings = SentenceTransformer(
    model_name_or_path=config["vector_store"]["embedding_model_name"],
    cache_folder=config["models"]["cache_folder"],
)
# Read the JSONL dataset: one JSON object per line.
# - Explicit UTF-8 so decoding does not depend on the platform's default
#   locale encoding.
# - Blank lines (e.g. a trailing newline at end of file) are skipped instead
#   of crashing json.loads().
with open(data_path, "r", encoding="utf-8") as jsonl_file:
    records = [json.loads(line) for line in jsonl_file if line.strip()]

# The text that gets embedded and stored is each record's "Question" field.
questions = [record["Question"] for record in records]

# Encode every question in a single call so the model can batch the work,
# rather than invoking it once per record. With a list input, encode()
# yields one vector per question, in input order.
vectors = (
    embeddings.encode(questions, normalize_embeddings=True) if questions else []
)

# Rows shaped for the gaia_documents table: content, metadata, embedding.
documents = [
    {
        "content": question,
        "metadata": {
            "source": "vector_search",
            "task_id": record["task_id"],
        },
        # .tolist() converts the vector into plain Python floats so the row
        # can be serialized for the insert request.
        "embedding": vector.tolist(),
    }
    for record, question, vector in zip(records, questions, vectors)
]
# Insert all prepared rows into the gaia_documents table in one request.
if not documents:
    # Nothing was loaded from the dataset; skip the API call entirely.
    print("No documents to insert; nothing to do.")
else:
    print(f"Inserting {len(documents)} documents into Supabase...")
    try:
        supabase.table("gaia_documents").insert(documents).execute()
    except Exception as e:
        # Top-level boundary of the script: report the failure, then exit
        # with a non-zero status so callers (shell, CI) can detect it.
        print("Error inserting data into Supabase:", e)
        raise SystemExit(1) from e
    else:
        print("Done.")