File size: 1,570 Bytes
bdc5edd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
Populate the Supabase vector store with GAIA benchmark embeddings.

Prerequisites:
  - SUPABASE_URL and SUPABASE_SERVICE_KEY in .env
  - pgvector extension enabled in Supabase
  - gaia_documents table created with columns: content (text), metadata (jsonb), embedding (vector)
"""
import os
import json
from dotenv import load_dotenv
from supabase.client import Client, create_client
from sentence_transformers import SentenceTransformer
from gaia.utils import load_config

# Load variables from a local .env file (SUPABASE_URL / SUPABASE_SERVICE_KEY
# are expected there) into the process environment before anything reads them.
load_dotenv()

# Project configuration; only the keys read below are relied on by this script.
config = load_config()
data_path = config["data"]

# Supabase credentials come from the environment populated above.
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# Embedding model used to vectorise each question; both its name and its
# on-disk cache location are taken from the configuration.
embedding_model_name = config["vector_store"]["embedding_model_name"]
model_cache_folder = config["models"]["cache_folder"]
embeddings = SentenceTransformer(
    model_name_or_path=embedding_model_name,
    cache_folder=model_cache_folder,
)

# Read the JSONL dataset: one JSON object per line. The encoding is pinned to
# UTF-8 so decoding does not depend on the platform's locale default, and
# whitespace-only lines (e.g. a stray blank line at the end of the file) are
# skipped instead of crashing json.loads.
with open(data_path, "r", encoding="utf-8") as jsonl_file:
    records = [json.loads(line) for line in jsonl_file if line.strip()]

# Build one row per record with the three columns of the gaia_documents table:
# content, metadata and embedding. Only the "Question" text is embedded.
documents = []
for record in records:
    content = record["Question"]
    # .tolist() turns the encoder output into plain Python lists so the row
    # can be serialised in the insert request.
    embedding = embeddings.encode(content, normalize_embeddings=True).tolist()
    documents.append({
        "content": content,
        "metadata": {
            "source": "vector_search",
            "task_id": record["task_id"],
        },
        "embedding": embedding,
    })

# Send every prepared row to the gaia_documents table in a single request.
print(f"Inserting {len(documents)} documents into Supabase...")
try:
    # Only the remote call is guarded, so an unrelated failure is never
    # misreported as an insertion error.
    supabase.table("gaia_documents").insert(documents).execute()
except Exception as e:
    print("Error inserting data into Supabase:", e)
    # Exit with a non-zero status so shells, CI jobs and other callers can
    # tell that the load failed instead of seeing a "successful" run.
    raise SystemExit(1) from e
else:
    print("Done.")