import os
import json
import chromadb
import gdown
import pandas as pd
from llama_index.core import VectorStoreIndex, StorageContext, Document
from llama_index.vector_stores.chroma import ChromaVectorStore
from dotenv import load_dotenv

load_dotenv()  # Load OPENAI_API_KEY from .env (not included in repo)

# Module-level cache so the dataset and index are only built once per process
data = None


def get_data(download=False):
    global data
    if data is None:
        data = Data(download)
    return data


class Data:
    def __init__(self, download=False):
        print("Initializing Data...")
        print(f"Download: {download}")
        self.client = None
        self.collection = None
        self.index = None
        if download:
            self.download_data()
        self.load_data()

    def download_data(self):
        # Download the already indexed data
        if not os.path.exists("./chroma_db"):
            try:
                print("Downloading data...")
                file_id = "1JvYQ9E5zDBKRCUKkxejDvp7UGwzxDAUW"
                url = f"https://drive.google.com/uc?export=download&id={file_id}"
                output = "chroma_db.zip"
                gdown.download(url, output, quiet=False)
                print("Unzipping data...")
                os.system("unzip chroma_db.zip")
            except Exception as e:
                print(f"Error downloading data: {e}")
        return os.path.exists("./chroma_db")

    def load_data(self):
        print("Loading data...")
        # Parse the SQuAD-style JSON (articles -> paragraphs -> questions -> answers)
        # into one text document per question plus a flat DataFrame of raw rows.
        with open('data/train-v1.1.json', 'r') as f:
            raw_data = json.load(f)
        raw_documents = []
        documents = []
        for article in raw_data['data']:
            title = article['title']
            for par in article['paragraphs']:
                context = par['context']
                for qa in par['qas']:
                    question = qa['question']
                    answers = []
                    for ans in qa['answers']:
                        if ans['text'] not in answers:
                            answers.append(ans['text'])
                    for answer in answers:
                        raw_documents.append([title, context, question, answer])
                    doc = f"""
                    Title: {title}
                    Context: {context}
                    Question: {question}
                    Acceptable Answers:
                    {[f"{i+1}. {ans}" for i, ans in enumerate(answers)]}
                    """
                    # Remove padding on each line
                    doc = "\n".join([line.strip() for line in doc.split("\n")])
                    documents.append(doc)
        self.df = pd.DataFrame(raw_documents, columns=["Title", "Context", "Question", "Answer"])
        self.documents = [Document(text=t) for t in documents]
        print("Raw Data loaded")

        if not os.path.exists("./chroma_db"):
            # Attempt to generate an index from the raw data
            print("Creating Chroma DB...")
            # initialize client, setting path to save data
            self.client = chromadb.PersistentClient(path="./chroma_db")
            # create collection
            self.collection = self.client.get_or_create_collection("simple_index")
            # assign chroma as the vector_store to the context
            vector_store = ChromaVectorStore(chroma_collection=self.collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # create your index
            self.index = VectorStoreIndex.from_documents(
                self.documents, storage_context=storage_context
            )
            print("Chroma DB created")
        else:
            print("Chroma DB already exists")
            print("Loading index...")
            # initialize client
            self.client = chromadb.PersistentClient(path="./chroma_db")
            # get collection
            self.collection = self.client.get_or_create_collection("simple_index")
            # assign chroma as the vector_store to the context
            vector_store = ChromaVectorStore(chroma_collection=self.collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # load your index from stored vectors
            self.index = VectorStoreIndex.from_vector_store(
                vector_store, storage_context=storage_context
            )
            print("Index loaded")