Spaces:

Presidentlin
/

Aidan-Bench

Runtime error

App Files Files Community

Presidentlin committed on Aug 12, 2024

Commit

c9e00de

1 Parent(s): 0e9562f

x

Browse files

Files changed (4) hide show

__pycache__/main.cpython-310.pyc +0 -0
__pycache__/prompts.cpython-310.pyc +0 -0
app.py +31 -69
main.py +63 -77

__pycache__/main.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/main.cpython-310.pyc and b/__pycache__/main.cpython-310.pyc differ

__pycache__/prompts.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/prompts.cpython-310.pyc and b/__pycache__/prompts.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -1,10 +1,7 @@
 import streamlit as st
-from main import get_novelty_score
-from models import chat_with_model, embed
-from prompts import questions as predefined_questions, create_gen_prompt, create_judge_prompt
 import requests
-import numpy as np
-import os
 # Set the title in the browser tab
 st.set_page_config(page_title="Aidan Bench - Generator")
@@ -95,86 +92,51 @@ if st.session_state.open_router_key and st.session_state.openai_api_key:
     # Display selected questions
     st.write("Selected Questions:", selected_questions)
     # Benchmark Execution
     if st.button("Start Benchmark"):
         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
-            # Initialize progress bar
             progress_bar = st.progress(0)
             num_questions = len(selected_questions)
-            results = []  # List to store results
-            # Iterate through selected questions
             for i, question in enumerate(selected_questions):
                 # Display current question
                 st.write(f"Processing question {i+1}/{num_questions}: {question}")
-                previous_answers = []
-                question_novelty = 0
-                try:
-                    while True:
-                        gen_prompt = create_gen_prompt(question, previous_answers)
-                        try:
-                            new_answer = chat_with_model(
-                                prompt=gen_prompt,
-                                model=model_name,
-                                open_router_key=st.session_state.open_router_key,
-                                openai_api_key=st.session_state.openai_api_key
-                            )
-                        except requests.exceptions.RequestException as e:
-                            st.error(f"API Error: {e}")
-                            break
-                        judge_prompt = create_judge_prompt(question, new_answer)
-                        judge = "openai/gpt-4o-mini"
-                        try:
-                            judge_response = chat_with_model(
-                                prompt=judge_prompt,
-                                model=judge,
-                                open_router_key=st.session_state.open_router_key,
-                                openai_api_key=st.session_state.openai_api_key
-                            )
-                        except requests.exceptions.RequestException as e:
-                            st.error(f"API Error (Judge): {e}")
-                            break
-                        coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
-                        if coherence_score <= 3:
-                            st.warning("Output is incoherent. Moving to next question.")
-                            break
-                        novelty_score = get_novelty_score(new_answer, previous_answers, st.session_state.openai_api_key)
-                        if novelty_score < 0.1:
-                            st.warning("Output is redundant. Moving to next question.")
-                            break
-                        st.write(f"New Answer:\n{new_answer}")
-                        st.write(f"Coherence Score: {coherence_score}")
-                        st.write(f"Novelty Score: {novelty_score}")
-                        previous_answers.append(new_answer)
-                        question_novelty += novelty_score
-                except Exception as e:
-                    st.error(f"Error processing question: {e}")
-                results.append({
-                    "question": question,
-                    "answers": previous_answers,
-                    "coherence_score": coherence_score,
-                    "novelty_score": novelty_score
-                })
                 # Update progress bar
                 progress_bar.progress((i + 1) / num_questions)
-            st.success("Benchmark completed!")
             # Display results in a table
             st.write("Results:")

 import streamlit as st
+from main import benchmark_model_multithreaded, benchmark_model_sequential
+from prompts import questions as predefined_questions
 import requests
 # Set the title in the browser tab
 st.set_page_config(page_title="Aidan Bench - Generator")
     # Display selected questions
     st.write("Selected Questions:", selected_questions)
+    # Choose execution mode
+    execution_mode = st.radio("Execution Mode:", ["Sequential", "Multithreaded"])
     # Benchmark Execution
     if st.button("Start Benchmark"):
         if not selected_questions:
             st.warning("Please select at least one question.")
         else:
+            # Initialize progress bar
             progress_bar = st.progress(0)
             num_questions = len(selected_questions)
+            results = []
+            # Stop button
+            stop_button = st.button("Stop Benchmark")
+            # Benchmarking loop
             for i, question in enumerate(selected_questions):
                 # Display current question
                 st.write(f"Processing question {i+1}/{num_questions}: {question}")
+                # ... (benchmarking logic using the chosen execution mode)
+                if execution_mode == "Sequential":
+                    question_results = benchmark_model_sequential(model_name, [question], st.session_state.open_router_key, st.session_state.openai_api_key)
+                else:  # Multithreaded
+                    question_results = benchmark_model_multithreaded(model_name, [question], st.session_state.open_router_key, st.session_state.openai_api_key)
+                results.extend(question_results)
                 # Update progress bar
                 progress_bar.progress((i + 1) / num_questions)
+                # Check if stop button is clicked
+                if stop_button:
+                    st.warning("Benchmark stopped!")
+                    break  # Exit the loop
+            # Display results (even if interrupted)
+            st.write("Results:")
+            # ... (table generation logic - Same as before)
+            if stop_button:
+                st.warning("Partial results displayed due to interruption.")
+            else:
+                st.success("Benchmark completed!")
             # Display results in a table
             st.write("Results:")

main.py CHANGED Viewed

@@ -1,30 +1,14 @@
 import numpy as np
 from models import chat_with_model, embed
-from prompts import questions, create_gen_prompt, create_judge_prompt
-from colorama import Fore, Style
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import threading
-import argparse
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="Benchmark a language model.")
-    parser.add_argument("model_name", type=str, help="Name of the model to benchmark")
-    parser.add_argument("--single-threaded", action="store_true", help="Run in single-threaded mode")
-    return parser.parse_args()
-def benchmark_model(model_name, multithreaded=False):
-    if multithreaded:
-        return benchmark_model_multithreaded(model_name)
-    else:
-        return benchmark_model_sequential(model_name)
-def process_question(question, model_name):
     start_time = time.time()
-    print(Fore.RED + question + Style.RESET_ALL)
     previous_answers = []
     question_novelty = 0
@@ -32,110 +16,112 @@ def process_question(question, model_name):
         while True:
             gen_prompt = create_gen_prompt(question, previous_answers)
             try:
-                new_answer = chat_with_model(prompt=gen_prompt, model=model_name)
             except Exception as e:
-                print(Fore.RED + f"Error generating answer: {str(e)}" + Style.RESET_ALL)
                 break
             judge_prompt = create_judge_prompt(question, new_answer)
             judge = "openai/gpt-4o-mini"
             try:
-                judge_response = chat_with_model(prompt=judge_prompt, model=judge)
             except Exception as e:
-                print(Fore.RED + f"Error getting judge response: {str(e)}" + Style.RESET_ALL)
                 break
-            coherence_score = int(judge_response.split("<coherence_score>")[
-                                1].split("</coherence_score>")[0])
             if coherence_score <= 3:
-                print(
-                    Fore.YELLOW + "Output is incoherent. Moving to next question." + Style.RESET_ALL)
                 break
-            novelty_score = get_novelty_score(new_answer, previous_answers)
             if novelty_score < 0.1:
-                print(
-                    Fore.YELLOW + "Output is redundant. Moving to next question." + Style.RESET_ALL)
                 break
-            print(f"New Answer:\n{new_answer}")
-            print(Fore.GREEN + f"Coherence Score: {coherence_score}")
-            print(f"Novelty Score: {novelty_score}" + Style.RESET_ALL)
             previous_answers.append(new_answer)
             question_novelty += novelty_score
     except Exception as e:
-        print(Fore.RED + f"Unexpected error processing question: {str(e)}" + Style.RESET_ALL)
     time_taken = time.time() - start_time
-    print(Fore.BLUE)
-    print(f"Total novelty score for this question: {question_novelty}")
-    print(f"Time taken: {time_taken} seconds")
-    print(Style.RESET_ALL)
-    return question_novelty
-def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key=None):
-        new_embedding = embed(new_answer, openai_api_key)
-        # If there are no previous answers, return maximum novelty
-        if not previous_answers:
-            return 1.0
-        previous_embeddings = [embed(answer, openai_api_key) for answer in previous_answers]
-        similarities = [
-            np.dot(new_embedding, prev_embedding) /
-            (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
-            for prev_embedding in previous_embeddings
-        ]
-        max_similarity = max(similarities)
-        novelty = 1 - max_similarity
-        return novelty
-def benchmark_model_multithreaded(model_name):
     novelty_score = 0
-    print_lock = threading.Lock()
     with ThreadPoolExecutor(max_workers=len(questions)) as executor:
         future_to_question = {executor.submit(
-            process_question, question, model_name): question for question in questions}
         for future in as_completed(future_to_question):
             question = future_to_question[future]
-            question_novelty = future.result()
-            with print_lock:
-                novelty_score += question_novelty
-    print(Fore.YELLOW)
-    print(f"Total novelty score across all questions: {novelty_score}")
-    print(Style.RESET_ALL)
-    return novelty_score
-def benchmark_model_sequential(model_name):
     novelty_score = 0
-    for question in questions:
-        question_novelty = process_question(question, model_name)
         novelty_score += question_novelty
-    print(Fore.YELLOW)
-    print(f"Total novelty score across all questions: {novelty_score}")
-    print(Style.RESET_ALL)
-    return novelty_score
-if __name__ == "__main__":
-    args = parse_arguments()
-    benchmark_model(args.model_name, multithreaded=not args.single_threaded)

 import numpy as np
 from models import chat_with_model, embed
+from prompts import create_gen_prompt, create_judge_prompt
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import threading
+import streamlit as st  # Import Streamlit
+def process_question(question, model_name, open_router_key, openai_api_key):
     start_time = time.time()
+    st.write(f"<span style='color:red'>{question}</span>", unsafe_allow_html=True)  # Display question in red
     previous_answers = []
     question_novelty = 0
         while True:
             gen_prompt = create_gen_prompt(question, previous_answers)
             try:
+                new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key, openai_api_key=openai_api_key)
             except Exception as e:
+                st.write(f"<span style='color:red'>Error generating answer: {str(e)}</span>", unsafe_allow_html=True)  # Display error in red
                 break
             judge_prompt = create_judge_prompt(question, new_answer)
             judge = "openai/gpt-4o-mini"
             try:
+                judge_response = chat_with_model(prompt=judge_prompt, model=judge, open_router_key=open_router_key, openai_api_key=openai_api_key)
             except Exception as e:
+                st.write(f"<span style='color:red'>Error getting judge response: {str(e)}</span>", unsafe_allow_html=True)  # Display error in red
                 break
+            coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
             if coherence_score <= 3:
+                st.write("<span style='color:yellow'>Output is incoherent. Moving to next question.</span>", unsafe_allow_html=True)  # Display warning in yellow
                 break
+            novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
             if novelty_score < 0.1:
+                st.write("<span style='color:yellow'>Output is redundant. Moving to next question.</span>", unsafe_allow_html=True)  # Display warning in yellow
                 break
+            st.write(f"**New Answer:**\n{new_answer}")
+            st.write(f"<span style='color:green'>Coherence Score: {coherence_score}</span>", unsafe_allow_html=True)  # Display coherence score in green
+            st.write(f"**Novelty Score:** {novelty_score}")
             previous_answers.append(new_answer)
             question_novelty += novelty_score
     except Exception as e:
+        st.write(f"<span style='color:red'>Unexpected error processing question: {str(e)}</span>", unsafe_allow_html=True)  # Display error in red
     time_taken = time.time() - start_time
+    st.write(f"<span style='color:blue'>Total novelty score for this question: {question_novelty}</span>", unsafe_allow_html=True)  # Display novelty score in blue
+    st.write(f"<span style='color:blue'>Time taken: {time_taken} seconds</span>", unsafe_allow_html=True)  # Display time taken in blue
+    return question_novelty, [
+        {
+            "question": question,
+            "answers": previous_answers,
+            "coherence_score": coherence_score,
+            "novelty_score": question_novelty
+        }
+    ]
def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
    """Score how different ``new_answer`` is from the earlier answers.

    The score is ``1 - s`` where ``s`` is the highest cosine similarity
    between the embedding of ``new_answer`` and the embedding of each entry
    in ``previous_answers``. Embeddings come from ``embed``.

    Args:
        new_answer: Candidate answer text.
        previous_answers: Answers already collected for the same question.
        openai_api_key: Passed through unchanged to ``embed``.

    Returns:
        ``1.0`` when ``previous_answers`` is empty, otherwise one minus the
        maximum cosine similarity.
    """
    # Nothing to compare against means maximum novelty. This is checked
    # before embedding so the first answer of a question does not trigger an
    # embed() call whose result would be discarded.
    if not previous_answers:
        return 1.0

    new_embedding = embed(new_answer, openai_api_key)
    # The norm of the new embedding is the same for every comparison.
    new_norm = np.linalg.norm(new_embedding)

    similarities = []
    for answer in previous_answers:
        prev_embedding = embed(answer, openai_api_key)
        similarities.append(
            np.dot(new_embedding, prev_embedding)
            / (new_norm * np.linalg.norm(prev_embedding))
        )

    return 1 - max(similarities)
def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key):
    """Run ``process_question`` for every question on a thread pool.

    One worker is started per question. Each worker is expected to return a
    ``(question_novelty, question_results)`` pair; the novelty values are
    summed for the progress messages and the result entries are concatenated
    in completion order.

    Args:
        model_name: Forwarded to ``process_question``.
        questions: Sized collection of questions to process.
        open_router_key: Forwarded to ``process_question``.
        openai_api_key: Forwarded to ``process_question``.

    Returns:
        List with the result entries of every question that completed without
        raising. An empty ``questions`` yields ``[]`` and writes no output.
    """
    # ThreadPoolExecutor raises ValueError for max_workers=0, so an empty
    # selection is answered directly instead of crashing.
    if not questions:
        return []

    novelty_score = 0
    results = []

    # NOTE(review): process_question calls st.write from the worker threads;
    # confirm Streamlit has a script-run context there, otherwise that output
    # may not reach the page.
    with ThreadPoolExecutor(max_workers=len(questions)) as executor:
        futures = [
            executor.submit(process_question, question, model_name, open_router_key, openai_api_key)
            for question in questions
        ]
        # Completed futures are consumed on the calling thread only, so the
        # running total and the result list need no lock.
        for future in as_completed(futures):
            try:
                question_novelty, question_results = future.result()
            except Exception as e:
                # Best effort: one failing question must not discard the rest.
                st.write(f"<span style='color:red'>Error in thread: {str(e)}</span>", unsafe_allow_html=True)
                continue
            novelty_score += question_novelty
            results.extend(question_results)
            st.write(f"<span style='color:yellow'>Total novelty score across all questions (so far): {novelty_score}</span>", unsafe_allow_html=True)

    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
    return results
def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key):
    """Process the questions one after another on the calling thread.

    Each question is handed to ``process_question``; its novelty value is
    added to a running total, which is written out after every question and
    once more at the end.

    Args:
        model_name: Forwarded to ``process_question``.
        questions: Iterable of questions, handled in iteration order.
        open_router_key: Forwarded to ``process_question``.
        openai_api_key: Forwarded to ``process_question``.

    Returns:
        List with the result entries of all questions, in processing order.
    """
    results = []
    novelty_score = 0
    for question in questions:
        outcome = process_question(question, model_name, open_router_key, openai_api_key)
        question_novelty, question_results = outcome
        novelty_score += question_novelty
        results.extend(question_results)
        # Progress line after each processed question.
        st.write(f"<span style='color:yellow'>Total novelty score across processed questions: {novelty_score}</span>", unsafe_allow_html=True)
    st.write(f"<span style='color:yellow'>Final total novelty score across all questions: {novelty_score}</span>", unsafe_allow_html=True)
    return results