Spaces:

NLP-Debater-Project
/

FastAPI-Backend-Models

Running

App Files Files Community

Yassine Mhirsi committed 2 days ago

Commit

22ad0ba

1 Parent(s): e97ac87

similarity

Browse files

Files changed (4) hide show

routes/topic.py +20 -19
services/topic_service.py +1 -1
topic_similarity_google_example.py +182 -0
topic_similarity_langchain_example.py +54 -0

routes/topic.py CHANGED Viewed

@@ -4,7 +4,7 @@ from fastapi import APIRouter, HTTPException
 from datetime import datetime
 import logging
-from services.topic_service import topic_service
 from models.topic import (
     TopicRequest,
     TopicResponse,
@@ -19,15 +19,16 @@ logger = logging.getLogger(__name__)
 @router.post("/extract", response_model=TopicResponse, tags=["Topic Extraction"])
 async def extract_topic(request: TopicRequest):
     """
-    Extract a topic from a given text/argument
-    - **text**: The input text or argument to extract topic from (5-5000 chars)
-    Returns the extracted topic description
     """
     try:
-        # Extract topic
-        topic = topic_service.extract_topic(request.text)
         # Build response
         response = TopicResponse(
@@ -36,29 +37,29 @@ async def extract_topic(request: TopicRequest):
             timestamp=datetime.now().isoformat()
         )
-        logger.info(f"Topic extracted: {topic[:50]}...")
         return response
     except ValueError as e:
         logger.error(f"Validation error: {str(e)}")
         raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
-        logger.error(f"Topic extraction error: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Topic extraction failed: {str(e)}")
 @router.post("/batch-extract", response_model=BatchTopicResponse, tags=["Topic Extraction"])
 async def batch_extract_topics(request: BatchTopicRequest):
     """
-    Extract topics from multiple texts/arguments
-    - **texts**: List of texts to extract topics from (max 50)
-    Returns extracted topics for all texts
     """
     try:
-        # Batch extract topics
-        topics = topic_service.batch_extract_topics(request.texts)
         # Build response
         results = []
@@ -74,10 +75,10 @@ async def batch_extract_topics(request: BatchTopicRequest):
                     )
                 )
             else:
-                # Skip failed extractions or handle as needed
-                logger.warning(f"Failed to extract topic for text at index {i}")
-        logger.info(f"Batch topic extraction completed: {len(results)}/{len(request.texts)} successful")
         return BatchTopicResponse(
             results=results,
@@ -89,6 +90,6 @@ async def batch_extract_topics(request: BatchTopicRequest):
         logger.error(f"Validation error: {str(e)}")
         raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
-        logger.error(f"Batch topic extraction error: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Batch topic extraction failed: {str(e)}")

 from datetime import datetime
 import logging
+from services.topic_similarity_service import topic_similarity_service
 from models.topic import (
     TopicRequest,
     TopicResponse,
 @router.post("/extract", response_model=TopicResponse, tags=["Topic Extraction"])
 async def extract_topic(request: TopicRequest):
     """
+    Find the most similar topic from predefined topics for a given text/argument
+    - **text**: The input text or argument to find similar topic for (5-5000 chars)
+    Returns the most similar topic from the predefined list
     """
     try:
+        # Find most similar topic
+        result = topic_similarity_service.find_most_similar_topic(request.text)
+        topic = result["topic"]
         # Build response
         response = TopicResponse(
             timestamp=datetime.now().isoformat()
         )
+        logger.info(f"Most similar topic found: {topic[:50]}... (similarity: {result['similarity']:.4f})")
         return response
     except ValueError as e:
         logger.error(f"Validation error: {str(e)}")
         raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
+        logger.error(f"Topic similarity error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Topic similarity search failed: {str(e)}")
 @router.post("/batch-extract", response_model=BatchTopicResponse, tags=["Topic Extraction"])
 async def batch_extract_topics(request: BatchTopicRequest):
     """
+    Find the most similar topics from predefined topics for multiple texts/arguments
+    - **texts**: List of texts to find similar topics for (max 50)
+    Returns the most similar topics from the predefined list for all texts
     """
     try:
+        # Batch find similar topics
+        topics = topic_similarity_service.batch_find_similar_topics(request.texts)
         # Build response
         results = []
                     )
                 )
             else:
+                # Skip failed searches or handle as needed
+                logger.warning(f"Failed to find similar topic for text at index {i}")
+        logger.info(f"Batch topic similarity search completed: {len(results)}/{len(request.texts)} successful")
         return BatchTopicResponse(
             results=results,
         logger.error(f"Validation error: {str(e)}")
         raise HTTPException(status_code=400, detail=str(e))
     except Exception as e:
+        logger.error(f"Batch topic similarity error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Batch topic similarity search failed: {str(e)}")

services/topic_service.py CHANGED Viewed

@@ -22,7 +22,7 @@ class TopicService:
     def __init__(self):
         self.llm = None
-        self.model_name = "openai/gpt-oss-safeguard-120b"  # another model meta-llama/llama-4-scout-17b-16e-instruct
         self.initialized = False
     def initialize(self, model_name: Optional[str] = None):

     def __init__(self):
         self.llm = None
+        self.model_name = "openai/gpt-oss-safeguard-20b"  # another model meta-llama/llama-4-scout-17b-16e-instruct
         self.initialized = False
     def initialize(self, model_name: Optional[str] = None):

topic_similarity_google_example.py ADDED Viewed

	@@ -0,0 +1,182 @@

+from datetime import datetime
+import os
+import json
+import hashlib
+from pathlib import Path
+from dotenv import load_dotenv
+from google import genai
+from google.genai import types
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
# Pull settings from a local .env file into the process environment.
load_dotenv()

# The embedding client below cannot be created without a key, so a missing
# key aborts the import with a ValueError.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY is not set in environment variables.")

# Both data files sit in a "data" directory under the parent of the directory
# that contains this file.
# NOTE(review): confirm that location matches where this example file is
# actually placed in the repository.
TOPICS_FILE = Path(__file__).parent.parent / "data" / "topics.json"
# Cache of previously computed topic embeddings, stored next to topics.json.
EMBEDDINGS_CACHE_FILE = TOPICS_FILE.with_name("topic_embeddings_cache.json")

# Single shared Google Generative AI client, authenticated with the API key.
client = genai.Client(api_key=GOOGLE_API_KEY)
def load_topics(topics_file=None):
    """Load the list of topics from a JSON file.

    Args:
        topics_file: Optional path to a JSON document with a top-level
            "topics" key. When omitted, the module-level TOPICS_FILE is
            used, so existing zero-argument callers behave as before.

    Returns:
        The value stored under "topics", or an empty list when that key
        is absent.
    """
    # Resolve the default at call time so the module constant is only
    # consulted when no explicit path is supplied.
    path = TOPICS_FILE if topics_file is None else topics_file
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data.get("topics", [])
def get_topics_hash(topics):
    """Return an MD5 hex digest that fingerprints the given topics.

    The topics are serialised to JSON (dictionary keys sorted) and the UTF-8
    bytes of that text are hashed. The digest is used only to detect that
    the topics changed since the embedding cache was written.
    """
    serialized = json.dumps(topics, sort_keys=True)
    digest = hashlib.md5()
    digest.update(serialized.encode('utf-8'))
    return digest.hexdigest()
def load_cached_embeddings():
    """Load cached topic embeddings if the cache exists and is still valid.

    The cache is accepted only when the topics hash stored in it matches the
    hash of the current topics and it holds exactly one embedding per topic.
    The count check matters because callers index the topic list with a
    position taken from the embedding matrix.

    Returns:
        A list of numpy arrays (one per topic), or None when there is no
        cache file, the cache is stale, or it cannot be read or parsed.
    """
    if not EMBEDDINGS_CACHE_FILE.exists():
        return None
    try:
        with open(EMBEDDINGS_CACHE_FILE, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)
        current_topics = load_topics()
        # Topics changed since the cache was written: cache is invalid.
        if cache_data.get("topics_hash") != get_topics_hash(current_topics):
            return None
        cached = cache_data.get("embeddings", [])
        # A truncated or hand-edited cache would misalign topics and vectors.
        if len(cached) != len(current_topics):
            return None
        # Convert list embeddings back to numpy arrays.
        return [np.array(emb) for emb in cached]
    except (OSError, json.JSONDecodeError, KeyError, ValueError,
            TypeError, AttributeError) as e:
        # The cache is a best-effort optimisation: an unreadable file or an
        # unexpected JSON shape is reported and treated as a cache miss.
        print(f"Warning: Could not load cached embeddings: {e}")
        return None
def save_cached_embeddings(embeddings, topics):
    """Save topic embeddings to the cache file.

    The payload is written to a sibling temporary file and then moved over
    the real cache with os.replace, so an interrupted write does not leave
    a truncated cache file behind. Saving is best-effort: any failure is
    reported with a warning and otherwise ignored.

    Args:
        embeddings: Iterable of embedding vectors (numpy arrays or plain
            sequences), one per topic.
        topics: The topics the embeddings were computed for; their hash is
            stored so a later load can detect stale data.
    """
    topics_hash = get_topics_hash(topics)
    # Convert to plain lists for JSON serialization.
    embeddings_list = [np.asarray(emb).tolist() for emb in embeddings]
    cache_data = {
        "topics_hash": topics_hash,
        "embeddings": embeddings_list,
        "model": "models/text-embedding-004",
        "cached_at": datetime.now().isoformat()
    }
    tmp_file = EMBEDDINGS_CACHE_FILE.with_name(EMBEDDINGS_CACHE_FILE.name + ".tmp")
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, indent=2)
        # Swap the finished file into place in a single step.
        os.replace(tmp_file, EMBEDDINGS_CACHE_FILE)
        print(f"Cached {len(embeddings)} topic embeddings to {EMBEDDINGS_CACHE_FILE}")
    except Exception as e:
        # Deliberately broad: a failed cache write must never break the
        # caller, which already holds the embeddings in memory.
        print(f"Warning: Could not save cached embeddings: {e}")
def get_topic_embeddings():
    """Get topic embeddings, from the cache when possible.

    On a cache miss the topics are embedded with the Google embedding model
    and the result is written back to the cache.

    Returns:
        numpy.ndarray: Array of topic embeddings, one row per topic.

    Raises:
        ValueError: If topics.json contains no topics.
        RuntimeError: If the embedding API returns no embeddings or a
            different number of embeddings than topics.
    """
    topics = load_topics()
    # Nothing to embed: fail before touching the cache or the API.
    if not topics:
        raise ValueError("No topics found in topics.json")
    # Try to load from cache first.
    cached_embeddings = load_cached_embeddings()
    if cached_embeddings is not None:
        print(f"Loaded {len(cached_embeddings)} topic embeddings from cache")
        return np.array(cached_embeddings)
    # Cache miss or invalid - generate embeddings.
    print(f"Generating embeddings for {len(topics)} topics (this may take a moment)...")
    embedding_response = client.models.embed_content(
        model="models/text-embedding-004",
        contents=topics,
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
    )
    returned = getattr(embedding_response, "embeddings", None)
    # Covers a missing attribute, None, and an empty list alike.
    if not returned:
        raise RuntimeError("Embedding API did not return embeddings.")
    # Callers map row i to topics[i]; refuse to cache a misaligned result.
    if len(returned) != len(topics):
        raise RuntimeError(
            f"Embedding API returned {len(returned)} embeddings for {len(topics)} topics."
        )
    embeddings = [np.array(e.values) for e in returned]
    # Save to cache for future use.
    save_cached_embeddings(embeddings, topics)
    return np.array(embeddings)
def find_most_similar_topic(input_text: str):
    """Return the topic whose embedding is most similar to the input text.

    Only the input text is embedded here; topic embeddings come from
    get_topic_embeddings(), which serves them from the cache when it can.

    Args:
        input_text: The text to compare against topics.

    Returns:
        dict: 'topic' (the best-matching topic), 'similarity' (its cosine
        similarity as a float) and 'index' (its position in the topic list).

    Raises:
        ValueError: If input_text is not a non-blank string, or topics.json
            contains no topics.
        RuntimeError: If the embedding API returns no embedding, or the
            number of topic embeddings does not match the number of topics.
    """
    # Reject blank input before spending an API call on it.
    if not isinstance(input_text, str) or not input_text.strip():
        raise ValueError("input_text must be a non-empty string")
    # Load topics from JSON file.
    topics = load_topics()
    if not topics:
        raise ValueError("No topics found in topics.json")
    # Get topic embeddings (from cache or generate).
    topic_embeddings = get_topic_embeddings()
    # The argmax below is used to index `topics`, so both must line up.
    if len(topic_embeddings) != len(topics):
        raise RuntimeError(
            f"Got {len(topic_embeddings)} topic embeddings for {len(topics)} topics."
        )
    # Only embed the input text.
    embedding_response = client.models.embed_content(
        model="models/text-embedding-004",
        contents=[input_text],
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
    )
    returned = getattr(embedding_response, "embeddings", None)
    # Covers a missing attribute, None, and an empty list alike.
    if not returned:
        raise RuntimeError("Embedding API did not return embeddings.")
    # cosine_similarity expects 2-D input: one row holding the input vector.
    input_embedding = np.array(returned[0].values).reshape(1, -1)
    # Row 0 holds the similarity of the input to every topic.
    similarities = cosine_similarity(input_embedding, topic_embeddings)[0]
    # Pick the highest similarity.
    max_index = int(np.argmax(similarities))
    return {
        "topic": topics[max_index],
        "similarity": float(similarities[max_index]),
        "index": max_index
    }
if __name__ == "__main__":
    # Example usage: run one lookup and report how long it took. The timing
    # covers the whole call, including topic embedding on a cache miss.
    start_time = datetime.now()
    test_text = "we should abandon the use of school uniform since one should be allowed to express their individuality by the clothes they were."
    result = find_most_similar_topic(test_text)
    print(f"Input text: '{test_text}'")
    print(f"Most similar topic: '{result['topic']}'")
    # Cosine similarity is a unitless score, not a percentage, so it is
    # printed without a '%' suffix.
    print(f"Cosine similarity: {result['similarity']:.4f}")
    end_time = datetime.now()
    # Elapsed wall-clock time in seconds.
    print(f"Time taken: {(end_time - start_time).total_seconds()} seconds")

topic_similarity_langchain_example.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import json
+import os
+from datetime import datetime
+from dotenv import load_dotenv
+load_dotenv()
+from langchain_community.vectorstores import FAISS
+from langchain_core.example_selectors import (
+    SemanticSimilarityExampleSelector,
+)
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Read the topics document; the path is relative to the current working
# directory, so run this script from the directory that contains "data".
with open("data/topics.json", encoding="utf-8") as f:
    data = json.load(f)
# Normalise every raw entry into a {"topic": ...} dict
def format_examples(examples):
    """Wrap each example as a dict with a single "topic" key.

    Strings are used as-is, dicts that already have a "topic" key contribute
    that value (other keys are dropped), and anything else is converted with
    str().
    """
    def _topic_of(entry):
        if isinstance(entry, dict) and "topic" in entry:
            return entry["topic"]
        return entry if isinstance(entry, str) else str(entry)

    return [{"topic": _topic_of(entry)} for entry in examples]
# The topics live under the top-level "topics" key of topics.json
examples = data.get("topics", [])
formatted_examples = format_examples(examples)

# Timing starts here, so it covers building the FAISS-backed selector
# (which embeds the examples) as well as the single query below.
start_time = datetime.now()
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    api_key=os.getenv("GOOGLE_API_KEY")
)
example_selector = SemanticSimilarityExampleSelector.from_examples(
    examples=formatted_examples,
    embeddings=embedding_model,
    vectorstore_cls=FAISS,
    k=1,  # keep only the single closest topic
    input_keys=["topic"],
)

# Demonstration query (remove in production)
query = {"topic": "people who are terminally ill and suffering greatly should have the right to end their own life if they so desire."}
result = example_selector.select_examples(query)
print(result)

end_time = datetime.now()
print(f"Time taken: {(end_time - start_time).total_seconds()} seconds")