Spaces:

sanchitshaleen
/

chat-with-your-data

Running

chat-with-your-data / server /llm_system /core /cache.py

sanchitshaleen

Initial deployment of RAG with Gemma-3 to Hugging Face Spaces

4aec76b 6 days ago

10.3 kB

	"""Query response caching layer for production-grade RAG systems.

	This module provides intelligent in-memory caching of RAG responses with TTL-based
	expiration and automatic eviction. It's designed to dramatically improve performance
	for repeated queries in knowledge base systems.

	Performance Impact:
	- Cache hit: <100ms response time (no LLM inference needed)
	- Cache miss: ~70-90s response time (full LLM generation)
	- Speed improvement: ~700x faster for cached queries
	- Typical hit rate: 30-50% in RAG workloads with repeated questions

	Architecture:
	- Uses SHA256 hash of normalized question as cache key (global, not per-user)
	- TTL-based expiration (default 1 hour)
	- Max size: 500 responses with LRU eviction
	- Thread-safe module-level dictionary for caching

	Usage:
	>>> from cache import ResponseCache
	>>> cache = ResponseCache(ttl_seconds=3600) # 1 hour TTL
	>>> cached_answer = cache.get("What is RAG?", session_id) # Check cache
	>>> if not cached_answer:
	... answer = generate_answer(...)
	... cache.set("What is RAG?", session_id, answer) # Store response
	"""

	import hashlib
	import json
	from typing import Optional, Dict, Any
	import time
	from logger import get_logger

	log = get_logger("rag_cache")

	# Module-level cache storage (persistent across requests)
	_response_cache: Dict[str, Dict[str, Any]] = {}

	# Maximum number of responses to cache before LRU eviction
	_CACHE_MAX_SIZE = 500


	class ResponseCache:
	"""Intelligent in-memory cache for RAG responses with TTL and LRU eviction.

	This cache stores generated responses keyed by normalized questions, enabling
	rapid retrieval (<100ms) of answers to previously asked questions without
	requiring LLM inference.

	Thread Safety:
	- Uses module-level dictionary which is safe for concurrent reads
	- Write operations should be serialized (fine for typical request handling)

	Memory Management:
	- TTL-based expiration: entries automatically expire after configured duration
	- LRU eviction: oldest entries removed when cache exceeds max size
	- Per-entry tracking: creation time stored for TTL and eviction logic
	"""

	def __init__(self, ttl_seconds: int = 3600):
	"""Initialize response cache with configurable TTL.

	Args:
	ttl_seconds (int): Time-to-live for cached responses in seconds.
	Default is 3600 (1 hour).

	Example:
	>>> cache = ResponseCache(ttl_seconds=7200) # 2 hour TTL
	"""
	self.ttl_seconds = ttl_seconds
	log.info(f"🚀 ResponseCache initialized (TTL: {ttl_seconds}s, Max Size: {_CACHE_MAX_SIZE})")

	@staticmethod
	def _get_cache_key(question: str, session_id: str) -> str:
	"""Generate cache key from question AND session_id.

	Uses SHA256 hash of normalized question + session_id for per-user caching.
	This design choice means:
	- Each user gets isolated cache entries for their documents
	- Prevents cross-user cache contamination
	- Still deduplicates identical questions from same user
	- Reduces stale cache issues when documents change

	Args:
	question (str): The user's question (will be normalized: lowercase, trimmed)
	session_id (str): Session identifier (user-specific) - NOW INCLUDED IN KEY

	Returns:
	str: 16-character hexadecimal cache key (SHA256 hash prefix)

	Example:
	>>> key1 = ResponseCache._get_cache_key("What is RAG?", "user1")
	>>> key2 = ResponseCache._get_cache_key("what is rag?", "user1")
	>>> key3 = ResponseCache._get_cache_key("what is rag?", "user2")
	>>> key1 == key2 # True - same user, same normalized question
	>>> key1 == key3 # False - different users
	"""
	# Normalize question for consistent hashing AND include session_id
	cache_input = f"{session_id}:{question.strip().lower()}"
	# Return first 16 chars of SHA256 hash (still unique with very high probability)
	return hashlib.sha256(cache_input.encode()).hexdigest()[:16]

	def get(self, question: str, session_id: str) -> Optional[str]:
	"""Retrieve cached response for a question if it exists and hasn't expired.

	Performs cache lookup and automatically removes expired entries.

	Args:
	question (str): The user's question to look up in cache.
	session_id (str): Session identifier (for logging/tracing purposes).

	Returns:
	Optional[str]: The cached answer if found and not expired, None otherwise.

	Example:
	>>> cache = ResponseCache()
	>>> answer = cache.get("What is RAG?", "user123")
	>>> if answer:
	... print(f"Cache hit! Answer: {answer[:100]}...")
	... else:
	... print("Cache miss - need to generate response")
	"""
	# Generate cache key from normalized question
	cache_key = self._get_cache_key(question, session_id)
	log.info(f"🔍 Cache lookup: q='{question[:50]}...' key={cache_key}, "
	f"cache_keys={list(_response_cache.keys())}")

	# Check if key exists in cache
	if cache_key in _response_cache:
	entry = _response_cache[cache_key]

	# Check if entry has expired based on TTL
	if time.time() - entry["created_at"] > self.ttl_seconds:
	log.info(f"⏰ Cache entry expired: {cache_key}")
	del _response_cache[cache_key]
	return None

	# Cache hit: return the cached answer
	log.info(f"✅ Cache HIT: {cache_key} (saved ~70s!)")
	return entry["answer"]

	# Cache miss: no entry found
	return None

	def set(self, question: str, session_id: str, answer: str) -> None:
	"""Cache a newly generated response for future requests.

	Automatically handles cache eviction when max size is exceeded (LRU).

	Args:
	question (str): The user's question.
	session_id (str): Session identifier (for logging/tracing).
	answer (str): The generated answer to cache.

	Example:
	>>> cache = ResponseCache()
	>>> generated_answer = rag_chain.invoke({"input": "What is RAG?"})
	>>> cache.set("What is RAG?", "user123", generated_answer)
	"""
	global _response_cache

	# Generate cache key
	cache_key = self._get_cache_key(question, session_id)
	log.info(f"💾 Caching response: key={cache_key}, answer_len={len(answer)}, "
	f"cache_size_before={len(_response_cache)}")

	# Evict oldest entry if cache is full (LRU - Least Recently Used)
	if len(_response_cache) >= _CACHE_MAX_SIZE:
	oldest_key = min(_response_cache.keys(),
	key=lambda k: _response_cache[k]["created_at"])
	log.info(f"🗑️ Cache eviction (LRU): removing {oldest_key}")
	del _response_cache[oldest_key]

	# Store new cache entry with metadata
	_response_cache[cache_key] = {
	"answer": answer,
	"created_at": time.time(),
	"expires_at": time.time() + self.ttl_seconds
	}

	log.info(f"💾 Cache SET: {cache_key} (size: {len(_response_cache)}/{_CACHE_MAX_SIZE})")

	def clear(self) -> None:
	"""Clear entire cache (e.g., for testing or reset).

	Example:
	>>> cache = ResponseCache()
	>>> cache.clear() # Removes all cached responses
	"""
	global _response_cache
	_response_cache.clear()
	log.info("🗑️ Cache cleared")

	def clear_user_cache(self, session_id: str) -> None:
	"""Clear cache entries for a specific user.

	Useful when a user uploads new documents and we want fresh responses.

	Args:
	session_id (str): User/session identifier whose cache to clear

	Example:
	>>> cache = ResponseCache()
	>>> cache.clear_user_cache("user123") # Clears all entries for user123
	"""
	global _response_cache

	# Find all keys that start with this user's session_id
	keys_to_remove = [k for k in _response_cache.keys() if k.startswith(session_id[:8])]

	for key in keys_to_remove:
	del _response_cache[key]

	log.info(f"🗑️ Cleared {len(keys_to_remove)} cache entries for user: {session_id}")

	def stats(self) -> Dict[str, Any]:
	"""Get current cache statistics.

	Returns:
	Dict containing:
	- size: Current number of cached responses
	- max_size: Maximum allowed cache size
	- ttl_seconds: Time-to-live duration in seconds

	Example:
	>>> cache = ResponseCache()
	>>> stats = cache.stats()
	>>> print(f"Cache size: {stats['size']}/{stats['max_size']}")
	"""
	return {
	"size": len(_response_cache),
	"max_size": _CACHE_MAX_SIZE,
	"ttl_seconds": self.ttl_seconds,
	}


	def create_response_cache(**kwargs) -> ResponseCache:
	"""Factory function to create ResponseCache instance.

	Provides a convenient way to instantiate cache with custom parameters.

	Args:
	**kwargs: Arguments to pass to ResponseCache constructor:
	- ttl_seconds (int): Time-to-live in seconds (default: 3600)

	Returns:
	ResponseCache: Initialized cache instance with provided settings.

	Example:
	>>> # Create cache with default 1-hour TTL
	>>> cache = create_response_cache()

	>>> # Create cache with 30-minute TTL
	>>> cache = create_response_cache(ttl_seconds=1800)
	"""
	return ResponseCache(**kwargs)