"""Configuration module for RAG system. Central configuration hub for all system parameters including: - LLM Model Selection and Parameters: Chat, summarization, temperature settings - Embedding Model: Vector representation for semantic search - Chunking Strategy: Document segmentation for retrieval - Vector Database: Qdrant configuration for similarity search - Inference Infrastructure: Ollama local LLM serving - Chat History: Redis backend for multi-turn conversation state - Evaluation Metrics: DeepEval LLM-as-Judge configuration - CORS/Security: Frontend origin whitelisting - Performance: Token streaming and dummy response simulation All configuration can be overridden via environment variables or direct modification. For production deployments, review EVALUATION_TIMEOUT, REDIS_URL, and OLLAMA_BASE_URL. CST Timezone: All timestamps use CST (America/Chicago) for consistent logging across deployment. """ import os from typing import Optional from dotenv import load_dotenv # Load environment variables from .env file # override=False ensures environment variables take precedence over .env file load_dotenv(override=False) # ============================================================================ # LLM Model Configuration # ============================================================================ LLM_CHAT_MODEL_NAME: str = "gemma3:latest" # Main chat model (Ollama) LLM_CHAT_TEMPERATURE: float = 0.75 # Temperature: 0=deterministic, 1=creative LLM_SUMMARY_MODEL_NAME: str = "gemma3:latest" # For conversation summarization LLM_SUMMARY_TEMPERATURE: float = 0.5 # Lower temp for consistent summaries EMB_MODEL_NAME: str = "mxbai-embed-large:latest" # Embedding model for semantic search (deprecated, kept for backward compatibility) # ============================================================================ # Jina v4 Multi-Modal Embeddings Configuration # ============================================================================ # Enable Jina v4 for multi-modal embeddings (text + images) USE_JINA_EMBEDDINGS: bool = os.getenv("USE_JINA_EMBEDDINGS", "true").lower() == "true" # Jina v4 model settings JINA_MODEL_NAME: str = "jinaai/jina-embeddings-v4" # HuggingFace model identifier JINA_TASK: str = "retrieval" # Task-specific adapter: 'retrieval', 'text-matching', 'code' JINA_EMBEDDING_DIM: int = 2048 # Default dimension (can be truncated to 1024, 512, 256, 128) JINA_EMBEDDING_DIM_TRUNCATE: int = int(os.getenv("JINA_EMBEDDING_DIM_TRUNCATE", "1024")) # Truncate to save memory JINA_DEVICE: str = os.getenv("JINA_DEVICE", "cuda") # 'cuda' or 'cpu' JINA_BATCH_SIZE: int = 32 # Batch size for inference JINA_MAX_LENGTH: int = 32768 # Max sequence length # Image extraction settings for multi-modal documents EXTRACT_IMAGES_FROM_PDF: bool = True # Extract images from PDFs IMAGE_OUTPUT_DIR: str = os.getenv("IMAGE_OUTPUT_DIR", "user_uploads/extracted_images") # Where to store extracted images (relative to server dir) IMAGE_MAX_SIZE: tuple = (1024, 1024) # Resize images to this size IMAGE_FORMAT: str = "PNG" # Image format (PNG, JPEG, WEBP) # ============================================================================ # Content & Token Management # ============================================================================ # Maximum total tokens allowed in context window (chat_history + input + context) # Adjust based on model capability (Gemma3: ~14k useful for 32k model) MAX_CONTENT_SIZE: int = 14000 # ============================================================================ # Connection Verification # 
# ============================================================================
# Content & Token Management
# ============================================================================
# Maximum total tokens allowed in context window (chat_history + input + context)
# Adjust based on model capability (Gemma3: ~14k useful for 32k model)
MAX_CONTENT_SIZE: int = 14000

# ============================================================================
# Connection Verification
# ============================================================================
# Whether to verify LLM/embedding model availability at startup
# Set to True for Docker/production, False for local development
VERIFY_LLM_CONNECTION: bool = False
VERIFY_EMB_CONNECTION: bool = False

# ============================================================================
# Document Chunking Strategy
# ============================================================================
# Character limit per chunk (larger = fewer chunks but less precision)
DOC_CHAR_LIMIT: int = 2000
# Overlap between chunks for context continuity (prevents semantic breaks)
DOC_OVERLAP_NO: int = 250

# Token-based retrieval calculations (roughly 4 chars per token)
DOC_TOKEN_SIZE: int = DOC_CHAR_LIMIT // 4  # ~500 tokens per chunk
DOCS_NUM_COUNT: int = 3000 // DOC_TOKEN_SIZE  # ~6 documents retrieved

# ============================================================================
# Infrastructure: Ollama & Qdrant
# ============================================================================
# Ollama Configuration (local LLM inference)
# Use 'ollama' hostname in Docker Compose, 'localhost' for local development
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

# Vector Database Configuration (Qdrant)
# Use 'qdrant' hostname in Docker Compose, 'localhost' for local development
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION_NAME: str = "rag_documents"  # Collection stores all document chunks
QDRANT_API_KEY: Optional[str] = os.getenv("QDRANT_API_KEY", None)  # Optional for Qdrant Cloud

# ============================================================================
# Response Streaming (Dummy Mode)
# ============================================================================
# Simulate token-by-token streaming in dummy/test mode
TOKENS_PER_SEC: int = 50  # Tokens yielded per second
BATCH_TOKEN_PS: int = 2  # Tokens per batch (for realism)

# ============================================================================
# Security & CORS
# ============================================================================
# Allowed frontend origins (for browser-based requests)
# Add Streamlit frontend and any external services here
ALLOWED_ORIGINS: list = [
    "http://localhost:8501",  # Local Streamlit development
    "http://127.0.0.1:5500",  # Local Live Server
]

# ============================================================================
# Chat History & Persistence
# ============================================================================
# Backend for conversation state: 'memory' or 'redis'
# Use 'redis' for production (persistent across restarts)
# Use 'memory' for lightweight testing
HISTORY_BACKEND: str = "redis"

# Redis connection string
# Use 'redis' hostname in Docker Compose, 'localhost' for local development
REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")

# Session TTL in seconds (0 = no expiry, 2592000 = 30 days)
# Set to auto-expire old sessions to save memory
REDIS_HISTORY_TTL_SECONDS: int = 0
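# Example (illustrative sketch only): how the Redis settings above might be consumed
# with redis-py. The key name and serialized payload are assumptions for illustration;
# the actual history backend is implemented outside this module.
#
#   import redis
#
#   client = redis.Redis.from_url(REDIS_URL)
#   # Append a serialized conversation turn to the session's history list
#   client.rpush("chat:session-id", '{"role": "user", "content": "hello"}')
#   # Auto-expire the session key only when a TTL is configured
#   if REDIS_HISTORY_TTL_SECONDS > 0:
#       client.expire("chat:session-id", REDIS_HISTORY_TTL_SECONDS)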
# ============================================================================
# Evaluation & Metrics (DeepEval)
# ============================================================================
# Enable/disable LLM-as-Judge evaluation metrics
# False = faster response times (<100ms cache hits unaffected)
# True = adds 5-8s for faithfulness & answer relevancy evaluation
ENABLE_METRICS_EVALUATION: bool = (
    os.getenv("ENABLE_METRICS_EVALUATION", "false").lower() == "true"
)

# Timeout for complete evaluation suite (seconds)
# Production: 3-5s, Development: 8-10s
EVALUATION_TIMEOUT: float = float(os.getenv("EVALUATION_TIMEOUT", "8.0"))
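# Example (illustrative sketch only): how EVALUATION_TIMEOUT might bound the
# evaluation step so slow judgments never delay the user-facing response.
# `run_evaluation_suite` is a hypothetical coroutine, not defined by this project.
#
#   import asyncio
#
#   async def evaluate_with_budget(question: str, answer: str, contexts: list[str]):
#       try:
#           return await asyncio.wait_for(
#               run_evaluation_suite(question, answer, contexts),
#               timeout=EVALUATION_TIMEOUT,
#           )
#       except asyncio.TimeoutError:
#           return None  # Skip metrics rather than block the reply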