"""Configuration module for RAG system.

Central configuration hub for all system parameters including:
- LLM Model Selection and Parameters: Chat, summarization, temperature settings
- Embedding Model: Vector representation for semantic search
- Chunking Strategy: Document segmentation for retrieval
- Vector Database: Qdrant configuration for similarity search
- Inference Infrastructure: Ollama local LLM serving
- Chat History: Redis backend for multi-turn conversation state
- Evaluation Metrics: DeepEval LLM-as-Judge configuration
- CORS/Security: Frontend origin whitelisting
- Performance: Token streaming and dummy response simulation

All configuration can be overridden via environment variables or direct modification.
For production deployments, review EVALUATION_TIMEOUT, REDIS_URL, and OLLAMA_BASE_URL.

CST Timezone: All timestamps use CST (America/Chicago) for consistent logging across deployment.
"""

import os
from typing import Optional
from dotenv import load_dotenv

# Load environment variables from .env file
# override=False ensures environment variables take precedence over .env file
load_dotenv(override=False)


# ============================================================================
# LLM Model Configuration
# ============================================================================

LLM_CHAT_MODEL_NAME: str = "gemma3:latest"              # Main chat model (Ollama)
LLM_CHAT_TEMPERATURE: float = 0.75                      # Temperature: 0=deterministic, 1=creative
LLM_SUMMARY_MODEL_NAME: str = "gemma3:latest"           # For conversation summarization
LLM_SUMMARY_TEMPERATURE: float = 0.5                    # Lower temp for consistent summaries
EMB_MODEL_NAME: str = "mxbai-embed-large:latest"        # Embedding model for semantic search (deprecated, kept for backward compatibility)

# ============================================================================
# Jina v4 Multi-Modal Embeddings Configuration
# ============================================================================

# Enable Jina v4 for multi-modal embeddings (text + images)
USE_JINA_EMBEDDINGS: bool = os.getenv("USE_JINA_EMBEDDINGS", "true").lower() == "true"

# Jina v4 model settings
JINA_MODEL_NAME: str = "jinaai/jina-embeddings-v4"      # HuggingFace model identifier
JINA_TASK: str = "retrieval"                             # Task-specific adapter: 'retrieval', 'text-matching', 'code'
JINA_EMBEDDING_DIM: int = 2048                           # Default dimension (can be truncated to 1024, 512, 256, 128)
JINA_EMBEDDING_DIM_TRUNCATE: int = int(os.getenv("JINA_EMBEDDING_DIM_TRUNCATE", "1024"))  # Truncate to save memory
JINA_DEVICE: str = os.getenv("JINA_DEVICE", "cuda")      # 'cuda' or 'cpu'
JINA_BATCH_SIZE: int = 32                                # Batch size for inference
JINA_MAX_LENGTH: int = 32768                             # Max sequence length

# Image extraction settings for multi-modal documents
EXTRACT_IMAGES_FROM_PDF: bool = True                    # Extract images from PDFs
IMAGE_OUTPUT_DIR: str = os.getenv("IMAGE_OUTPUT_DIR", "user_uploads/extracted_images")  # Where to store extracted images (relative to server dir)
IMAGE_MAX_SIZE: tuple = (1024, 1024)                    # Resize images to this size
IMAGE_FORMAT: str = "PNG"                               # Image format (PNG, JPEG, WEBP)


# ============================================================================
# Content & Token Management
# ============================================================================

# Maximum total tokens allowed in context window (chat_history + input + context)
# Adjust based on model capability (Gemma3: ~14k useful for 32k model)
MAX_CONTENT_SIZE: int = 14000


# ============================================================================
# Connection Verification
# ============================================================================

# Whether to verify LLM/embedding model availability at startup
# Set to True for Docker/production, False for local development
VERIFY_LLM_CONNECTION: bool = False
VERIFY_EMB_CONNECTION: bool = False


# ============================================================================
# Document Chunking Strategy
# ============================================================================

# Character limit per chunk (larger = fewer chunks but less precision)
DOC_CHAR_LIMIT: int = 2000
# Overlap between chunks for context continuity (prevents semantic breaks)
DOC_OVERLAP_NO: int = 250

# Token-based retrieval calculations (roughly 4 chars per token)
DOC_TOKEN_SIZE: int = DOC_CHAR_LIMIT // 4               # ~500 tokens per chunk
DOCS_NUM_COUNT: int = 3000 // DOC_TOKEN_SIZE            # ~6 documents retrieved


# ============================================================================
# Infrastructure: Ollama & Qdrant
# ============================================================================

# Ollama Configuration (local LLM inference)
# Use 'ollama' hostname in Docker Compose, 'localhost' for local development
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

# Vector Database Configuration (Qdrant)
# Use 'qdrant' hostname in Docker Compose, 'localhost' for local development
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION_NAME: str = "rag_documents"            # Collection stores all document chunks
QDRANT_API_KEY: Optional[str] = os.getenv("QDRANT_API_KEY", None)  # Optional for Qdrant Cloud


# ============================================================================
# Response Streaming (Dummy Mode)
# ============================================================================

# Simulate token-by-token streaming in dummy/test mode
TOKENS_PER_SEC: int = 50                                # Tokens yielded per second
BATCH_TOKEN_PS: int = 2                                 # Tokens per batch (for realism)


# ============================================================================
# Security & CORS
# ============================================================================

from typing import Optional

# Allowed frontend origins (for browser-based requests)
# Add Streamlit frontend and any external services here
ALLOWED_ORIGINS: list = [
	"http://localhost:8501",      # Local Streamlit development
	"http://127.0.0.1:5500",      # Local Live Server
]


# ============================================================================
# Chat History & Persistence
# ============================================================================

# Backend for conversation state: 'memory' or 'redis'
# Use 'redis' for production (persistent across restarts)
# Use 'memory' for lightweight testing
HISTORY_BACKEND: str = "redis"

# Redis connection string
# Use 'redis' hostname in Docker Compose, 'localhost' for local development
REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")

# Session TTL in seconds (0 = no expiry, 2592000 = 30 days)
# Set to auto-expire old sessions to save memory
REDIS_HISTORY_TTL_SECONDS: int = 0


# ============================================================================
# Evaluation & Metrics (DeepEval)
# ============================================================================

# Enable/disable LLM-as-Judge evaluation metrics
# False = faster response times (<100ms cache hits unaffected)
# True = adds 5-8s for faithfulness & answer relevancy evaluation
ENABLE_METRICS_EVALUATION: bool = os.getenv("ENABLE_METRICS_EVALUATION", "false").lower() == "true"

# Timeout for complete evaluation suite (seconds)
# Production: 3-5s, Development: 8-10s
EVALUATION_TIMEOUT: float = float(os.getenv("EVALUATION_TIMEOUT", "8.0"))