import os

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import streamlit as st
import xgboost as xgb

from src.features.build_features import calculate_risk_score
from src.models.explain import generate_shap_explanation

st.set_page_config(page_title="FairValue Transfer Cap Estimator", layout="wide")

# ── Currency Config ───────────────────────────────────────────────────────────
EUR_TO_GBP = 0.85  # Approximate conversion — review quarterly

# ── Sanity limits ─────────────────────────────────────────────────────────────
# A model file below this size is treated as corrupted/truncated and is never
# handed to the XGBoost parser.
MIN_MODEL_BYTES = 1000
# Column names that may hold the "injury days" feature, in order of preference.
INJURY_COLS = ('Injury_Days_Total_24m', 'Injury_Days')

st.title("FairValue Transfer Cap Estimator")
st.markdown(
    "A rigorous, data-driven 'Transfer Ceiling Calculator' grounded in ML "
    "and Hedonic Pricing Theory."
)


# ── Data Loading ──────────────────────────────────────────────────────────────
# @st.cache_data (rather than @st.cache_resource) hands every caller its own
# copy of the DataFrame, so mutations made during one run cannot leak into
# other runs or sessions.
@st.cache_data
def load_player_data():
    """Load the processed player-features CSV.

    Every column whose name contains both "market" and "value"
    (case-insensitive) is renamed to ``market_value_in_eur``; if that creates
    duplicate column names, only the first occurrence is kept.

    Returns:
        The DataFrame, or ``None`` if the CSV file does not exist.
    """
    df_path = "data/processed/app_features.csv"
    if not os.path.exists(df_path):
        return None

    df = pd.read_csv(df_path)
    mv_rename_map = {
        col: 'market_value_in_eur'
        for col in df.columns
        if 'market' in col.lower() and 'value' in col.lower()
    }
    if mv_rename_map:
        df.rename(columns=mv_rename_map, inplace=True)
        # Renaming several columns to one name creates duplicates; keep the first.
        df = df.loc[:, ~df.columns.duplicated()].copy()
    return df


# ── Model Loading ─────────────────────────────────────────────────────────────
# @st.cache_resource is correct here — the model object is shared, not copied.
@st.cache_resource
def load_model():
    """Load the XGBoost model from the first candidate path that exists.

    Returns:
        ``(model, size_bytes, path)``. ``(None, 0, None)`` when no candidate
        file exists. When the file is smaller than ``MIN_MODEL_BYTES`` the
        returned model is left *unloaded* so the caller can report corruption
        via ``size_bytes`` instead of the parser raising on a truncated file.
    """
    xgb_model = xgb.XGBRegressor()
    for path in ("fairvalue_xgboost.json", "FairValue_xgboost.json"):
        if not os.path.exists(path):
            continue
        size = os.path.getsize(path)
        if size >= MIN_MODEL_BYTES:
            xgb_model.load_model(path)
        return xgb_model, size, path
    return None, 0, None


def _clamp(value, low, high, fallback):
    """Limit *value* to ``[low, high]``; return *fallback* if it is missing/NaN.

    Streamlit widgets raise when their default lies outside min/max, so every
    database-derived default is passed through here first.
    """
    if pd.isna(value):
        return fallback
    return min(max(value, low), high)


df = load_player_data()
model, model_size, model_path = load_model()

if df is None:
    st.error("⚠️ Data file not found: `data/processed/app_features.csv`. Re-run the pipeline.")
    st.stop()

if model is None:
    st.error("⚠️ MODEL_FILE_NOT_FOUND — ensure `fairvalue_xgboost.json` is in the project root.")
    st.stop()

if model_size < MIN_MODEL_BYTES:
    st.error(
        f"❌ MODEL CORRUPTED: `{model_path}` is only {model_size:,} bytes "
        "(expected ~400 KB). Please re-upload the correct model file."
    )
    st.stop()  # Without this, execution would continue with an unusable model.

# ── Sidebar ───────────────────────────────────────────────────────────────────
st.sidebar.header("Player Transfer Profile")
input_mode = st.sidebar.radio("Input Mode", ["Select Existing Player", "Create Custom Player"])

name_col = next(
    (c for c in ['name', 'name_x', 'Player_Name', 'Name'] if c in df.columns),
    None
)

selected_name = ""
if input_mode == "Select Existing Player":
    if name_col is None:
        st.error("No player name column found in data. Please re-run the pipeline.")
        st.stop()

    player_list = sorted(df[name_col].astype(str).unique().tolist())
    selected_name = st.sidebar.selectbox("Target Database Player", player_list)
    # First matching row only; names are not guaranteed to be unique.
    player_data = df[df[name_col].astype(str) == selected_name].iloc[0:1].copy()

    # Defaults come from the database row, clamped to each widget's range
    # (fallbacks mirror the custom-player defaults below).
    contract_years = st.sidebar.slider(
        "Contract Years Remaining", 0.5, 6.0,
        float(_clamp(player_data['Contract_Years_Left'].iloc[0], 0.5, 6.0, 2.0)), 0.5
    )
    age = st.sidebar.slider(
        "Age", 16, 40, int(_clamp(player_data['Age'].iloc[0], 16, 40, 24))
    )
    inj_col = next((c for c in INJURY_COLS if c in player_data.columns), None)
    injuries = st.sidebar.number_input(
        "Injury missed days (24m)", 0, 500,
        int(_clamp(player_data[inj_col].iloc[0], 0, 500, 10)) if inj_col else 10
    )
else:
    # Custom player: start from the column-wise median of all numeric features.
    player_data = df.median(numeric_only=True).to_frame().T
    contract_years = st.sidebar.slider("Contract Years Remaining", 0.5, 6.0, 2.0, 0.5)
    age = st.sidebar.slider("Age", 16, 40, 24)
    injuries = st.sidebar.number_input("Injury missed days (24m)", 0, 500, 10)
    # NOTE(review): placed inside the custom-player branch; the original
    # indentation was lost — confirm it should not also apply to existing players.
    if 'market_value_in_eur' in player_data.columns:
        m_val = st.sidebar.number_input("Current Market Value Estimation (£m)", 1.0, 200.0, 20.0)
        # Input is in £m; the feature is stored in EUR.
        player_data['market_value_in_eur'] = (m_val * 1_000_000) / EUR_TO_GBP

asking_price = st.sidebar.number_input("Selling Club Asking Price (£m)", 1.0, 300.0, 45.0)

# ── Hype Factor Integration (consumed from Page 3 Live Player Intel) ──────────
# Read the per-player hype premium from session_state (keyed by lower-cased
# player name) so it can be applied to the hard-cap calculation.
hype_all = st.session_state.get('player_hype_metrics', {})
hype_entry = hype_all.get(selected_name.lower(), {}) if selected_name else {}
hype_premium_pct = hype_entry.get('hype_premium_percent', 0.0)

if hype_premium_pct != 0.0:
    sign = "+" if hype_premium_pct > 0 else ""
    # Two trailing spaces before "\n" give a Markdown hard line break.
    st.sidebar.info(
        f"💡 **Hype Factor Active:** {sign}{hype_premium_pct:.1f}%  \n"
        "*Run Page 3 → Live Player Intel to refresh.*"
    )
else:
    st.sidebar.caption("No Hype Factor loaded. Run Page 3 to enrich this estimate.")

# ── Build Inference Vector ────────────────────────────────────────────────────
expected_cols = model.feature_names_in_

player_data = player_data.copy()
player_data['Contract_Years_Left'] = contract_years
player_data['Age'] = age
# Write the sidebar value back to whichever injury column the data carries,
# so the input is never silently ignored.
for _inj_name in INJURY_COLS:
    if _inj_name in player_data.columns:
        player_data[_inj_name] = injuries

# Recompute derived risk flags so they reflect the slider values, not stale DB data.
# Without this, Risk_Contract / Risk_Age / Risk_Injury don't update when sliders move.
# Use whichever injury-days column the frame actually carries; fall back to
# the canonical name when neither is present.
_risk_injury_col = next(
    (c for c in ('Injury_Days_Total_24m', 'Injury_Days') if c in player_data.columns),
    'Injury_Days_Total_24m'
)
player_data = calculate_risk_score(
    player_data,
    contract_col='Contract_Years_Left',
    age_col='Age',
    injury_col=_risk_injury_col
)

# Align to the model's training columns; features absent from the frame are 0.
X_infer = player_data.reindex(columns=expected_cols, fill_value=0)

if st.button("Calculate Prediction", type="primary"):
    raw_preds = model.predict(X_infer)
    log_pv = raw_preds[0]

    # The prediction is inverted with expm1 (i.e. treated as log1p of the fee)
    # and floored at zero.
    # NOTE(review): the value is labelled "£" below without applying
    # EUR_TO_GBP — confirm the model's target currency.
    baseline_pv = max(float(np.expm1(log_pv)), 0.0)
    baseline_pv_m = baseline_pv / 1_000_000
    # NOTE(review): literal 0.85 looks like a conservative haircut, not the
    # EUR→GBP rate — confirm.
    conservative_bound = baseline_pv * 0.85

    # Internal rule-based risk discount (transparent to users)
    risk_pct = (
        (0.20 if contract_years < 1.5 else 0.0) +
        (0.15 if age > 30 else 0.0) +
        (0.10 if injuries > 60 else 0.0)
    )

    # External hype multiplier from Page 3 NLP sentiment; floored at 0 so an
    # extreme negative premium cannot produce a negative cap.
    hype_multiplier = max(1.0 + (hype_premium_pct / 100.0), 0.0)
    hard_cap = conservative_bound * (1.0 - risk_pct) * hype_multiplier
    hard_cap_m = hard_cap / 1_000_000

    col1, col2 = st.columns([2, 1])

    with col1:
        st.subheader("Price Range Recommendation")
        # Axis must cover both the asking price and the hard cap, otherwise
        # the threshold marker can fall outside the gauge.
        gauge_max = max(asking_price * 1.2, hard_cap_m * 1.2, 100)
        fig = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=asking_price,
            delta={'reference': hard_cap_m, 'increasing': {'color': "red"}},
            title={'text': "Asking Price vs Hard Cap (£m)"},
            gauge={
                'axis': {'range': [0, gauge_max]},
                'threshold': {'line': {'color': "white", 'width': 4}, 'value': hard_cap_m}
            }
        ))
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        st.subheader("Metrics Breakdown")
        st.metric("Predicted Market Value (Baseline)", f"£{baseline_pv_m:.1f}m")
        st.metric("Risk-Adjusted Hard Cap", f"£{hard_cap_m:.1f}m")
        if hype_premium_pct != 0.0:
            sign = "+" if hype_premium_pct > 0 else ""
            st.metric("Hype / Form Adjustment", f"{sign}{hype_premium_pct:.1f}%")

        if asking_price > hard_cap_m:
            overpay = asking_price - hard_cap_m
            st.error(f"⚠️ Winner's Curse Risk: £{overpay:.1f}m Overpay")
        else:
            st.success("✅ Asking price is within Fair Value bounds.")

    # ── SHAP Explainability Panel ─────────────────────────────────────────────
    # Per-feature contributions come from generate_shap_explanation.
    st.markdown("---")
    st.subheader("🔬 XAI Explainability — What Drives This Valuation?")

    with st.spinner("Computing SHAP feature contributions..."):
        # Best-effort panel: any failure degrades to a warning rather than
        # breaking the page.
        try:
            _, explanation_df = generate_shap_explanation(model, X_infer)
            top_shap = explanation_df.head(10).copy()
            top_shap['Label'] = top_shap['Feature'].str.replace('_', ' ').str.title()

            fig_shap = go.Figure(go.Bar(
                x=top_shap['Contribution_to_LogPrice'],
                y=top_shap['Label'],
                orientation='h',
                marker_color=[
                    '#e74c3c' if v < 0 else '#2ecc71'
                    for v in top_shap['Contribution_to_LogPrice']
                ],
                text=[f"{v:+.3f}" for v in top_shap['Contribution_to_LogPrice']],
                textposition='outside',
            ))
            fig_shap.update_layout(
                title="Top 10 Feature Contributions to Transfer Fee (Log-Price Scale)",
                xaxis_title="SHAP Value (Additive impact on log transfer fee)",
                yaxis={'categoryorder': 'total ascending'},
                height=420,
                margin=dict(l=10, r=10, t=50, b=10),
            )
            st.plotly_chart(fig_shap, use_container_width=True)
            st.caption(
                "🟢 Green = boosts transfer value | 🔴 Red = depresses transfer value | "
                "Sorted by absolute magnitude."
            )
        except Exception as shap_err:
            st.warning(f"SHAP panel unavailable: {shap_err}")

    with st.expander("🛠️ Technical Deep Dive (Internal Metrics)"):
        st.write(f"**Model:** `{model_path}` ({model_size:,} bytes)")
        st.write(f"**Raw Log-Scale Prediction:** `{log_pv:.4f}`")
        mv_val = (
            X_infer['market_value_in_eur'].iloc[0]
            if 'market_value_in_eur' in X_infer.columns
            else "MISSING"
        )
        # pandas yields NumPy scalars; numpy.int64 is not a Python int, so
        # the NumPy numeric base classes are checked explicitly.
        if isinstance(mv_val, (int, float, np.integer, np.floating)):
            st.write(f"**Market Value Input (EUR):** `{mv_val:,.0f}`")
        else:
            st.write(f"**Market Value Input:** `{mv_val}`")
        st.write("**Full Feature Vector:**", X_infer)

st.markdown("---")
st.subheader("Model Performance Assessment")
st.markdown(f"**Training Set:** `{len(df):,}` transfers | **Engine:** XGBoost + RandomizedSearchCV")
st.markdown("**Validation MAE:** `~£23,980,000` | **Status:** Production Ready")