Spaces:

charanKompala
/

business-analyst-gpt

Sleeping

File size: 17,413 Bytes

6ee1b87

import gradio as gr
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

def smart_dataset_analysis(file):
    """Complete dataset analysis with intelligent recommendations"""
    if not file: return "Upload a dataset to get intelligent analysis."
    
    try:
        # Smart file reading
        df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)
        
        # Deep dataset profiling
        profile = {
            'shape': df.shape,
            'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
            'missing_data': df.isnull().sum().to_dict(),
            'duplicates': df.duplicated().sum(),
            'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        }
        
        # Smart target detection with scoring
        target_scores = {}
        target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales', 'churn', 'rating', 'status']
        
        for col in df.columns:
            score = 0
            col_lower = col.lower()
            
            # Keyword matching
            score += 15 if any(kw in col_lower for kw in target_keywords) else 0
            
            # Statistical characteristics
            if col in profile['numeric_cols']:
                unique_ratio = df[col].nunique() / len(df)
                if 0.02 < unique_ratio < 0.95:  # Good target range
                    score += 10
                if df[col].std() > 0:  # Has variance
                    score += 5
            elif col in profile['categorical_cols']:
                unique_count = df[col].nunique()
                if 2 <= unique_count <= 20:  # Good classification range
                    score += 12
                    score += 5 if unique_count == 2 else 0  # Binary bonus
            
            # Position bias (targets often at end)
            if list(df.columns).index(col) >= len(df.columns) - 3:
                score += 3
                
            target_scores[col] = score
        
        # Get top targets
        top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        smart_targets = [col for col, score in top_targets if score > 8]
        
        # Generate ML recommendations for each target
        ml_recommendations = []
        for target in smart_targets[:2]:  # Top 2 targets
            target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
            unique_vals = df[target].nunique()
            
            # Smart model selection based on actual data
            if target_type == 'numeric':
                skewness = df[target].skew()
                if abs(skewness) > 2:
                    models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
                    reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
                else:
                    models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
                    reason = f"Normal distribution - linear models suitable"
            else:
                if unique_vals == 2:
                    models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
                    reason = f"Binary classification - balanced approach recommended"
                elif unique_vals <= 5:
                    models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
                    reason = f"{unique_vals} classes - multi-class classification"
                else:
                    models = ['XGBoost', 'Neural Networks', 'Random Forest']
                    reason = f"{unique_vals} classes - complex multi-class problem"
            
            ml_recommendations.append({
                'target': target,
                'type': target_type,
                'unique_vals': unique_vals,
                'models': models,
                'reason': reason,
                'features': [c for c in df.columns if c != target]
            })
        
        # Smart visualization recommendations
        viz_recs = []
        
        # Executive KPIs
        for col in profile['numeric_cols'][:4]:
            viz_recs.append(f"📊 KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {'↗️' if df[col].corr(pd.Series(range(len(df)))) > 0 else '↘️'})")
        
        # Comparative analysis
        for cat in profile['categorical_cols'][:2]:
            for num in profile['numeric_cols'][:2]:
                if df[cat].nunique() <= 10:
                    avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
                    viz_recs.append(f"📊 Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")
        
        # Correlation insights
        if len(profile['numeric_cols']) >= 2:
            corr_matrix = df[profile['numeric_cols']].corr()
            strongest_corr = corr_matrix.abs().unstack().sort_values(ascending=False)
            strongest_corr = strongest_corr[strongest_corr < 1.0].head(3)
            for (var1, var2), corr_val in strongest_corr.items():
                viz_recs.append(f"🔗 Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")
        
        # Distribution insights
        for col in profile['numeric_cols'][:3]:
            q1, q3 = df[col].quantile([0.25, 0.75])
            outliers = len(df[(df[col] < q1 - 1.5*(q3-q1)) | (df[col] > q3 + 1.5*(q3-q1))])
            viz_recs.append(f"📈 Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")
        
        # Generate comprehensive report
        report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS

## 📊 Dataset Intelligence
**Rows:** {profile['shape'][0]:,} | **Columns:** {profile['shape'][1]} | **Size:** {profile['memory_mb']} MB | **Duplicates:** {profile['duplicates']}

**Data Quality Score:** {((len(df) - sum(profile['missing_data'].values())) / (len(df) * len(df.columns)) * 100):.1f}%

## 🎯 SMART TARGET DETECTION
"""
        
        for i, (target, score) in enumerate(top_targets[:3], 1):
            target_info = f"**{i}. {target}** (Confidence: {score}/20)"
            if target in smart_targets:
                target_info += f" ✅ **RECOMMENDED**"
                if target in profile['numeric_cols']:
                    target_info += f" | Values: {df[target].min():.2f} - {df[target].max():.2f} | Mean: {df[target].mean():.2f}"
                else:
                    top_values = df[target].value_counts().head(3).to_dict()
                    target_info += f" | Top categories: {top_values}"
            report += f"{target_info}\n"
        
        report += f"\n## 🤖 ML MODEL RECOMMENDATIONS\n"
        
        for i, rec in enumerate(ml_recommendations, 1):
            report += f"""### Scenario {i}: Predict `{rec['target']}`
**Problem Type:** {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'}
**AI Reasoning:** {rec['reason']}
**Recommended Models:**
1. 🥇 **{rec['models'][0]}** (Primary choice)
2. 🥈 **{rec['models'][1]}** (Alternative)  
3. 🥉 **{rec['models'][2]}** (Backup option)
**Features Available:** {len(rec['features'])} variables
**Data Split:** {int(len(df)*0.8):,} train / {int(len(df)*0.2):,} test

"""
        
        # Data preprocessing recommendations
        missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
        high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]
        
        report += f"""## ⚙️ PREPROCESSING PIPELINE
**Missing Data:** {len(missing_cols)} columns need attention
"""
        if missing_cols:
            for col in missing_cols[:5]:
                missing_pct = (profile['missing_data'][col] / len(df)) * 100
                strategy = "Drop column" if missing_pct > 50 else ("Median fill" if col in profile['numeric_cols'] else "Mode fill")
                report += f"- `{col}`: {missing_pct:.1f}% missing → {strategy}\n"
        
        if high_cardinality:
            report += f"**High Cardinality:** {len(high_cardinality)} categorical columns need encoding\n"
            for col in high_cardinality[:3]:
                report += f"- `{col}`: {df[col].nunique()} categories → Target encoding recommended\n"
        
        report += f"\n## 📊 SMART VISUALIZATIONS\n"
        for viz in viz_recs:
            report += f"{viz}\n"
        
        # Performance predictions
        sample_size_category = "Large" if len(df) > 50000 else ("Medium" if len(df) > 5000 else "Small")
        feature_ratio = len(df.columns) / len(df)
        
        report += f"""
## 🚀 IMPLEMENTATION ROADMAP

### Phase 1: Data Preparation (Week 1)
- Handle {len(missing_cols)} missing data issues
- Encode {len(profile['categorical_cols'])} categorical variables
- Feature scaling for {len(profile['numeric_cols'])} numeric variables

### Phase 2: Model Development (Week 2-3)
- {sample_size_category} dataset → Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'}
- Feature importance analysis using top recommended models
- Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy

### Phase 3: Production (Week 4)
- Model deployment pipeline
- Performance monitoring dashboard
- A/B testing framework

## 💡 BUSINESS IMPACT PREDICTION
**Model Accuracy Expectation:** {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+
**ROI Timeline:** 3-6 months
**Automation Potential:** {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis
"""
        
        return report
        
    except Exception as e:
        return f"Analysis failed: {str(e)}\nEnsure file is valid CSV/Excel format."

def smart_ba_chat(message, history):
    """Intelligent BA assistant with contextual responses"""
    
    # Smart response generation based on keywords
    keywords = message.lower()
    
    if any(word in keywords for word in ['requirement', 'functional', 'specification']):
        response = f"""## 📋 Smart Requirements Analysis for: "{message}"

### 🎯 Identified Requirements Type
**Primary:** {'Functional' if 'function' in keywords else 'Business'} Requirements
**Complexity:** {'High' if len(message.split()) > 20 else 'Medium'}

### 📝 Generated Requirements Framework
1. **Must Have (P1)**
   - Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
   - User authentication and authorization
   - Data validation and error handling

2. **Should Have (P2)**  
   - Reporting and analytics dashboard
   - Export/import capabilities
   - Audit trail functionality

3. **Could Have (P3)**
   - Advanced filtering options
   - Mobile responsiveness
   - Integration APIs

### ✅ Acceptance Criteria Template
```
Given: User has appropriate permissions
When: User performs {message.split()[-1] if message.split() else 'action'}
Then: System should respond within 3 seconds
And: Changes should be logged for audit
```

### 🔍 Next Steps
- [ ] Stakeholder validation workshop
- [ ] Technical feasibility assessment  
- [ ] Resource estimation and timeline
"""
    
    elif any(word in keywords for word in ['process', 'workflow', 'procedure']):
        response = f"""## 🔄 Process Analysis for: "{message}"

### 📊 Current State Assessment
**Process Complexity:** {'High' if 'complex' in keywords or 'multiple' in keywords else 'Medium'}
**Stakeholders Involved:** {'Multiple departments' if 'department' in keywords else 'Single team'}

### 🎯 Identified Pain Points
- Manual handoffs and delays
- Lack of visibility and tracking
- Inconsistent execution
- No performance metrics

### 🚀 Recommended Solution
**Automation Level:** {80 if 'automate' in keywords else 60}%
**Expected Efficiency Gain:** {40 if 'improve' in keywords else 25}%
**Implementation Timeline:** {'3-6 months' if 'large' in keywords else '6-12 weeks'}

### 📈 Process Optimization Steps
1. **Map Current State** (Week 1-2)
2. **Identify Bottlenecks** (Week 3)  
3. **Design Future State** (Week 4-5)
4. **Pilot Implementation** (Week 6-8)
5. **Full Rollout** (Week 9-12)

### 🎯 Success Metrics
- Process cycle time reduction: 50%+
- Error rate reduction: 80%+
- User satisfaction score: 4.5+/5
"""
    
    elif any(word in keywords for word in ['dashboard', 'report', 'visual', 'chart']):
        response = f"""## 📊 Visualization Strategy for: "{message}"

### 🎨 Smart Chart Recommendations
**Data Type Detected:** {'Time-series' if 'time' in keywords or 'trend' in keywords else 'Categorical'}
**Audience Level:** {'Executive' if 'executive' in keywords else 'Operational'}

### 📈 Recommended Visualizations
1. **KPI Dashboard**
   - Primary metrics with trend indicators
   - Color-coded status (Red/Yellow/Green)
   - Real-time data refresh

2. **Comparative Analysis**  
   - Bar charts for category comparison
   - Heat maps for correlation analysis
   - Scatter plots for relationship insights

3. **Trend Analysis**
   - Line charts for time-based data
   - Area charts for cumulative metrics
   - Waterfall charts for variance analysis

### 🎯 Dashboard Layout Strategy
```
┌─────────────────────────────────────────┐
│          Executive Summary KPIs         │
├─────────────────┬───────────────────────┤
│   Primary Chart │    Filter Controls    │
│   (60% width)   │    & Drill-downs     │
├─────────────────┴───────────────────────┤
│           Supporting Analytics          │
└─────────────────────────────────────────┘
```

### 💡 Business Intelligence Features
- Interactive filtering and drill-down
- Automated insights and anomaly detection
- Mobile-responsive design
- Export capabilities (PDF, Excel, PowerPoint)
"""
    
    else:
        # Generic intelligent response
        response = f"""## 🧠 Smart Business Analysis for: "{message}"

### 🎯 Analysis Summary
**Domain:** {'Technology' if any(tech in keywords for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
**Complexity:** {'High' if len(message.split()) > 15 else 'Medium'}
**Urgency:** {'High' if any(urgent in keywords for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}

### 📊 Key Considerations
1. **Stakeholder Impact Analysis**
   - Primary users: Business operations team
   - Secondary users: Management and IT
   - Decision makers: Department heads

2. **Success Criteria Definition**  
   - Quantitative: ROI > 20%, Time savings > 30%
   - Qualitative: User satisfaction, Process efficiency
   - Timeline: {'6-8 weeks' if 'quick' in keywords else '3-4 months'}

3. **Risk Assessment**
   - Technical: {'Medium' if 'technical' in keywords else 'Low'}
   - Business: {'High' if 'change' in keywords else 'Medium'}  
   - Resource: Based on scope and timeline

### 🚀 Recommended Action Plan
**Phase 1:** Requirement gathering and stakeholder alignment
**Phase 2:** Solution design and prototype development  
**Phase 3:** Implementation and testing
**Phase 4:** Deployment and change management

### 💡 Next Steps
- Schedule stakeholder interviews
- Define detailed acceptance criteria
- Create project timeline and milestones
- Identify potential risks and mitigation strategies
"""
    
    return response

# Streamlined Gradio Interface
with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")
    
    with gr.Tabs():
        with gr.TabItem("💬 Intelligent BA Assistant"):
            chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
            msg = gr.Textbox(placeholder="Describe your business challenge...", label="Your Question", lines=2)
            
            with gr.Row():
                submit = gr.Button("🚀 Get Smart Analysis", variant="primary")
                clear = gr.Button("Clear")
                
        with gr.TabItem("📊 Complete Dataset Analysis"):
            gr.Markdown("### Upload your dataset for comprehensive AI analysis")
            
            file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
            analysis_output = gr.Textbox(label="Intelligent Analysis Report", lines=30, show_copy_button=True)
    
    # Event handlers
    def chat_respond(message, history):
        response = smart_ba_chat(message, history)
        history.append((message, response))
        return "", history
    
    msg.submit(chat_respond, [msg, chatbot], [msg, chatbot])
    submit.click(chat_respond, [msg, chatbot], [msg, chatbot])  
    clear.click(lambda: [], None, chatbot)
    analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

if __name__ == "__main__":
    demo.launch()