File size: 17,413 Bytes
6ee1b87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
import gradio as gr
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

def smart_dataset_analysis(file):
    """Complete dataset analysis with intelligent recommendations.

    Reads an uploaded CSV/Excel file, profiles it, scores candidate target
    columns, proposes ML models and visualizations, and returns a Markdown
    report. All failures are returned as a message string (never raised) so
    the Gradio UI always receives displayable output.

    Args:
        file: Upload object exposing a ``.name`` file path (e.g. the value of
            a Gradio ``File`` component), or a falsy value when nothing was
            uploaded.

    Returns:
        str: Markdown analysis report, or a human-readable error message.
    """
    if not file:
        return "Upload a dataset to get intelligent analysis."

    try:
        # Smart file reading: CSV by extension, otherwise assume Excel.
        df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)

        # Guard: everything below divides by len(df) / column count.
        if df.empty or df.shape[1] == 0:
            return "Analysis failed: the uploaded file contains no data."

        # Deep dataset profiling
        profile = {
            'shape': df.shape,
            'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
            'missing_data': df.isnull().sum().to_dict(),
            'duplicates': df.duplicated().sum(),
            'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        }

        # Smart target detection: each column accumulates heuristic points.
        target_scores = {}
        target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales', 'churn', 'rating', 'status']

        for col in df.columns:
            score = 0
            col_lower = col.lower()

            # Keyword matching: names that sound like labels score highest.
            score += 15 if any(kw in col_lower for kw in target_keywords) else 0

            # Statistical characteristics
            if col in profile['numeric_cols']:
                unique_ratio = df[col].nunique() / len(df)
                if 0.02 < unique_ratio < 0.95:  # Good target range (not an ID, not a constant)
                    score += 10
                if df[col].std() > 0:  # Has variance
                    score += 5
            elif col in profile['categorical_cols']:
                unique_count = df[col].nunique()
                if 2 <= unique_count <= 20:  # Good classification range
                    score += 12
                    score += 5 if unique_count == 2 else 0  # Binary bonus
            # datetime columns score only keyword/position points

            # Position bias (targets often appear among the last columns)
            if list(df.columns).index(col) >= len(df.columns) - 3:
                score += 3

            target_scores[col] = score

        # Get top 3 candidates; "smart" targets must clear a minimum score.
        top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        smart_targets = [col for col, score in top_targets if score > 8]

        # Generate ML recommendations for the top 2 smart targets.
        ml_recommendations = []
        for target in smart_targets[:2]:
            target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
            unique_vals = df[target].nunique()

            # Smart model selection based on actual data
            if target_type == 'numeric':
                skewness = df[target].skew()
                if abs(skewness) > 2:
                    models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
                    reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
                else:
                    models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
                    reason = f"Normal distribution - linear models suitable"
            else:
                if unique_vals == 2:
                    models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
                    reason = f"Binary classification - balanced approach recommended"
                elif unique_vals <= 5:
                    models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
                    reason = f"{unique_vals} classes - multi-class classification"
                else:
                    models = ['XGBoost', 'Neural Networks', 'Random Forest']
                    reason = f"{unique_vals} classes - complex multi-class problem"

            ml_recommendations.append({
                'target': target,
                'type': target_type,
                'unique_vals': unique_vals,
                'models': models,
                'reason': reason,
                'features': [c for c in df.columns if c != target]
            })

        # Smart visualization recommendations
        viz_recs = []

        # Executive KPIs: latest value + naive trend (correlation with row order).
        for col in profile['numeric_cols'][:4]:
            viz_recs.append(f"πŸ“Š KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {'↗️' if df[col].corr(pd.Series(range(len(df)))) > 0 else 'β†˜οΈ'})")

        # Comparative analysis (only for low-cardinality categoricals).
        for cat in profile['categorical_cols'][:2]:
            for num in profile['numeric_cols'][:2]:
                if df[cat].nunique() <= 10:
                    avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
                    viz_recs.append(f"πŸ“Š Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")

        # Correlation insights: top 3 UNIQUE column pairs by |correlation|.
        # BUG FIX: the unstacked matrix contains both (a,b) and (b,a); the old
        # `< 1.0` filter both kept duplicates and dropped perfectly-correlated
        # distinct columns. We now skip self-pairs and dedupe symmetric pairs.
        if len(profile['numeric_cols']) >= 2:
            corr_pairs = df[profile['numeric_cols']].corr().abs().unstack().sort_values(ascending=False)
            seen_pairs = set()
            for (var1, var2), corr_val in corr_pairs.items():
                if var1 == var2:
                    continue
                pair = frozenset((var1, var2))
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                viz_recs.append(f"πŸ”— Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")
                if len(seen_pairs) == 3:
                    break

        # Distribution insights: IQR-based outlier counts and skew.
        for col in profile['numeric_cols'][:3]:
            q1, q3 = df[col].quantile([0.25, 0.75])
            outliers = len(df[(df[col] < q1 - 1.5*(q3-q1)) | (df[col] > q3 + 1.5*(q3-q1))])
            viz_recs.append(f"πŸ“ˆ Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")

        # Generate comprehensive report.
        # BUG FIX: quality score is (total cells - missing cells) / total
        # cells; the old numerator used row count instead of cell count, so a
        # perfectly clean dataset scored ~(1 / n_columns) * 100%.
        total_cells = len(df) * len(df.columns)
        total_missing = sum(profile['missing_data'].values())
        report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS

## πŸ“Š Dataset Intelligence
**Rows:** {profile['shape'][0]:,} | **Columns:** {profile['shape'][1]} | **Size:** {profile['memory_mb']} MB | **Duplicates:** {profile['duplicates']}

**Data Quality Score:** {((total_cells - total_missing) / total_cells * 100):.1f}%

## 🎯 SMART TARGET DETECTION
"""

        for i, (target, score) in enumerate(top_targets[:3], 1):
            target_info = f"**{i}. {target}** (Confidence: {score}/20)"
            if target in smart_targets:
                target_info += f" βœ… **RECOMMENDED**"
                if target in profile['numeric_cols']:
                    target_info += f" | Values: {df[target].min():.2f} - {df[target].max():.2f} | Mean: {df[target].mean():.2f}"
                else:
                    top_values = df[target].value_counts().head(3).to_dict()
                    target_info += f" | Top categories: {top_values}"
            report += f"{target_info}\n"

        report += f"\n## πŸ€– ML MODEL RECOMMENDATIONS\n"

        for i, rec in enumerate(ml_recommendations, 1):
            report += f"""### Scenario {i}: Predict `{rec['target']}`
**Problem Type:** {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'}
**AI Reasoning:** {rec['reason']}
**Recommended Models:**
1. πŸ₯‡ **{rec['models'][0]}** (Primary choice)
2. πŸ₯ˆ **{rec['models'][1]}** (Alternative)  
3. πŸ₯‰ **{rec['models'][2]}** (Backup option)
**Features Available:** {len(rec['features'])} variables
**Data Split:** {int(len(df)*0.8):,} train / {int(len(df)*0.2):,} test

"""

        # Data preprocessing recommendations
        missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
        high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]

        report += f"""## βš™οΈ PREPROCESSING PIPELINE
**Missing Data:** {len(missing_cols)} columns need attention
"""
        if missing_cols:
            for col in missing_cols[:5]:
                missing_pct = (profile['missing_data'][col] / len(df)) * 100
                # Drop near-empty columns; otherwise impute by type.
                strategy = "Drop column" if missing_pct > 50 else ("Median fill" if col in profile['numeric_cols'] else "Mode fill")
                report += f"- `{col}`: {missing_pct:.1f}% missing β†’ {strategy}\n"

        if high_cardinality:
            report += f"**High Cardinality:** {len(high_cardinality)} categorical columns need encoding\n"
            for col in high_cardinality[:3]:
                report += f"- `{col}`: {df[col].nunique()} categories β†’ Target encoding recommended\n"

        report += f"\n## πŸ“Š SMART VISUALIZATIONS\n"
        for viz in viz_recs:
            report += f"{viz}\n"

        # Roadmap heuristics keyed on dataset size.
        sample_size_category = "Large" if len(df) > 50000 else ("Medium" if len(df) > 5000 else "Small")

        report += f"""
## πŸš€ IMPLEMENTATION ROADMAP

### Phase 1: Data Preparation (Week 1)
- Handle {len(missing_cols)} missing data issues
- Encode {len(profile['categorical_cols'])} categorical variables
- Feature scaling for {len(profile['numeric_cols'])} numeric variables

### Phase 2: Model Development (Week 2-3)
- {sample_size_category} dataset β†’ Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'}
- Feature importance analysis using top recommended models
- Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy

### Phase 3: Production (Week 4)
- Model deployment pipeline
- Performance monitoring dashboard
- A/B testing framework

## πŸ’‘ BUSINESS IMPACT PREDICTION
**Model Accuracy Expectation:** {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+
**ROI Timeline:** 3-6 months
**Automation Potential:** {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis
"""

        return report

    except Exception as e:
        # Deliberate catch-all: the UI must always get a displayable string.
        return f"Analysis failed: {str(e)}\nEnsure file is valid CSV/Excel format."

def smart_ba_chat(message, history):
    """Intelligent BA assistant with contextual responses.

    Routes the user's message to one of four canned Markdown templates
    (requirements, process/workflow, dashboards/visualization, or a generic
    fallback) via simple substring keyword matching on the lowercased
    message. The templates interpolate fragments of the message plus
    keyword-driven heuristics (complexity, urgency, timelines).

    Args:
        message: Free-text question from the user.
        history: Chat history; unused here, accepted for compatibility with
            the chat callback signature.

    Returns:
        A Markdown-formatted response string.
    """
    
    # Smart response generation based on keywords
    keywords = message.lower()
    
    # Branch 1: requirements-engineering questions.
    if any(word in keywords for word in ['requirement', 'functional', 'specification']):
        response = f"""## πŸ“‹ Smart Requirements Analysis for: "{message}"

### 🎯 Identified Requirements Type
**Primary:** {'Functional' if 'function' in keywords else 'Business'} Requirements
**Complexity:** {'High' if len(message.split()) > 20 else 'Medium'}

### πŸ“ Generated Requirements Framework
1. **Must Have (P1)**
   - Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
   - User authentication and authorization
   - Data validation and error handling

2. **Should Have (P2)**  
   - Reporting and analytics dashboard
   - Export/import capabilities
   - Audit trail functionality

3. **Could Have (P3)**
   - Advanced filtering options
   - Mobile responsiveness
   - Integration APIs

### βœ… Acceptance Criteria Template
```
Given: User has appropriate permissions
When: User performs {message.split()[-1] if message.split() else 'action'}
Then: System should respond within 3 seconds
And: Changes should be logged for audit
```

### πŸ” Next Steps
- [ ] Stakeholder validation workshop
- [ ] Technical feasibility assessment  
- [ ] Resource estimation and timeline
"""
    
    # Branch 2: process / workflow improvement questions.
    elif any(word in keywords for word in ['process', 'workflow', 'procedure']):
        response = f"""## πŸ”„ Process Analysis for: "{message}"

### πŸ“Š Current State Assessment
**Process Complexity:** {'High' if 'complex' in keywords or 'multiple' in keywords else 'Medium'}
**Stakeholders Involved:** {'Multiple departments' if 'department' in keywords else 'Single team'}

### 🎯 Identified Pain Points
- Manual handoffs and delays
- Lack of visibility and tracking
- Inconsistent execution
- No performance metrics

### πŸš€ Recommended Solution
**Automation Level:** {80 if 'automate' in keywords else 60}%
**Expected Efficiency Gain:** {40 if 'improve' in keywords else 25}%
**Implementation Timeline:** {'3-6 months' if 'large' in keywords else '6-12 weeks'}

### πŸ“ˆ Process Optimization Steps
1. **Map Current State** (Week 1-2)
2. **Identify Bottlenecks** (Week 3)  
3. **Design Future State** (Week 4-5)
4. **Pilot Implementation** (Week 6-8)
5. **Full Rollout** (Week 9-12)

### 🎯 Success Metrics
- Process cycle time reduction: 50%+
- Error rate reduction: 80%+
- User satisfaction score: 4.5+/5
"""
    
    # Branch 3: reporting / dashboard / visualization questions.
    elif any(word in keywords for word in ['dashboard', 'report', 'visual', 'chart']):
        response = f"""## πŸ“Š Visualization Strategy for: "{message}"

### 🎨 Smart Chart Recommendations
**Data Type Detected:** {'Time-series' if 'time' in keywords or 'trend' in keywords else 'Categorical'}
**Audience Level:** {'Executive' if 'executive' in keywords else 'Operational'}

### πŸ“ˆ Recommended Visualizations
1. **KPI Dashboard**
   - Primary metrics with trend indicators
   - Color-coded status (Red/Yellow/Green)
   - Real-time data refresh

2. **Comparative Analysis**  
   - Bar charts for category comparison
   - Heat maps for correlation analysis
   - Scatter plots for relationship insights

3. **Trend Analysis**
   - Line charts for time-based data
   - Area charts for cumulative metrics
   - Waterfall charts for variance analysis

### 🎯 Dashboard Layout Strategy
```
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚          Executive Summary KPIs         β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚   Primary Chart β”‚    Filter Controls    β”‚
β”‚   (60% width)   β”‚    & Drill-downs     β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚           Supporting Analytics          β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
```

### πŸ’‘ Business Intelligence Features
- Interactive filtering and drill-down
- Automated insights and anomaly detection
- Mobile-responsive design
- Export capabilities (PDF, Excel, PowerPoint)
"""
    
    # Fallback: generic analysis for anything that matched no branch.
    else:
        # Generic intelligent response
        response = f"""## 🧠 Smart Business Analysis for: "{message}"

### 🎯 Analysis Summary
**Domain:** {'Technology' if any(tech in keywords for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
**Complexity:** {'High' if len(message.split()) > 15 else 'Medium'}
**Urgency:** {'High' if any(urgent in keywords for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}

### πŸ“Š Key Considerations
1. **Stakeholder Impact Analysis**
   - Primary users: Business operations team
   - Secondary users: Management and IT
   - Decision makers: Department heads

2. **Success Criteria Definition**  
   - Quantitative: ROI > 20%, Time savings > 30%
   - Qualitative: User satisfaction, Process efficiency
   - Timeline: {'6-8 weeks' if 'quick' in keywords else '3-4 months'}

3. **Risk Assessment**
   - Technical: {'Medium' if 'technical' in keywords else 'Low'}
   - Business: {'High' if 'change' in keywords else 'Medium'}  
   - Resource: Based on scope and timeline

### πŸš€ Recommended Action Plan
**Phase 1:** Requirement gathering and stakeholder alignment
**Phase 2:** Solution design and prototype development  
**Phase 3:** Implementation and testing
**Phase 4:** Deployment and change management

### πŸ’‘ Next Steps
- Schedule stakeholder interviews
- Define detailed acceptance criteria
- Create project timeline and milestones
- Identify potential risks and mitigation strategies
"""
    
    return response

# Streamlined Gradio Interface: two tabs — a keyword-routed chat assistant
# and a file-upload dataset analyzer. Built at import time so `demo` is
# available to hosting platforms that import this module.
with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")
    
    with gr.Tabs():
        with gr.TabItem("πŸ’¬ Intelligent BA Assistant"):
            chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
            msg = gr.Textbox(placeholder="Describe your business challenge...", label="Your Question", lines=2)
            
            with gr.Row():
                submit = gr.Button("πŸš€ Get Smart Analysis", variant="primary")
                clear = gr.Button("Clear")
                
        with gr.TabItem("πŸ“Š Complete Dataset Analysis"):
            gr.Markdown("### Upload your dataset for comprehensive AI analysis")
            
            file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
            analysis_output = gr.Textbox(label="Intelligent Analysis Report", lines=30, show_copy_button=True)
    
    # Event handlers
    def chat_respond(message, history):
        # Append the (user, bot) turn to the chat history and clear the
        # textbox (first return value feeds back into `msg`).
        response = smart_ba_chat(message, history)
        history.append((message, response))
        return "", history
    
    # Send on Enter or on button click; both share the same handler.
    msg.submit(chat_respond, [msg, chatbot], [msg, chatbot])
    submit.click(chat_respond, [msg, chatbot], [msg, chatbot])  
    clear.click(lambda: [], None, chatbot)
    analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

if __name__ == "__main__":
    # Launch the Gradio server (blocking) only when run as a script.
    demo.launch()