import gradio as gr
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')


def smart_dataset_analysis(file):
    """Complete dataset analysis with intelligent recommendations."""
    if not file:
        return "Upload a dataset to get intelligent analysis."
    try:
        # Smart file reading: CSV by extension, otherwise assume Excel
        df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)

        # Deep dataset profiling
        profile = {
            'shape': df.shape,
            'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
            'missing_data': df.isnull().sum().to_dict(),
            'duplicates': df.duplicated().sum(),
            'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        }

        # Smart target detection with scoring
        target_scores = {}
        target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales', 'churn', 'rating', 'status']
        for col in df.columns:
            score = 0
            col_lower = col.lower()
            # Keyword matching
            score += 15 if any(kw in col_lower for kw in target_keywords) else 0
            # Statistical characteristics
            if col in profile['numeric_cols']:
                unique_ratio = df[col].nunique() / len(df)
                if 0.02 < unique_ratio < 0.95:  # Good target range
                    score += 10
                if df[col].std() > 0:  # Has variance
                    score += 5
            elif col in profile['categorical_cols']:
                unique_count = df[col].nunique()
                if 2 <= unique_count <= 20:  # Good classification range
                    score += 12
                score += 5 if unique_count == 2 else 0  # Binary bonus
            # Position bias (targets often at end)
            if list(df.columns).index(col) >= len(df.columns) - 3:
                score += 3
            target_scores[col] = score

        # Get top targets
        top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        smart_targets = [col for col, score in top_targets if score > 8]
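        # Note on the scoring above: with these weights a numeric column can
        # score at most 33 (15 keyword + 10 range + 5 variance + 3 position)
        # and a categorical column at most 35, so the score > 8 cutoff means a
        # column needs at least one strong statistical signal even when no
        # keyword matches (position bias alone, at 3 points, never qualifies).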

        # Generate ML recommendations for each target
        ml_recommendations = []
        for target in smart_targets[:2]:  # Top 2 targets
            target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
            unique_vals = df[target].nunique()
            # Smart model selection based on actual data
            if target_type == 'numeric':
                skewness = df[target].skew()
                if abs(skewness) > 2:
                    models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
                    reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
                else:
                    models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
                    reason = f"Low skew (skew: {skewness:.2f}) - linear models are suitable"
            else:
                if unique_vals == 2:
                    models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
                    reason = "Binary classification - balanced approach recommended"
                elif unique_vals <= 5:
                    models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
                    reason = f"{unique_vals} classes - multi-class classification"
                else:
                    models = ['XGBoost', 'Neural Networks', 'Random Forest']
                    reason = f"{unique_vals} classes - complex multi-class problem"
            ml_recommendations.append({
                'target': target,
                'type': target_type,
                'unique_vals': unique_vals,
                'models': models,
                'reason': reason,
                'features': [c for c in df.columns if c != target]
            })

        # Smart visualization recommendations
        viz_recs = []
        # Executive KPIs: latest value plus a crude trend from correlation with row order
        row_order = pd.Series(range(len(df)), index=df.index)
        for col in profile['numeric_cols'][:4]:
            trend = '⬆️' if df[col].corr(row_order) > 0 else '⬇️'
            viz_recs.append(f"📊 KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {trend})")
        # Comparative analysis
        for cat in profile['categorical_cols'][:2]:
            for num in profile['numeric_cols'][:2]:
                if df[cat].nunique() <= 10:
                    avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
                    viz_recs.append(f"📊 Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")
        # Correlation insights: keep only the upper triangle so each pair is counted once
        if len(profile['numeric_cols']) >= 2:
            corr_matrix = df[profile['numeric_cols']].corr()
            upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
            strongest_corr = upper.abs().unstack().dropna().sort_values(ascending=False).head(3)
            for (var1, var2), corr_val in strongest_corr.items():
                viz_recs.append(f"📈 Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")
        # Distribution insights: flag outliers with the 1.5 * IQR rule
        for col in profile['numeric_cols'][:3]:
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            outliers = len(df[(df[col] < q1 - 1.5 * iqr) | (df[col] > q3 + 1.5 * iqr)])
            viz_recs.append(f"📊 Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")

        # Generate comprehensive report
        total_cells = len(df) * len(df.columns)
        quality_score = (total_cells - sum(profile['missing_data'].values())) / total_cells * 100
        report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS

## 📊 Dataset Intelligence
**Rows:** {profile['shape'][0]:,} | **Columns:** {profile['shape'][1]} | **Size:** {profile['memory_mb']} MB | **Duplicates:** {profile['duplicates']}
**Data Quality Score:** {quality_score:.1f}% (non-missing cells / total cells)

## 🎯 SMART TARGET DETECTION
"""
        for i, (target, score) in enumerate(top_targets, 1):
            target_info = f"**{i}. {target}** (Confidence score: {score})"
            if target in smart_targets:
                target_info += " ✅ **RECOMMENDED**"
            if target in profile['numeric_cols']:
                target_info += f" | Values: {df[target].min():.2f} - {df[target].max():.2f} | Mean: {df[target].mean():.2f}"
            else:
                top_values = df[target].value_counts().head(3).to_dict()
                target_info += f" | Top categories: {top_values}"
            report += f"{target_info}\n"
report += f"\n## π€ ML MODEL RECOMMENDATIONS\n"
for i, rec in enumerate(ml_recommendations, 1):
report += f"""### Scenario {i}: Predict `{rec['target']}`
**Problem Type:** {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'}
**AI Reasoning:** {rec['reason']}
**Recommended Models:**
1. π₯ **{rec['models'][0]}** (Primary choice)
2. π₯ **{rec['models'][1]}** (Alternative)
3. π₯ **{rec['models'][2]}** (Backup option)
**Features Available:** {len(rec['features'])} variables
**Data Split:** {int(len(df)*0.8):,} train / {int(len(df)*0.2):,} test
"""

        # Data preprocessing recommendations
        missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
        high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]
        report += f"""## ⚙️ PREPROCESSING PIPELINE
**Missing Data:** {len(missing_cols)} columns need attention
"""
        if missing_cols:
            for col in missing_cols[:5]:
                missing_pct = (profile['missing_data'][col] / len(df)) * 100
                strategy = "Drop column" if missing_pct > 50 else ("Median fill" if col in profile['numeric_cols'] else "Mode fill")
                report += f"- `{col}`: {missing_pct:.1f}% missing → {strategy}\n"
        if high_cardinality:
            report += f"**High Cardinality:** {len(high_cardinality)} categorical columns need encoding\n"
            for col in high_cardinality[:3]:
                report += f"- `{col}`: {df[col].nunique()} categories → Target encoding recommended\n"

        report += "\n## 📊 SMART VISUALIZATIONS\n"
        for viz in viz_recs:
            report += f"{viz}\n"

        # Rough performance expectations based on dataset size
        sample_size_category = "Large" if len(df) > 50000 else ("Medium" if len(df) > 5000 else "Small")
        report += f"""
## 🚀 IMPLEMENTATION ROADMAP
### Phase 1: Data Preparation (Week 1)
- Handle {len(missing_cols)} missing data issues
- Encode {len(profile['categorical_cols'])} categorical variables
- Feature scaling for {len(profile['numeric_cols'])} numeric variables
### Phase 2: Model Development (Week 2-3)
- {sample_size_category} dataset → Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'}
- Feature importance analysis using top recommended models
- Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy
### Phase 3: Production (Week 4)
- Model deployment pipeline
- Performance monitoring dashboard
- A/B testing framework

## 💡 BUSINESS IMPACT PREDICTION
**Model Accuracy Expectation:** {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+
**ROI Timeline:** 3-6 months
**Automation Potential:** {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis
"""
        return report
    except Exception as e:
        return f"Analysis failed: {e}\nEnsure the file is a valid CSV or Excel file."


def smart_ba_chat(message, history):
    """Intelligent BA assistant with contextual responses."""
    # Smart response generation based on keywords
    keywords = message.lower()
    if any(word in keywords for word in ['requirement', 'functional', 'specification']):
        response = f"""## 📋 Smart Requirements Analysis for: "{message}"

### 🎯 Identified Requirements Type
**Primary:** {'Functional' if 'function' in keywords else 'Business'} Requirements
**Complexity:** {'High' if len(message.split()) > 20 else 'Medium'}

### 📋 Generated Requirements Framework
1. **Must Have (P1)**
   - Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
   - User authentication and authorization
   - Data validation and error handling
2. **Should Have (P2)**
   - Reporting and analytics dashboard
   - Export/import capabilities
   - Audit trail functionality
3. **Could Have (P3)**
   - Advanced filtering options
   - Mobile responsiveness
   - Integration APIs

### ✅ Acceptance Criteria Template
```
Given: User has appropriate permissions
When: User performs {message.split()[-1] if message.split() else 'action'}
Then: System should respond within 3 seconds
And: Changes should be logged for audit
```

### 📋 Next Steps
- [ ] Stakeholder validation workshop
- [ ] Technical feasibility assessment
- [ ] Resource estimation and timeline
"""
    elif any(word in keywords for word in ['process', 'workflow', 'procedure']):
        response = f"""## 🔄 Process Analysis for: "{message}"

### 📊 Current State Assessment
**Process Complexity:** {'High' if 'complex' in keywords or 'multiple' in keywords else 'Medium'}
**Stakeholders Involved:** {'Multiple departments' if 'department' in keywords else 'Single team'}

### 🎯 Identified Pain Points
- Manual handoffs and delays
- Lack of visibility and tracking
- Inconsistent execution
- No performance metrics

### 📈 Recommended Solution
**Automation Level:** {80 if 'automate' in keywords else 60}%
**Expected Efficiency Gain:** {40 if 'improve' in keywords else 25}%
**Implementation Timeline:** {'3-6 months' if 'large' in keywords else '6-12 weeks'}

### 📋 Process Optimization Steps
1. **Map Current State** (Week 1-2)
2. **Identify Bottlenecks** (Week 3)
3. **Design Future State** (Week 4-5)
4. **Pilot Implementation** (Week 6-8)
5. **Full Rollout** (Week 9-12)

### 🎯 Success Metrics
- Process cycle time reduction: 50%+
- Error rate reduction: 80%+
- User satisfaction score: 4.5+/5
"""
    elif any(word in keywords for word in ['dashboard', 'report', 'visual', 'chart']):
        response = f"""## 📊 Visualization Strategy for: "{message}"

### 🎨 Smart Chart Recommendations
**Data Type Detected:** {'Time-series' if 'time' in keywords or 'trend' in keywords else 'Categorical'}
**Audience Level:** {'Executive' if 'executive' in keywords else 'Operational'}

### 📊 Recommended Visualizations
1. **KPI Dashboard**
   - Primary metrics with trend indicators
   - Color-coded status (Red/Yellow/Green)
   - Real-time data refresh
2. **Comparative Analysis**
   - Bar charts for category comparison
   - Heat maps for correlation analysis
   - Scatter plots for relationship insights
3. **Trend Analysis**
   - Line charts for time-based data
   - Area charts for cumulative metrics
   - Waterfall charts for variance analysis

### 🎯 Dashboard Layout Strategy
```
┌─────────────────────────────────────────┐
│          Executive Summary KPIs         │
├─────────────────┬───────────────────────┤
│  Primary Chart  │   Filter Controls     │
│  (60% width)    │   & Drill-downs       │
├─────────────────┴───────────────────────┤
│          Supporting Analytics           │
└─────────────────────────────────────────┘
```

### 💡 Business Intelligence Features
- Interactive filtering and drill-down
- Automated insights and anomaly detection
- Mobile-responsive design
- Export capabilities (PDF, Excel, PowerPoint)
"""
    else:
        # Generic intelligent response
        response = f"""## 🧠 Smart Business Analysis for: "{message}"

### 🎯 Analysis Summary
**Domain:** {'Technology' if any(tech in keywords for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
**Complexity:** {'High' if len(message.split()) > 15 else 'Medium'}
**Urgency:** {'High' if any(urgent in keywords for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}

### 📋 Key Considerations
1. **Stakeholder Impact Analysis**
   - Primary users: Business operations team
   - Secondary users: Management and IT
   - Decision makers: Department heads
2. **Success Criteria Definition**
   - Quantitative: ROI > 20%, Time savings > 30%
   - Qualitative: User satisfaction, Process efficiency
   - Timeline: {'6-8 weeks' if 'quick' in keywords else '3-4 months'}
3. **Risk Assessment**
   - Technical: {'Medium' if 'technical' in keywords else 'Low'}
   - Business: {'High' if 'change' in keywords else 'Medium'}
   - Resource: Based on scope and timeline

### 🚀 Recommended Action Plan
**Phase 1:** Requirement gathering and stakeholder alignment
**Phase 2:** Solution design and prototype development
**Phase 3:** Implementation and testing
**Phase 4:** Deployment and change management

### 💡 Next Steps
- Schedule stakeholder interviews
- Define detailed acceptance criteria
- Create project timeline and milestones
- Identify potential risks and mitigation strategies
"""
    return response
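
# Hedged smoke test (an assumption, not in the original file): exercises the
# keyword router above without launching the UI. The sample prompt is made up.
#
#     print(smart_ba_chat("Draft the functional requirements for order intake", []))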


# Streamlined Gradio interface
with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")
    with gr.Tabs():
        with gr.TabItem("💬 Intelligent BA Assistant"):
            chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
            msg = gr.Textbox(placeholder="Describe your business challenge...", label="Your Question", lines=2)
            with gr.Row():
                submit = gr.Button("🚀 Get Smart Analysis", variant="primary")
                clear = gr.Button("Clear")
        with gr.TabItem("📊 Complete Dataset Analysis"):
            gr.Markdown("### Upload your dataset for comprehensive AI analysis")
            file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
            analysis_output = gr.Textbox(label="Intelligent Analysis Report", lines=30, show_copy_button=True)

    # Event handlers
    def chat_respond(message, history):
        response = smart_ba_chat(message, history)
        history.append((message, response))
        return "", history

    msg.submit(chat_respond, [msg, chatbot], [msg, chatbot])
    submit.click(chat_respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: [], None, chatbot)
    analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

if __name__ == "__main__":
    demo.launch()