import gradio as gr
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')
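
# NOTE: pd.read_excel (used below for .xlsx/.xls uploads) needs an Excel engine such as
# openpyxl to be importable in the Space environment; it is assumed here to be listed in
# requirements.txt alongside gradio, pandas, and numpy.
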
def smart_dataset_analysis(file):
    """Complete dataset analysis with intelligent recommendations"""
    if not file:
        return "Upload a dataset to get intelligent analysis."
    try:
        # Smart file reading
        df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)

        # Deep dataset profiling
        profile = {
            'shape': df.shape,
            'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
            'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
            'missing_data': df.isnull().sum().to_dict(),
            'duplicates': df.duplicated().sum(),
            'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
        }

        # Smart target detection with scoring
        target_scores = {}
        target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales', 'churn', 'rating', 'status']
        for col in df.columns:
            score = 0
            col_lower = col.lower()
            # Keyword matching
            score += 15 if any(kw in col_lower for kw in target_keywords) else 0
            # Statistical characteristics
            if col in profile['numeric_cols']:
                unique_ratio = df[col].nunique() / len(df)
                if 0.02 < unique_ratio < 0.95:  # Good target range
                    score += 10
                if df[col].std() > 0:  # Has variance
                    score += 5
            elif col in profile['categorical_cols']:
                unique_count = df[col].nunique()
                if 2 <= unique_count <= 20:  # Good classification range
                    score += 12
                    score += 5 if unique_count == 2 else 0  # Binary bonus
            # Position bias (targets often at end)
            if list(df.columns).index(col) >= len(df.columns) - 3:
                score += 3
            target_scores[col] = score
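
        # Grounded in the scoring rules above: the maximum achievable score is 35
        # (15 keyword match + 12 class-count range + 5 binary bonus + 3 position bias),
        # which is the denominator used for the confidence figure in the report.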
        # Get top targets
        top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
        smart_targets = [col for col, score in top_targets if score > 8]

        # Generate ML recommendations for each target
        ml_recommendations = []
        for target in smart_targets[:2]:  # Top 2 targets
            target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
            unique_vals = df[target].nunique()
            # Smart model selection based on actual data
            if target_type == 'numeric':
                skewness = df[target].skew()
                if abs(skewness) > 2:
                    models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
                    reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
                else:
                    models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
                    reason = f"Low skew (skew: {skewness:.2f}) - linear models suitable"
            else:
                if unique_vals == 2:
                    models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
                    reason = "Binary classification - balanced approach recommended"
                elif unique_vals <= 5:
                    models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
                    reason = f"{unique_vals} classes - multi-class classification"
                else:
                    models = ['XGBoost', 'Neural Networks', 'Random Forest']
                    reason = f"{unique_vals} classes - complex multi-class problem"
            ml_recommendations.append({
                'target': target,
                'type': target_type,
                'unique_vals': unique_vals,
                'models': models,
                'reason': reason,
                'features': [c for c in df.columns if c != target]
            })

        # Smart visualization recommendations
        viz_recs = []
        # Executive KPIs: trend direction approximated by correlating each column with row order
        for col in profile['numeric_cols'][:4]:
            viz_recs.append(f"📊 KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {'↗️' if df[col].corr(pd.Series(range(len(df)))) > 0 else '↘️'})")
        # Comparative analysis
        for cat in profile['categorical_cols'][:2]:
            for num in profile['numeric_cols'][:2]:
                if df[cat].nunique() <= 10:
                    avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
                    viz_recs.append(f"📊 Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")
        # Correlation insights (skip mirrored pairs so each relationship appears once)
        if len(profile['numeric_cols']) >= 2:
            corr_matrix = df[profile['numeric_cols']].corr()
            strongest_corr = corr_matrix.abs().unstack().sort_values(ascending=False)
            strongest_corr = strongest_corr[strongest_corr < 1.0]
            seen_pairs = set()
            for (var1, var2), corr_val in strongest_corr.items():
                pair = frozenset((var1, var2))
                if pair in seen_pairs:
                    continue
                seen_pairs.add(pair)
                viz_recs.append(f"📈 Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")
                if len(seen_pairs) == 3:
                    break
        # Distribution insights
        for col in profile['numeric_cols'][:3]:
            q1, q3 = df[col].quantile([0.25, 0.75])
            outliers = len(df[(df[col] < q1 - 1.5*(q3-q1)) | (df[col] > q3 + 1.5*(q3-q1))])
            viz_recs.append(f"📉 Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")
        # Generate comprehensive report
        report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS
## 📊 Dataset Intelligence
**Rows:** {profile['shape'][0]:,} | **Columns:** {profile['shape'][1]} | **Size:** {profile['memory_mb']} MB | **Duplicates:** {profile['duplicates']}
**Data Quality Score:** {((len(df) * len(df.columns) - sum(profile['missing_data'].values())) / (len(df) * len(df.columns)) * 100):.1f}%
## 🎯 SMART TARGET DETECTION
"""
        for i, (target, score) in enumerate(top_targets[:3], 1):
            target_info = f"**{i}. {target}** (Confidence: {score}/35)"
            if target in smart_targets:
                target_info += " ✅ **RECOMMENDED**"
            if target in profile['numeric_cols']:
                target_info += f" | Values: {df[target].min():.2f} - {df[target].max():.2f} | Mean: {df[target].mean():.2f}"
            else:
                top_values = df[target].value_counts().head(3).to_dict()
                target_info += f" | Top categories: {top_values}"
            report += f"{target_info}\n"
| report += f"\n## π€ ML MODEL RECOMMENDATIONS\n" | |
| for i, rec in enumerate(ml_recommendations, 1): | |
| report += f"""### Scenario {i}: Predict `{rec['target']}` | |
| **Problem Type:** {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'} | |
| **AI Reasoning:** {rec['reason']} | |
| **Recommended Models:** | |
| 1. π₯ **{rec['models'][0]}** (Primary choice) | |
| 2. π₯ **{rec['models'][1]}** (Alternative) | |
| 3. π₯ **{rec['models'][2]}** (Backup option) | |
| **Features Available:** {len(rec['features'])} variables | |
| **Data Split:** {int(len(df)*0.8):,} train / {int(len(df)*0.2):,} test | |
| """ | |

        # Data preprocessing recommendations
        missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
        high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]
        report += f"""## ⚙️ PREPROCESSING PIPELINE
**Missing Data:** {len(missing_cols)} columns need attention
"""
        if missing_cols:
            for col in missing_cols[:5]:
                missing_pct = (profile['missing_data'][col] / len(df)) * 100
                strategy = "Drop column" if missing_pct > 50 else ("Median fill" if col in profile['numeric_cols'] else "Mode fill")
                report += f"- `{col}`: {missing_pct:.1f}% missing → {strategy}\n"
        if high_cardinality:
            report += f"**High Cardinality:** {len(high_cardinality)} categorical columns need encoding\n"
            for col in high_cardinality[:3]:
                report += f"- `{col}`: {df[col].nunique()} categories → Target encoding recommended\n"
| report += f"\n## π SMART VISUALIZATIONS\n" | |
| for viz in viz_recs: | |
| report += f"{viz}\n" | |
| # Performance predictions | |
| sample_size_category = "Large" if len(df) > 50000 else ("Medium" if len(df) > 5000 else "Small") | |
| feature_ratio = len(df.columns) / len(df) | |
| report += f""" | |
| ## π IMPLEMENTATION ROADMAP | |
| ### Phase 1: Data Preparation (Week 1) | |
| - Handle {len(missing_cols)} missing data issues | |
| - Encode {len(profile['categorical_cols'])} categorical variables | |
| - Feature scaling for {len(profile['numeric_cols'])} numeric variables | |
| ### Phase 2: Model Development (Week 2-3) | |
| - {sample_size_category} dataset β Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'} | |
| - Feature importance analysis using top recommended models | |
| - Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy | |
| ### Phase 3: Production (Week 4) | |
| - Model deployment pipeline | |
| - Performance monitoring dashboard | |
| - A/B testing framework | |
| ## π‘ BUSINESS IMPACT PREDICTION | |
| **Model Accuracy Expectation:** {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+ | |
| **ROI Timeline:** 3-6 months | |
| **Automation Potential:** {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis | |
| """ | |
| return report | |
    except Exception as e:
        return f"Analysis failed: {str(e)}\nEnsure file is valid CSV/Excel format."
def smart_ba_chat(message, history):
    """Intelligent BA assistant with contextual responses"""
    # Smart response generation based on keywords
    keywords = message.lower()
    if any(word in keywords for word in ['requirement', 'functional', 'specification']):
        response = f"""## 📋 Smart Requirements Analysis for: "{message}"
### 🎯 Identified Requirements Type
**Primary:** {'Functional' if 'function' in keywords else 'Business'} Requirements
**Complexity:** {'High' if len(message.split()) > 20 else 'Medium'}
### 📋 Generated Requirements Framework
1. **Must Have (P1)**
- Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
- User authentication and authorization
- Data validation and error handling
2. **Should Have (P2)**
- Reporting and analytics dashboard
- Export/import capabilities
- Audit trail functionality
3. **Could Have (P3)**
- Advanced filtering options
- Mobile responsiveness
- Integration APIs
### ✅ Acceptance Criteria Template
```
Given: User has appropriate permissions
When: User performs {message.split()[-1] if message.split() else 'action'}
Then: System should respond within 3 seconds
And: Changes should be logged for audit
```
### 🚀 Next Steps
- [ ] Stakeholder validation workshop
- [ ] Technical feasibility assessment
- [ ] Resource estimation and timeline
"""
    elif any(word in keywords for word in ['process', 'workflow', 'procedure']):
        response = f"""## 🔄 Process Analysis for: "{message}"
### 📊 Current State Assessment
**Process Complexity:** {'High' if 'complex' in keywords or 'multiple' in keywords else 'Medium'}
**Stakeholders Involved:** {'Multiple departments' if 'department' in keywords else 'Single team'}
### 🎯 Identified Pain Points
- Manual handoffs and delays
- Lack of visibility and tracking
- Inconsistent execution
- No performance metrics
### 🚀 Recommended Solution
**Automation Level:** {80 if 'automate' in keywords else 60}%
**Expected Efficiency Gain:** {40 if 'improve' in keywords else 25}%
**Implementation Timeline:** {'3-6 months' if 'large' in keywords else '6-12 weeks'}
### 📋 Process Optimization Steps
1. **Map Current State** (Week 1-2)
2. **Identify Bottlenecks** (Week 3)
3. **Design Future State** (Week 4-5)
4. **Pilot Implementation** (Week 6-8)
5. **Full Rollout** (Week 9-12)
### 🎯 Success Metrics
- Process cycle time reduction: 50%+
- Error rate reduction: 80%+
- User satisfaction score: 4.5+/5
"""
    elif any(word in keywords for word in ['dashboard', 'report', 'visual', 'chart']):
        response = f"""## 📊 Visualization Strategy for: "{message}"
### 🎨 Smart Chart Recommendations
**Data Type Detected:** {'Time-series' if 'time' in keywords or 'trend' in keywords else 'Categorical'}
**Audience Level:** {'Executive' if 'executive' in keywords else 'Operational'}
### 📊 Recommended Visualizations
1. **KPI Dashboard**
- Primary metrics with trend indicators
- Color-coded status (Red/Yellow/Green)
- Real-time data refresh
2. **Comparative Analysis**
- Bar charts for category comparison
- Heat maps for correlation analysis
- Scatter plots for relationship insights
3. **Trend Analysis**
- Line charts for time-based data
- Area charts for cumulative metrics
- Waterfall charts for variance analysis
### 🎯 Dashboard Layout Strategy
```
┌─────────────────────────────────────────┐
│          Executive Summary KPIs         │
├─────────────────┬───────────────────────┤
│  Primary Chart  │   Filter Controls     │
│  (60% width)    │   & Drill-downs       │
├─────────────────┴───────────────────────┤
│          Supporting Analytics           │
└─────────────────────────────────────────┘
```
### 💡 Business Intelligence Features
- Interactive filtering and drill-down
- Automated insights and anomaly detection
- Mobile-responsive design
- Export capabilities (PDF, Excel, PowerPoint)
"""
    else:
        # Generic intelligent response
        response = f"""## 🧠 Smart Business Analysis for: "{message}"
### 🎯 Analysis Summary
**Domain:** {'Technology' if any(tech in keywords for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
**Complexity:** {'High' if len(message.split()) > 15 else 'Medium'}
**Urgency:** {'High' if any(urgent in keywords for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}
### 📋 Key Considerations
1. **Stakeholder Impact Analysis**
- Primary users: Business operations team
- Secondary users: Management and IT
- Decision makers: Department heads
2. **Success Criteria Definition**
- Quantitative: ROI > 20%, Time savings > 30%
- Qualitative: User satisfaction, Process efficiency
- Timeline: {'6-8 weeks' if 'quick' in keywords else '3-4 months'}
3. **Risk Assessment**
- Technical: {'Medium' if 'technical' in keywords else 'Low'}
- Business: {'High' if 'change' in keywords else 'Medium'}
- Resource: Based on scope and timeline
### 🚀 Recommended Action Plan
**Phase 1:** Requirement gathering and stakeholder alignment
**Phase 2:** Solution design and prototype development
**Phase 3:** Implementation and testing
**Phase 4:** Deployment and change management
### 💡 Next Steps
- Schedule stakeholder interviews
- Define detailed acceptance criteria
- Create project timeline and milestones
- Identify potential risks and mitigation strategies
"""
    return response
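
# Note: smart_ba_chat composes its reply purely from keyword matches on `message`;
# the `history` argument is accepted to fit the chat handler signature used below
# but is not consulted when building the response.
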
# Streamlined Gradio Interface
with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")
    with gr.Tabs():
        with gr.TabItem("💬 Intelligent BA Assistant"):
            chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
            msg = gr.Textbox(placeholder="Describe your business challenge...", label="Your Question", lines=2)
            with gr.Row():
                submit = gr.Button("🚀 Get Smart Analysis", variant="primary")
                clear = gr.Button("Clear")
        with gr.TabItem("📊 Complete Dataset Analysis"):
            gr.Markdown("### Upload your dataset for comprehensive AI analysis")
            file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
            analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
            analysis_output = gr.Textbox(label="Intelligent Analysis Report", lines=30, show_copy_button=True)

    # Event handlers
    def chat_respond(message, history):
        response = smart_ba_chat(message, history)
        history.append((message, response))
        return "", history

    msg.submit(chat_respond, [msg, chatbot], [msg, chatbot])
    submit.click(chat_respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: [], None, chatbot)
    analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

if __name__ == "__main__":
    demo.launch()