Spaces:

charanKompala
/

business-analyst-gpt

Sleeping

App Files Files Community

business-analyst-gpt / app.py

charanKompala

Rename app (3).py to app.py

d3a5eeb verified 7 months ago

raw

history blame contribute delete

17.4 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	from datetime import datetime
	import warnings
	warnings.filterwarnings('ignore')

	def smart_dataset_analysis(file):
	"""Complete dataset analysis with intelligent recommendations"""
	if not file: return "Upload a dataset to get intelligent analysis."

	try:
	# Smart file reading
	df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)

	# Deep dataset profiling
	profile = {
	'shape': df.shape,
	'numeric_cols': list(df.select_dtypes(include=[np.number]).columns),
	'categorical_cols': list(df.select_dtypes(include=['object', 'category']).columns),
	'datetime_cols': list(df.select_dtypes(include=['datetime64']).columns),
	'missing_data': df.isnull().sum().to_dict(),
	'duplicates': df.duplicated().sum(),
	'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
	}

	# Smart target detection with scoring
	target_scores = {}
	target_keywords = ['target', 'label', 'class', 'outcome', 'price', 'sales', 'churn', 'rating', 'status']

	for col in df.columns:
	score = 0
	col_lower = col.lower()

	# Keyword matching
	score += 15 if any(kw in col_lower for kw in target_keywords) else 0

	# Statistical characteristics
	if col in profile['numeric_cols']:
	unique_ratio = df[col].nunique() / len(df)
	if 0.02 < unique_ratio < 0.95: # Good target range
	score += 10
	if df[col].std() > 0: # Has variance
	score += 5
	elif col in profile['categorical_cols']:
	unique_count = df[col].nunique()
	if 2 <= unique_count <= 20: # Good classification range
	score += 12
	score += 5 if unique_count == 2 else 0 # Binary bonus

	# Position bias (targets often at end)
	if list(df.columns).index(col) >= len(df.columns) - 3:
	score += 3

	target_scores[col] = score

	# Get top targets
	top_targets = sorted(target_scores.items(), key=lambda x: x[1], reverse=True)[:3]
	smart_targets = [col for col, score in top_targets if score > 8]

	# Generate ML recommendations for each target
	ml_recommendations = []
	for target in smart_targets[:2]: # Top 2 targets
	target_type = 'numeric' if target in profile['numeric_cols'] else 'categorical'
	unique_vals = df[target].nunique()

	# Smart model selection based on actual data
	if target_type == 'numeric':
	skewness = df[target].skew()
	if abs(skewness) > 2:
	models = ['XGBoost Regressor', 'Random Forest', 'Gradient Boosting']
	reason = f"Highly skewed data (skew: {skewness:.2f}) - tree models handle non-linearity"
	else:
	models = ['Linear Regression', 'Ridge/Lasso', 'Random Forest']
	reason = f"Normal distribution - linear models suitable"
	else:
	if unique_vals == 2:
	models = ['Logistic Regression', 'XGBoost Classifier', 'Random Forest']
	reason = f"Binary classification - balanced approach recommended"
	elif unique_vals <= 5:
	models = ['Multinomial Logistic', 'XGBoost', 'Random Forest']
	reason = f"{unique_vals} classes - multi-class classification"
	else:
	models = ['XGBoost', 'Neural Networks', 'Random Forest']
	reason = f"{unique_vals} classes - complex multi-class problem"

	ml_recommendations.append({
	'target': target,
	'type': target_type,
	'unique_vals': unique_vals,
	'models': models,
	'reason': reason,
	'features': [c for c in df.columns if c != target]
	})

	# Smart visualization recommendations
	viz_recs = []

	# Executive KPIs
	for col in profile['numeric_cols'][:4]:
	viz_recs.append(f"📊 KPI Card: {col} (Current: {df[col].iloc[-1]:.2f}, Trend: {'↗️' if df[col].corr(pd.Series(range(len(df)))) > 0 else '↘️'})")

	# Comparative analysis
	for cat in profile['categorical_cols'][:2]:
	for num in profile['numeric_cols'][:2]:
	if df[cat].nunique() <= 10:
	avg_by_cat = df.groupby(cat)[num].mean().round(2).to_dict()
	viz_recs.append(f"📊 Bar Chart: {num} by {cat} - Averages: {avg_by_cat}")

	# Correlation insights
	if len(profile['numeric_cols']) >= 2:
	corr_matrix = df[profile['numeric_cols']].corr()
	strongest_corr = corr_matrix.abs().unstack().sort_values(ascending=False)
	strongest_corr = strongest_corr[strongest_corr < 1.0].head(3)
	for (var1, var2), corr_val in strongest_corr.items():
	viz_recs.append(f"🔗 Scatter Plot: {var1} vs {var2} (Correlation: {corr_val:.3f})")

	# Distribution insights
	for col in profile['numeric_cols'][:3]:
	q1, q3 = df[col].quantile([0.25, 0.75])
	outliers = len(df[(df[col] < q1 - 1.5(q3-q1)) \| (df[col] > q3 + 1.5(q3-q1))])
	viz_recs.append(f"📈 Histogram: {col} distribution (Outliers: {outliers}, Skew: {df[col].skew():.2f})")

	# Generate comprehensive report
	report = f"""# 🧠 INTELLIGENT DATASET ANALYSIS

	## 📊 Dataset Intelligence
	Rows: {profile['shape'][0]:,} \| Columns: {profile['shape'][1]} \| Size: {profile['memory_mb']} MB \| Duplicates: {profile['duplicates']}

	Data Quality Score: {((len(df) - sum(profile['missing_data'].values())) / (len(df) * len(df.columns)) * 100):.1f}%

	## 🎯 SMART TARGET DETECTION
	"""

	for i, (target, score) in enumerate(top_targets[:3], 1):
	target_info = f"{i}. {target} (Confidence: {score}/20)"
	if target in smart_targets:
	target_info += f" ✅ RECOMMENDED"
	if target in profile['numeric_cols']:
	target_info += f" \| Values: {df[target].min():.2f} - {df[target].max():.2f} \| Mean: {df[target].mean():.2f}"
	else:
	top_values = df[target].value_counts().head(3).to_dict()
	target_info += f" \| Top categories: {top_values}"
	report += f"{target_info}\n"

	report += f"\n## 🤖 ML MODEL RECOMMENDATIONS\n"

	for i, rec in enumerate(ml_recommendations, 1):
	report += f"""### Scenario {i}: Predict `{rec['target']}`
	Problem Type: {rec['type'].upper()} {'REGRESSION' if rec['type'] == 'numeric' else 'CLASSIFICATION'}
	AI Reasoning: {rec['reason']}
	Recommended Models:
	1. 🥇 {rec['models'][0]} (Primary choice)
	2. 🥈 {rec['models'][1]} (Alternative)
	3. 🥉 {rec['models'][2]} (Backup option)
	Features Available: {len(rec['features'])} variables
	Data Split: {int(len(df)0.8):,} train / {int(len(df)0.2):,} test

	"""

	# Data preprocessing recommendations
	missing_cols = [col for col, missing in profile['missing_data'].items() if missing > 0]
	high_cardinality = [col for col in profile['categorical_cols'] if df[col].nunique() > 20]

	report += f"""## ⚙️ PREPROCESSING PIPELINE
	Missing Data: {len(missing_cols)} columns need attention
	"""
	if missing_cols:
	for col in missing_cols[:5]:
	missing_pct = (profile['missing_data'][col] / len(df)) * 100
	strategy = "Drop column" if missing_pct > 50 else ("Median fill" if col in profile['numeric_cols'] else "Mode fill")
	report += f"- `{col}`: {missing_pct:.1f}% missing → {strategy}\n"

	if high_cardinality:
	report += f"High Cardinality: {len(high_cardinality)} categorical columns need encoding\n"
	for col in high_cardinality[:3]:
	report += f"- `{col}`: {df[col].nunique()} categories → Target encoding recommended\n"

	report += f"\n## 📊 SMART VISUALIZATIONS\n"
	for viz in viz_recs:
	report += f"{viz}\n"

	# Performance predictions
	sample_size_category = "Large" if len(df) > 50000 else ("Medium" if len(df) > 5000 else "Small")
	feature_ratio = len(df.columns) / len(df)

	report += f"""
	## 🚀 IMPLEMENTATION ROADMAP

	### Phase 1: Data Preparation (Week 1)
	- Handle {len(missing_cols)} missing data issues
	- Encode {len(profile['categorical_cols'])} categorical variables
	- Feature scaling for {len(profile['numeric_cols'])} numeric variables

	### Phase 2: Model Development (Week 2-3)
	- {sample_size_category} dataset → Expected training time: {'Minutes' if len(df) < 10000 else 'Hours'}
	- Feature importance analysis using top recommended models
	- Cross-validation with {5 if len(df) > 1000 else 3}-fold strategy

	### Phase 3: Production (Week 4)
	- Model deployment pipeline
	- Performance monitoring dashboard
	- A/B testing framework

	## 💡 BUSINESS IMPACT PREDICTION
	Model Accuracy Expectation: {85 if len(df) > 10000 and len(missing_cols) < 3 else 75}%+
	ROI Timeline: 3-6 months
	Automation Potential: {min(90, len(profile['numeric_cols']) * 10)}% of manual analysis
	"""

	return report

	except Exception as e:
	return f"Analysis failed: {str(e)}\nEnsure file is valid CSV/Excel format."

	def smart_ba_chat(message, history):
	"""Intelligent BA assistant with contextual responses"""

	# Smart response generation based on keywords
	keywords = message.lower()

	if any(word in keywords for word in ['requirement', 'functional', 'specification']):
	response = f"""## 📋 Smart Requirements Analysis for: "{message}"

	### 🎯 Identified Requirements Type
	Primary: {'Functional' if 'function' in keywords else 'Business'} Requirements
	Complexity: {'High' if len(message.split()) > 20 else 'Medium'}

	### 📝 Generated Requirements Framework
	1. Must Have (P1)
	- Core functionality: {message.split('.')[0] if '.' in message else message[:50]}
	- User authentication and authorization
	- Data validation and error handling

	2. Should Have (P2)
	- Reporting and analytics dashboard
	- Export/import capabilities
	- Audit trail functionality

	3. Could Have (P3)
	- Advanced filtering options
	- Mobile responsiveness
	- Integration APIs

	### ✅ Acceptance Criteria Template
	```
	Given: User has appropriate permissions
	When: User performs {message.split()[-1] if message.split() else 'action'}
	Then: System should respond within 3 seconds
	And: Changes should be logged for audit
	```

	### 🔍 Next Steps
	- [ ] Stakeholder validation workshop
	- [ ] Technical feasibility assessment
	- [ ] Resource estimation and timeline
	"""

	elif any(word in keywords for word in ['process', 'workflow', 'procedure']):
	response = f"""## 🔄 Process Analysis for: "{message}"

	### 📊 Current State Assessment
	Process Complexity: {'High' if 'complex' in keywords or 'multiple' in keywords else 'Medium'}
	Stakeholders Involved: {'Multiple departments' if 'department' in keywords else 'Single team'}

	### 🎯 Identified Pain Points
	- Manual handoffs and delays
	- Lack of visibility and tracking
	- Inconsistent execution
	- No performance metrics

	### 🚀 Recommended Solution
	Automation Level: {80 if 'automate' in keywords else 60}%
	Expected Efficiency Gain: {40 if 'improve' in keywords else 25}%
	Implementation Timeline: {'3-6 months' if 'large' in keywords else '6-12 weeks'}

	### 📈 Process Optimization Steps
	1. Map Current State (Week 1-2)
	2. Identify Bottlenecks (Week 3)
	3. Design Future State (Week 4-5)
	4. Pilot Implementation (Week 6-8)
	5. Full Rollout (Week 9-12)

	### 🎯 Success Metrics
	- Process cycle time reduction: 50%+
	- Error rate reduction: 80%+
	- User satisfaction score: 4.5+/5
	"""

	elif any(word in keywords for word in ['dashboard', 'report', 'visual', 'chart']):
	response = f"""## 📊 Visualization Strategy for: "{message}"

	### 🎨 Smart Chart Recommendations
	Data Type Detected: {'Time-series' if 'time' in keywords or 'trend' in keywords else 'Categorical'}
	Audience Level: {'Executive' if 'executive' in keywords else 'Operational'}

	### 📈 Recommended Visualizations
	1. KPI Dashboard
	- Primary metrics with trend indicators
	- Color-coded status (Red/Yellow/Green)
	- Real-time data refresh

	2. Comparative Analysis
	- Bar charts for category comparison
	- Heat maps for correlation analysis
	- Scatter plots for relationship insights

	3. Trend Analysis
	- Line charts for time-based data
	- Area charts for cumulative metrics
	- Waterfall charts for variance analysis

	### 🎯 Dashboard Layout Strategy
	```
	┌─────────────────────────────────────────┐
	│ Executive Summary KPIs │
	├─────────────────┬───────────────────────┤
	│ Primary Chart │ Filter Controls │
	│ (60% width) │ & Drill-downs │
	├─────────────────┴───────────────────────┤
	│ Supporting Analytics │
	└─────────────────────────────────────────┘
	```

	### 💡 Business Intelligence Features
	- Interactive filtering and drill-down
	- Automated insights and anomaly detection
	- Mobile-responsive design
	- Export capabilities (PDF, Excel, PowerPoint)
	"""

	else:
	# Generic intelligent response
	response = f"""## 🧠 Smart Business Analysis for: "{message}"

	### 🎯 Analysis Summary
	Domain: {'Technology' if any(tech in keywords for tech in ['system', 'software', 'digital', 'ai', 'ml']) else 'Business Operations'}
	Complexity: {'High' if len(message.split()) > 15 else 'Medium'}
	Urgency: {'High' if any(urgent in keywords for urgent in ['urgent', 'asap', 'critical']) else 'Normal'}

	### 📊 Key Considerations
	1. Stakeholder Impact Analysis
	- Primary users: Business operations team
	- Secondary users: Management and IT
	- Decision makers: Department heads

	2. Success Criteria Definition
	- Quantitative: ROI > 20%, Time savings > 30%
	- Qualitative: User satisfaction, Process efficiency
	- Timeline: {'6-8 weeks' if 'quick' in keywords else '3-4 months'}

	3. Risk Assessment
	- Technical: {'Medium' if 'technical' in keywords else 'Low'}
	- Business: {'High' if 'change' in keywords else 'Medium'}
	- Resource: Based on scope and timeline

	### 🚀 Recommended Action Plan
	Phase 1: Requirement gathering and stakeholder alignment
	Phase 2: Solution design and prototype development
	Phase 3: Implementation and testing
	Phase 4: Deployment and change management

	### 💡 Next Steps
	- Schedule stakeholder interviews
	- Define detailed acceptance criteria
	- Create project timeline and milestones
	- Identify potential risks and mitigation strategies
	"""

	return response

	# Streamlined Gradio Interface
	with gr.Blocks(title="Smart Business Analyst GPT", theme=gr.themes.Soft()) as demo:
	gr.Markdown("# 🧠 Smart Business Analyst GPT\n## AI-Powered Dataset Analysis & Business Intelligence")

	with gr.Tabs():
	with gr.TabItem("💬 Intelligent BA Assistant"):
	chatbot = gr.Chatbot(height=500, label="Smart BA Assistant")
	msg = gr.Textbox(placeholder="Describe your business challenge...", label="Your Question", lines=2)

	with gr.Row():
	submit = gr.Button("🚀 Get Smart Analysis", variant="primary")
	clear = gr.Button("Clear")

	with gr.TabItem("📊 Complete Dataset Analysis"):
	gr.Markdown("### Upload your dataset for comprehensive AI analysis")

	file_upload = gr.File(label="Upload CSV/Excel", file_types=[".csv", ".xlsx", ".xls"])
	analyze_btn = gr.Button("🧠 Complete Analysis", variant="primary", size="lg")
	analysis_output = gr.Textbox(label="Intelligent Analysis Report", lines=30, show_copy_button=True)

	# Event handlers
	def chat_respond(message, history):
	response = smart_ba_chat(message, history)
	history.append((message, response))
	return "", history

	msg.submit(chat_respond, [msg, chatbot], [msg, chatbot])
	submit.click(chat_respond, [msg, chatbot], [msg, chatbot])
	clear.click(lambda: [], None, chatbot)
	analyze_btn.click(smart_dataset_analysis, file_upload, analysis_output)

	if __name__ == "__main__":
	demo.launch()