CodeCommunity committed
Commit da07baf · verified · 1 parent: 174a4e2

Create app/predictor.py

Files changed (1): app/predictor.py (+739 −0)
app/predictor.py ADDED
@@ -0,0 +1,739 @@
# Loads the CodeBERT model used for code classification.

import logging

import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

# Basic logging to the terminal.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CodeClassifier:
    # Initializes, downloads, and caches the CodeBERT model; the first run
    # takes a while because it downloads roughly 500 MB of weights.
    def __init__(self):
        logger.info("⏳ Initializing AI Service...")

        # Detect hardware: prefer CUDA, use Apple MPS if present, otherwise CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if torch.backends.mps.is_available():
            self.device = "mps"

        logger.info(f"🚀 Running on device: {self.device}")

        # This downloads 'microsoft/codebert-base' from Hugging Face the
        # first time it runs. It is cached locally afterwards.
        try:
            logger.info(
                "📥 Loading CodeBERT Model (this may take a minute first time)..."
            )
            self.tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
            self.model = AutoModel.from_pretrained("microsoft/codebert-base").to(
                self.device
            )
            logger.info("✅ CodeBERT Loaded Successfully!")
        except Exception as e:
            logger.error(f"❌ Failed to load model: {e}")
            raise

        # Initialize semantic anchors for classification.
        self.labels = {
            "Frontend": "import react component from view styles css html dom window document state props effect",
            "Backend": "import express nest controller service entity repository database sql mongoose route api async await req res dto",
            "Security": "import auth passport jwt strategy bcrypt verify token secret guard password user login session middleware",
            "DevOps": "docker build image container kubernetes yaml env port host volume deploy pipeline stage steps runs-on",
            "Testing": "describe it expect test mock spy jest beforeall aftereach suite spec assert",
        }
        self.label_embeddings = self._precompute_label_embeddings()

    def _get_embedding(self, text: str):
        """
        Generates a 768-dim vector for the given text using CodeBERT.
        """
        # Truncate input to avoid model errors (CodeBERT's max is 512 tokens).
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the [CLS] token embedding (index 0) as the sentence representation.
        return outputs.last_hidden_state[:, 0, :]

    def _precompute_label_embeddings(self):
        """
        Computes embeddings for the category descriptions once at startup.
        """
        logger.info("🧠 Pre-computing semantic anchors for classification...")
        embeddings = {}
        for label, description in self.labels.items():
            embeddings[label] = self._get_embedding(description)
        return embeddings

    # Predicts a file's theme from its path (and optionally its content) and
    # classifies it into the most closely related category.
    def predict(self, file_path: str, content: str = None) -> dict:
        """
        Determines the 'Layer' of a file (Frontend, Backend, etc.)
        Returns: { "label": str, "confidence": float, "embedding": list[float] }
        """
        path = file_path.lower()

        # Helper to standardize the return format.
        def result(label, conf=1.0, emb=None):
            return {
                "label": label,
                "confidence": conf,
                "embedding": emb if emb is not None else [],
            }

        # 1. Fast path: rule-based checks (high precision).
        # We keep this because it's instant and correct for obvious cases.

        # Frontend indicators
        if any(
            x in path
            for x in [
                "/components/",
                "/pages/",
                "/views/",
                ".jsx",
                ".tsx",
                ".css",
                "tailwind",
            ]
        ):
            return result("Frontend")

        # Backend indicators
        if any(
            x in path
            for x in [
                "/controllers/",
                "/modules/",
                "/services/",
                ".controller.ts",
                ".service.ts",
                "dto",
            ]
        ):
            return result("Backend")

        # Security indicators
        if any(
            x in path
            for x in ["auth", "guard", "strategy", "jwt", "passport", "middleware"]
        ):
            return result("Security")

        # DevOps/config indicators
        if any(
            x in path
            for x in ["docker", "k8s", "github/workflows", "tsconfig", "package.json"]
        ):
            return result("DevOps")

        # Testing indicators
        if any(x in path for x in ["test", "spec", "e2e", "jest"]):
            return result("Testing")

        # 2. Slow path: AI-powered semantic classification (high recall).
        # If the rules are unsure, we ask the model.
        try:
            # Decide what to analyze: content (best) or path (fallback).
            text_to_analyze = content if content else file_path

            # If analyzing content, take the first 1000 chars (roughly enough
            # tokens) to capture imports and class definitions.
            if content:
                text_to_analyze = content[:1000]

            target_embedding_tensor = self._get_embedding(text_to_analyze)
            # Drop the batch dimension before converting, so callers get a flat
            # JSON-serializable list they can rebuild as a 1-D tensor.
            target_embedding_list = target_embedding_tensor.squeeze(0).tolist()

            best_label = "Generic"
            highest_score = -1.0

            for label, anchor_embedding in self.label_embeddings.items():
                # Cosine similarity: -1 to 1.
                score = F.cosine_similarity(
                    target_embedding_tensor, anchor_embedding
                ).item()
                if score > highest_score:
                    highest_score = score
                    best_label = label

            # Only accept the model's opinion if it's somewhat confident.
            if highest_score > 0.25:
                return result(best_label, highest_score, target_embedding_list)

            return result("Generic", highest_score, target_embedding_list)

        except Exception as e:
            logger.error(f"AI Classification failed for {file_path}: {e}")
            return result("Generic", 0.0)


class GuideGenerator:
    def __init__(self):
        self.tech_stacks = {
            "React": ["react", "jsx", "tsx", "next.config.js"],
            "Vue": ["vue", "nuxt.config.js"],
            "Angular": ["angular.json"],
            "Svelte": ["svelte.config.js"],
            "NestJS": ["nest-cli.json", ".module.ts"],
            "Express": ["express", "server.js", "app.js"],
            "FastAPI": ["fastapi", "main.py"],
            "Django": ["django", "manage.py"],
            "Flask": ["flask", "app.py"],
            "Spring Boot": ["pom.xml", "build.gradle", "src/main/java"],
            "Go": ["go.mod", "main.go"],
            "Rust": ["Cargo.toml", "src/main.rs"],
        }

        self.tools = {
            "Docker": ["Dockerfile", "docker-compose.yml"],
            "Kubernetes": ["k8s", "helm", "charts/"],
            "TypeScript": ["tsconfig.json", ".ts"],
            "Tailwind CSS": ["tailwind.config.js"],
            "Prisma": ["schema.prisma"],
            "GraphQL": [".graphql", "schema.gql"],
            "PostgreSQL": ["postgresql", "pg"],
            "MongoDB": ["mongoose", "mongodb"],
            "Redis": ["redis"],
        }

    def detect_stack(self, files: list[str]) -> dict:
        detected = {"languages": set(), "frameworks": set(), "tools": set()}

        for file in files:
            path = file.lower()

            # Languages
            if path.endswith(".ts") or path.endswith(".tsx"):
                detected["languages"].add("TypeScript")
            elif path.endswith(".js") or path.endswith(".jsx"):
                detected["languages"].add("JavaScript")
            elif path.endswith(".py"):
                detected["languages"].add("Python")
            elif path.endswith(".go"):
                detected["languages"].add("Go")
            elif path.endswith(".rs"):
                detected["languages"].add("Rust")
            elif path.endswith(".java"):
                detected["languages"].add("Java")

            # Frameworks. Compare case-insensitively: the path is lowercased,
            # but some indicators (e.g. "Cargo.toml") contain capitals.
            for framework, indicators in self.tech_stacks.items():
                if any(ind.lower() in path for ind in indicators):
                    detected["frameworks"].add(framework)

            # Tools
            for tool, indicators in self.tools.items():
                if any(ind.lower() in path for ind in indicators):
                    detected["tools"].add(tool)

        return detected

    def _generate_tree(self, files: list[str]) -> str:
        """
        Generates a clean ASCII tree of the project structure with
        architectural annotations.
        """
        tree = {}
        relevant_files = []

        # 1. Filter and normalize paths.
        for f in files:
            parts = f.split("/")

            # Skip noise.
            if any(
                p
                in [
                    "node_modules",
                    ".git",
                    "__pycache__",
                    "dist",
                    "build",
                    ".idea",
                    ".vscode",
                ]
                for p in parts
            ):
                continue
            if f.endswith(".DS_Store"):
                continue

            relevant_files.append(f)

        # 2. Build a nested dictionary.
        for path in relevant_files:
            parts = path.split("/")
            if len(parts) > 3:
                parts = parts[:3]

            current = tree
            for part in parts:
                current = current.setdefault(part, {})

        # 3. Define annotations.
        descriptions = {
            "src": "Core application source code",
            "app": "Main application logic",
            "components": "Reusable UI components",
            "pages": "Route/Page definitions",
            "api": "API endpoints and services",
            "utils": "Utility functions and helpers",
            "lib": "External libraries and configurations",
            "test": "Unit and integration tests",
            "tests": "Test suites",
            "docs": "Project documentation",
            "public": "Static assets (images, fonts)",
            "assets": "Static media files",
            "server": "Backend server code",
            "client": "Frontend client application",
            "config": "Configuration files",
            "scripts": "Build and maintenance scripts",
            "prisma": "Database schema and migrations",
            "graphql": "GraphQL definitions",
        }

        # 4. Render the tree.
        lines = []

        def render(node, prefix=""):
            keys = sorted(node.keys())

            # Priority sorting: put 'src', 'app', 'server', 'client' first.
            priority = ["src", "app", "client", "server", "public"]
            keys.sort(key=lambda k: (0 if k in priority else 1, k))

            if len(keys) > 12:
                keys = keys[:12] + ["..."]

            for i, key in enumerate(keys):
                is_last = i == len(keys) - 1
                connector = "└── " if is_last else "├── "

                # Add an annotation if available and it's a folder (has children).
                comment = ""
                if key in descriptions and isinstance(node.get(key), dict) and node[key]:
                    comment = f"  # {descriptions[key]}"

                lines.append(f"{prefix}{connector}{key}{comment}")

                if isinstance(node.get(key), dict) and node[key]:
                    extension = "    " if is_last else "│   "
                    render(node[key], prefix + extension)

        render(tree)
        return "\n".join(lines[:50])

    def generate_markdown(self, repo_name: str, files: list[str]) -> str:
        # 1. Perform AI analysis (the "DNA" of the repo).
        stats = {
            "Frontend": 0,
            "Backend": 0,
            "Security": 0,
            "DevOps": 0,
            "Testing": 0,
            "Generic": 0,
        }
        layer_map = {}

        low_confidence_files = []
        file_embeddings = {}  # Path -> Tensor

        for f in files:
            # We rely on CodeBERT's path embeddings here since we don't have
            # content for all files.
            prediction = classifier.predict(f)
            layer = prediction["label"]
            confidence = prediction["confidence"]

            stats[layer] += 1
            layer_map[f] = layer

            if confidence < 0.4 and layer != "Generic":
                low_confidence_files.append((f, confidence))

            # Store the embedding for coupling analysis (if available).
            if prediction["embedding"] and len(prediction["embedding"]) > 0:
                file_embeddings[f] = torch.tensor(prediction["embedding"])

        total_files = len(files) if files else 1
        primary_layer = max(stats, key=stats.get)

        # Calculate semantic couplings (top 5).
        couplings = []
        try:
            paths = list(file_embeddings.keys())
            # Limit to the first 50 files for safety and performance.
            sample_paths = paths[:50]

            for i in range(len(sample_paths)):
                for j in range(i + 1, len(sample_paths)):
                    p1, p2 = sample_paths[i], sample_paths[j]
                    # Skip pairs in the same folder.
                    if p1.rsplit("/", 1)[0] == p2.rsplit("/", 1)[0]:
                        continue

                    # Ensure tensors are on CPU for comparison.
                    t1 = file_embeddings[p1].cpu()
                    t2 = file_embeddings[p2].cpu()

                    score = F.cosine_similarity(t1.unsqueeze(0), t2.unsqueeze(0)).item()
                    if score > 0.88:
                        couplings.append((p1, p2, score))
        except Exception as e:
            logger.error(f"Failed to calculate couplings: {e}")

        couplings.sort(key=lambda x: x[2], reverse=True)
        top_couplings = couplings[:5]

        # Sort low-confidence files by lowest score.
        low_confidence_files.sort(key=lambda x: x[1])
        top_refactors = low_confidence_files[:5]

        # 2. Advanced stack & feature detection.
        stack = self.detect_stack(files)
        features = self._detect_features(files, stats)
        dev_tools = self._detect_dev_tools(files)

        install_cmd = "npm install"
        run_cmd = "npm run dev"
        test_cmd = "npm test"

        if "Python" in stack["languages"]:
            install_cmd = "pip install -r requirements.txt"
            run_cmd = "python main.py"
            test_cmd = "pytest"
        elif "Go" in stack["languages"]:
            install_cmd = "go mod download"
            run_cmd = "go run main.go"
            test_cmd = "go test ./..."
        elif "Rust" in stack["languages"]:
            install_cmd = "cargo build"
            run_cmd = "cargo run"
            test_cmd = "cargo test"

        if "Django" in stack["frameworks"]:
            run_cmd = "python manage.py runserver"
            test_cmd = "python manage.py test"
        elif "Spring Boot" in stack["frameworks"]:
            install_cmd = "./mvnw install"
            run_cmd = "./mvnw spring-boot:run"
        if "Docker" in stack["tools"]:
            run_cmd += "\n# Or using Docker\ndocker-compose up --build"

        # 3. Assemble the guide.
        md = f"# {repo_name} Developer Guide\n\n"

        md += "## AI Codebase Insights\n"
        md += "Analysis powered by **CodeBERT** semantic vectors.\n\n"
        md += f"**Project DNA:** {self._get_project_dna(stats, total_files)}\n\n"
        md += f"**Quality Check:** {self._get_testing_status(stats, total_files)}\n\n"

        if top_refactors:
            md += "### Code Health & Complexity\n"
            md += "The AI flagged the following files as **Non-Standard** or **Complex** (low confidence).\n"
            md += "These are good candidates for refactoring or documentation reviews:\n"
            for f, score in top_refactors:
                md += f"- `{f}` (Confidence: {int(score * 100)}%)\n"
            md += "\n"

        if top_couplings:
            md += "### Logical Couplings\n"
            md += "The AI detected strong semantic connections between these file pairs (they share logic but not folders):\n"
            for p1, p2, score in top_couplings:
                md += f"- `{p1}` <--> `{p2}` ({int(score * 100)}% match)\n"
            md += "\n"

        md += "### Layer Composition\n"
        md += "| Layer | Composition | Status |\n"
        md += "| :--- | :--- | :--- |\n"
        for layer, count in stats.items():
            if count > 0:
                percentage = (count / total_files) * 100
                status = "Primary" if layer == primary_layer else "Detected"
                md += f"| {layer} | {percentage:.1f}% | {status} |\n"
        md += "\n"

        md += "## Key Features\n"
        if features:
            md += "The following capabilities were inferred from the codebase structure:\n\n"
            for feature, description in features.items():
                md += f"- **{feature}**: {description}\n"
        else:
            md += "No specific high-level features (Auth, Database, etc.) were explicitly detected from the file structure.\n"
        md += "\n"

        md += "## Architecture & Technologies\n"
        md += "The project utilizes the following core technologies:\n\n"

        if stack["languages"]:
            md += "**Languages**: " + ", ".join(sorted(stack["languages"])) + "\n"
        if stack["frameworks"]:
            md += "**Frameworks**: " + ", ".join(sorted(stack["frameworks"])) + "\n"
        if stack["tools"]:
            md += "**Infrastructure**: " + ", ".join(sorted(stack["tools"])) + "\n"
        if dev_tools:
            md += "**Development Tools**: " + ", ".join(sorted(dev_tools)) + "\n"
        md += "\n"

        md += "## Getting Started\n\n"

        # Configuration section
        if any(f.endswith(".env") or f.endswith(".env.example") for f in files):
            md += "### Configuration\n"
            md += "This project relies on environment variables.\n"
            md += "1. Find the `.env.example` file in the root directory.\n"
            md += "2. Copy it to create a new `.env` file.\n"
            md += "   `cp .env.example .env`\n"
            md += "3. Fill in the required values (Database URL, API keys, etc.).\n\n"

        md += "### Prerequisites\n"
        md += "Ensure you have the following installed:\n"
        md += "- Git\n"
        if any(x in ["TypeScript", "JavaScript"] for x in stack["languages"]):
            md += "- Node.js (LTS)\n"
        if "Python" in stack["languages"]:
            md += "- Python 3.8+\n"
        if "Docker" in stack["tools"]:
            md += "- Docker Desktop\n"

        md += "\n### Installation\n"
        md += "1. Clone and enter the repository:\n"
        md += "   ```bash\n"
        md += f"   git clone https://github.com/OWNER/{repo_name}.git\n"
        md += f"   cd {repo_name}\n"
        md += "   ```\n\n"

        md += "2. Install project dependencies:\n"
        md += "   ```bash\n"
        md += f"   {install_cmd}\n"
        md += "   ```\n\n"

        md += "### Execution\n"
        md += "Start the development environment:\n"
        md += "```bash\n"
        md += f"{run_cmd}\n"
        md += "```\n\n"

        if stats["Testing"] > 0:
            md += "## Testing\n"
            md += "Automated tests were detected. Run them using:\n"
            md += f"```bash\n{test_cmd}\n```\n\n"

        md += "## Project Structure\n"
        md += "Detailed tree view with AI-predicted layer labels:\n\n"
        md += "```text\n"
        md += self._generate_tree_with_ai(files, layer_map)
        md += "\n```\n\n"

        md += "## Contribution Workflow\n\n"
        md += "We welcome contributions! Please follow this detailed workflow to ensure smooth collaboration:\n\n"
        md += "### 1. Find an Issue\n"
        md += "- Browse the **Issues** tab for tasks.\n"
        md += "- Look for labels like `good first issue` or `help wanted` if you are new.\n"
        md += "- Comment on the issue to assign it to yourself before starting work.\n\n"
        md += "### 2. Branching Strategy\n"
        md += "Create a new branch from `main` using one of the following prefixes:\n"
        md += "- `feat/`: For new features (e.g., `feat/user-auth`)\n"
        md += "- `fix/`: For bug fixes (e.g., `fix/login-error`)\n"
        md += "- `docs/`: For documentation changes (e.g., `docs/update-readme`)\n"
        md += "- `refactor/`: For code improvements without logic changes\n"
        md += "```bash\n"
        md += "git checkout -b feat/your-feature-name\n"
        md += "```\n\n"
        md += "### 3. Development Standards\n"
        if dev_tools:
            md += "Before committing, ensure your code meets the project standards:\n"
            if "ESLint" in dev_tools or "Prettier" in dev_tools:
                md += "- **Linting**: Run `npm run lint` to fix style issues.\n"
            if "Jest" in dev_tools or "pytest" in dev_tools:
                md += "- **Testing**: Run tests to ensure no regressions.\n"
        md += "- Keep pull requests small and focused on a single task.\n\n"
        md += "### 4. Commit Messages\n"
        md += "We follow the **Conventional Commits** specification:\n"
        md += "- `feat: add new search component`\n"
        md += "- `fix: handle null pointer exception`\n"
        md += "- `chore: update dependencies`\n"
        md += "```bash\n"
        md += "git commit -m 'feat: implement amazing feature'\n"
        md += "```\n\n"
        md += "### 5. Pull Request Process\n"
        md += "1. Push your branch: `git push origin feat/your-feature-name`\n"
        md += "2. Open a Pull Request against the `main` branch.\n"
        md += "3. Fill out the PR template with details about your changes.\n"
        md += "4. Wait for code review and address any feedback.\n"
        md += "5. Once approved, your changes will be merged!\n\n"

        md += "## About this Guide\n"
        md += "This documentation was automatically generated by **GitGud AI**.\n"
        md += "- **Engine:** CodeBERT (Transformer-based Language Model)\n"
        md += "- **Analysis:** Zero-shot semantic classification of file paths and content.\n"
        md += "- **Accuracy:** The 'Project DNA' and 'Layer Composition' metrics are derived from vector embeddings of your codebase, providing a mathematical approximation of your architecture.\n"

        return md

    def _get_project_dna(self, stats: dict, total: int) -> str:
        """
        Interprets the layer statistics to give a high-level project description.
        """
        backend_pct = (stats["Backend"] / total) * 100
        frontend_pct = (stats["Frontend"] / total) * 100
        ops_pct = (stats["DevOps"] / total) * 100

        if backend_pct > 50:
            return "This project is a **Backend-focused service**, likely an API or microservice. The majority of the codebase is dedicated to business logic and data handling."
        elif frontend_pct > 50:
            return "This project is a **Frontend-heavy application**, focusing on UI/UX and client-side logic."
        elif backend_pct > 30 and frontend_pct > 30:
            return "This is a balanced **Full-Stack Application**, containing significant logic for both client and server components."
        elif ops_pct > 40:
            return "This repository appears to be an **Infrastructure or Configuration** project (IaC), heavily focused on deployment and orchestration."
        else:
            return "This is a **General purpose repository**, possibly a library or a mix of various utilities."

    def _get_testing_status(self, stats: dict, total: int) -> str:
        test_pct = (stats["Testing"] / total) * 100
        if test_pct > 20:
            return "[Excellent] **Excellent Test Coverage**. A significant portion of the codebase is dedicated to testing."
        elif test_pct > 5:
            return "[Moderate] **Moderate Testing**. Tests are present but may not cover all modules."
        else:
            return "[Low] **Low Test Coverage**. Very few test files were detected. Consider adding more unit or integration tests."

    def _detect_features(self, files: list[str], stats: dict) -> dict:
        features = {}
        files_str = " ".join(files).lower()

        # Authentication - only if CodeBERT detected Security logic.
        if stats["Security"] > 0:
            auth_indicators = [
                "/auth/",
                "/login",
                "/register",
                "passport",
                "jwt",
                "session",
                "bcrypt",
                "strategy",
            ]
            if any(x in files_str for x in auth_indicators):
                features["Authentication"] = (
                    "Implements user authentication (Login/Signup/Session management)."
                )

        # Database - only if CodeBERT detected Backend logic.
        if stats["Backend"] > 0:
            db_indicators = [
                "schema.prisma",
                "models.py",
                "migration",
                "sequelize",
                "typeorm",
                "mongoose",
                "/db/",
                "/database/",
            ]
            if any(x in files_str for x in db_indicators):
                features["Database"] = (
                    "Includes database schema definitions or ORM models."
                )

        # API
        api_indicators = [
            "/api/",
            "controllers",
            "resolvers",
            "routes",
            "router",
            "endpoint",
        ]
        if any(x in files_str for x in api_indicators):
            features["API"] = "Exposes RESTful endpoints or GraphQL resolvers."

        # Real-time
        if any(x in files_str for x in ["socket", "websocket", "io.", "channel"]):
            features["Real-time"] = "Uses WebSockets or real-time event channels."

        # UI architecture
        if stats["Frontend"] > 0:
            ui_indicators = ["components/", "views/", ".tsx", ".jsx", ".vue", "/pages/"]
            if any(x in files_str for x in ui_indicators):
                features["UI Architecture"] = "Modular component-based user interface."

        return features

    def _detect_dev_tools(self, files: list[str]) -> set:
        tools = set()
        files_str = " ".join(files).lower()

        if "eslint" in files_str:
            tools.add("ESLint")
        if "prettier" in files_str:
            tools.add("Prettier")
        if "jest" in files_str:
            tools.add("Jest")
        if "cypress" in files_str:
            tools.add("Cypress")
        if "github/workflows" in files_str:
            tools.add("GitHub Actions")
        if "husky" in files_str:
            tools.add("Husky")
        if "tailwind" in files_str:
            tools.add("Tailwind CSS")
        if "vite" in files_str:
            tools.add("Vite")
        if "webpack" in files_str:
            tools.add("Webpack")

        return tools

    def _generate_tree_with_ai(self, files: list[str], layer_map: dict) -> str:
        tree = {}
        for f in files:
            parts = f.split("/")
            if any(p in ["node_modules", ".git", "__pycache__", "dist"] for p in parts):
                continue

            curr = tree
            for part in parts[:3]:  # Depth 3
                curr = curr.setdefault(part, {})

        lines = []

        def render(node, path_prefix="", tree_prefix=""):
            keys = sorted(node.keys())
            for i, key in enumerate(keys):
                is_last = i == len(keys) - 1
                full_path = f"{path_prefix}/{key}".strip("/")

                # Get the layer from the map, or predict it (e.g., for folders
                # that never went through the per-file pass).
                layer = layer_map.get(full_path)
                if layer is None:
                    layer = classifier.predict(full_path)["label"]
                label = f" [{layer}]" if layer != "Generic" else ""

                connector = "└── " if is_last else "├── "
                lines.append(f"{tree_prefix}{connector}{key}{label}")

                if node[key]:
                    render(
                        node[key],
                        full_path,
                        tree_prefix + ("    " if is_last else "│   "),
                    )

        render(tree)
        return "\n".join(lines[:60])


# Create the global instances (note: this loads the model at import time).
classifier = CodeClassifier()
guide_generator = GuideGenerator()
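
For reference, a minimal usage sketch (not part of this commit). It assumes the package layout makes `app.predictor` importable and that the CodeBERT weights can be downloaded on first import:

# Hypothetical example, for illustration only.
from app.predictor import classifier, guide_generator

sample_files = [
    "src/components/Button.tsx",
    "src/services/user.service.ts",
    "test/user.spec.ts",
    "Dockerfile",
]

# Classify one file; rule-based hits return confidence 1.0 and no embedding.
print(classifier.predict("src/services/user.service.ts"))

# Build the full markdown developer guide from a repository file listing.
print(guide_generator.generate_markdown("my-repo", sample_files))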