ydshieh (HF Staff) committed
Commit e28e100 · verified · 1 parent: b807b23

Update app.py

Files changed (1): app.py (+70 -22)
app.py CHANGED

@@ -2,7 +2,6 @@ import json
 import os
 import re
 import sys
-from functools import lru_cache
 from typing import List, Optional, Tuple, Dict

 print("="*60)
@@ -84,14 +83,18 @@ MAX_ROWS = 200
 HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
 API = HfApi(token=HF_TOKEN)

+# Smart cache: stores PR -> list of file paths
+# Fetches from remote when specific data not found
+_pr_files_cache: Dict[str, List[str]] = {}

-@lru_cache(maxsize=128)
-def _list_collection_files(pr_number: str) -> Tuple[str, ...]:
+
+def _fetch_files_for_pr(pr_number: str) -> List[str]:
     """
-    Return the `failure_summary.json` paths stored for a specific PR.
+    Fetch all failure_summary.json files for a given PR from the remote dataset.
+    Returns a list of file paths.
     """
     prefix = f"pr-{pr_number}"
-    print(f"DEBUG: Looking for files with prefix: {prefix}")
+    print(f"DEBUG: Fetching files for PR {pr_number} with prefix: {prefix}")
     print(f"DEBUG: Dataset ID: {DATASET_ID}")

     try:
@@ -105,12 +108,12 @@ def _list_collection_files(pr_number: str) -> Tuple[str, ...]:
         print("DEBUG: Successfully called list_repo_tree")
     except HfHubHTTPError as error:
         print(f"ERROR: Failed to list repo tree: {error}")
-        return tuple()
+        return []
     except Exception as error:
         print(f"ERROR: Unexpected error in list_repo_tree: {error}")
         import traceback
         traceback.print_exc()
-        return tuple()
+        return []

     files = []
     matching_paths = []
@@ -122,7 +125,7 @@ def _list_collection_files(pr_number: str) -> Tuple[str, ...]:
             entry_type = getattr(entry, "type", type(entry).__name__)
             entry_path = getattr(entry, "path", str(entry))

-            # Debug: show all entries
+            # Debug: show first few entries
             if len(all_entries) <= 10:
                 print(f"DEBUG: Entry {len(all_entries)}: {entry_path} (type: {entry_type})")

@@ -142,17 +145,68 @@ def _list_collection_files(pr_number: str) -> Tuple[str, ...]:

         if matching_paths and len(files) == 0:
             print(f"DEBUG: Sample matching paths (first 5): {matching_paths[:5]}")
-            # Check if we're only getting folders
-            folder_count = sum(1 for p in matching_paths if "RepoFolder" in str(type(p)))
-            print(f"DEBUG: Folders in matching paths: {folder_count}")

     except Exception as error:
         print(f"ERROR: Error processing entries: {error}")
         import traceback
         traceback.print_exc()
-        return tuple()
+        return []
+
+    return files
+
+
+def _extract_commit_from_path(path: str) -> str:
+    """Extract commit SHA from file path."""
+    parts = path.split("/")
+    if len(parts) >= 2 and parts[1].startswith("sha-"):
+        return parts[1][len("sha-") :]
+    return "unknown"
+
+
+def _get_files_for_pr_and_sha(pr_number: str, sha: str = "") -> List[str]:
+    """
+    Get files for a PR, with smart caching.
+
+    Strategy:
+    1. If PR not in cache, fetch from remote and cache it
+    2. If PR in cache but specific SHA not found, re-fetch and update cache
+    3. Return matching files
+
+    Args:
+        pr_number: The PR number to query
+        sha: Optional commit SHA to filter for
+
+    Returns:
+        List of file paths for this PR (optionally filtered by SHA)
+    """
+    sha_lower = sha.lower() if sha else ""
+
+    # First check: Do we have this PR in cache?
+    if pr_number not in _pr_files_cache:
+        print(f"DEBUG: PR {pr_number} not in cache, fetching from remote...")
+        files = _fetch_files_for_pr(pr_number)
+        _pr_files_cache[pr_number] = files
+        print(f"DEBUG: Cached {len(files)} files for PR {pr_number}")
+    else:
+        files = _pr_files_cache[pr_number]
+        print(f"DEBUG: PR {pr_number} found in cache with {len(files)} files")
+
+    # Second check: If SHA specified, do we have it in cache?
+    if sha:
+        sha_found = False
+        for file_path in files:
+            commit = _extract_commit_from_path(file_path)
+            if commit.lower().startswith(sha_lower) or sha_lower.startswith(commit.lower()):
+                sha_found = True
+                break
+
+        if not sha_found:
+            print(f"DEBUG: SHA {sha} not found in cache for PR {pr_number}, re-fetching...")
+            files = _fetch_files_for_pr(pr_number)
+            _pr_files_cache[pr_number] = files
+            print(f"DEBUG: Updated cache with {len(files)} files for PR {pr_number}")

-    return tuple(files)
+    return files


 def _load_payload(path: str) -> Optional[dict]:
@@ -174,13 +228,6 @@ def _load_payload(path: str) -> Optional[dict]:
     return None


-def _extract_commit_from_path(path: str) -> str:
-    parts = path.split("/")
-    if len(parts) >= 2 and parts[1].startswith("sha-"):
-        return parts[1][len("sha-") :]
-    return "unknown"
-
-
 def _filter_records(repo: str, pr: str, sha: str) -> List[dict]:
     repo = repo.strip().lower()
     pr = pr.strip()
@@ -191,7 +238,8 @@ def _filter_records(repo: str, pr: str, sha: str) -> List[dict]:
     if not pr:
         return []

-    file_paths = _list_collection_files(pr)
+    # Use smart cache that auto-fetches if needed
+    file_paths = _get_files_for_pr_and_sha(pr, sha)
     print(f"DEBUG: Found {len(file_paths)} file paths")

     records: List[dict] = []
@@ -520,7 +568,7 @@ Files are organized as `pr-{{PR}}/sha-{{COMMIT}}/failure_summary.json`


 def refresh_dataset() -> str:
-    _list_collection_files.cache_clear()
+    _pr_files_cache.clear()
     return "✅ Cleared cached manifest. Data will be reloaded on next search."

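
For readers skimming the diff: the change swaps the lru_cache on _list_collection_files for a manual per-PR dict cache, so a stale entry can be re-fetched when a requested commit SHA is not yet cached, while refresh_dataset() still clears everything. A minimal, self-contained sketch of that pattern is below; the fetch_remote stub and the sample path are hypothetical placeholders, not code from app.py, and the SHA check is simplified to a substring test (app.py matches SHAs by prefix in either direction).

from typing import Dict, List

def fetch_remote(pr_number: str) -> List[str]:
    # Hypothetical stand-in for _fetch_files_for_pr; app.py lists the dataset repo tree here.
    return [f"pr-{pr_number}/sha-abc123/failure_summary.json"]

_cache: Dict[str, List[str]] = {}

def get_files(pr_number: str, sha: str = "") -> List[str]:
    # Fetch and cache on the first request for this PR.
    if pr_number not in _cache:
        _cache[pr_number] = fetch_remote(pr_number)
    files = _cache[pr_number]
    # If a specific SHA was asked for but is absent, the cached listing may be stale: re-fetch once.
    if sha and not any(f"/sha-{sha}" in path for path in files):
        files = fetch_remote(pr_number)
        _cache[pr_number] = files
    return files

print(get_files("1234"))   # first call fetches from the "remote"
print(get_files("1234"))   # second call is served from the cache
_cache.clear()             # equivalent of refresh_dataset() in app.py

Compared with lru_cache, this makes the staleness check explicit and lets a single PR entry be refreshed when a new commit appears, without discarding the rest of the cache.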