badaoui HF Staff commited on
Commit
b2f075d
·
1 Parent(s): 66be9e5
Files changed (1) hide show
  1. data.py +14 -109
data.py CHANGED
@@ -6,7 +6,6 @@ import threading
6
  import traceback
7
  import json
8
  import re
9
- import random
10
  from typing import List, Tuple, Optional, Dict
11
 
12
  # NOTE: if caching is an issue, try adding `use_listings_cache=False`
@@ -61,11 +60,6 @@ KEYS_TO_KEEP = [
61
  # HELPER FUNCTIONS
62
  # ============================================================================
63
 
64
- def generate_fake_dates(num_days: int = 7) -> List[str]:
65
- """Generate fake dates for the last N days."""
66
- today = datetime.now()
67
- return [(today - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_days)]
68
-
69
  def parse_json_field(value) -> dict:
70
  """Safely parse a JSON field that might be a string or dict."""
71
  if value is None or pd.isna(value):
@@ -106,8 +100,6 @@ def log_dataframe_link(link: str) -> str:
106
  Adds the link to the dataset in the logs, modifies it to get a clickable link and then returns the date of the
107
  report.
108
  """
109
- if link.startswith("sample_"):
110
- return "9999-99-99"
111
  logger.info(f"Reading df located at {link}")
112
  # Make sure the link starts with an http address
113
  if link.startswith("hf://"):
@@ -183,7 +175,6 @@ def get_available_dates() -> List[str]:
183
  return common_dates[:30] # Limit to last 30 days
184
 
185
  # No real dates available - log warning and return empty list
186
- # This will allow the system to fall back to sample data properly
187
  logger.warning("No common dates found between AMD and NVIDIA datasets")
188
  return []
189
 
@@ -252,15 +243,11 @@ def get_data_for_date(target_date: str) -> tuple[pd.DataFrame, str]:
252
 
253
  except Exception as e:
254
  logger.error(f"Error getting data for date {target_date}: {e}")
255
- # Return empty dataframe instead of sample data for historical functionality
256
  return pd.DataFrame(), target_date
257
 
258
 
259
- def get_historical_data(start_date: str, end_date: str, sample_data = False) -> pd.DataFrame:
260
  """Get historical data for a date range."""
261
- if sample_data:
262
- return get_fake_historical_data(start_date, end_date)
263
-
264
  try:
265
  start_dt = datetime.strptime(start_date, "%Y-%m-%d")
266
  end_dt = datetime.strptime(end_date, "%Y-%m-%d")
@@ -284,7 +271,7 @@ def get_historical_data(start_date: str, end_date: str, sample_data = False) ->
284
 
285
  except Exception as e:
286
  logger.error(f"Error getting historical data: {e}")
287
- return get_fake_historical_data(start_date, end_date)
288
 
289
 
290
  def get_distant_data() -> tuple[pd.DataFrame, str]:
@@ -316,65 +303,6 @@ def get_distant_data() -> tuple[pd.DataFrame, str]:
316
  return filtered_joined, latest_update_msg
317
 
318
 
319
- def get_sample_data() -> tuple[pd.DataFrame, str]:
320
- # Retrieve sample dataframes
321
- df_amd, _ = read_one_dataframe("sample_amd.json", "amd")
322
- df_nvidia, _ = read_one_dataframe("sample_nvidia.json", "nvidia")
323
- # Join both dataframes
324
- joined = df_amd.join(df_nvidia, rsuffix="_nvidia", lsuffix="_amd", how="outer")
325
- joined = joined[KEYS_TO_KEEP]
326
- joined.index = joined.index.str.replace("^models_", "", regex=True)
327
- # Fitler out all but important models
328
- important_models_lower = [model.lower() for model in IMPORTANT_MODELS]
329
- filtered_joined = joined[joined.index.str.lower().isin(important_models_lower)]
330
- # Prefix all model names with "sample_"
331
- filtered_joined.index = "sample_" + filtered_joined.index
332
- return filtered_joined, "sample data was loaded"
333
-
334
-
335
- def get_fake_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
336
- """Generate fake historical data for a date range when real data loading fails."""
337
- try:
338
- start_dt = datetime.strptime(start_date, "%Y-%m-%d")
339
- end_dt = datetime.strptime(end_date, "%Y-%m-%d")
340
- sample_df, _ = get_sample_data()
341
- historical_data = []
342
-
343
- # Generate data for each date
344
- current_dt = start_dt
345
- while current_dt <= end_dt:
346
- date_df = sample_df.copy()
347
- date_df['date'] = current_dt.strftime("%Y-%m-%d")
348
-
349
- # Add random variations to make it realistic
350
- for idx in date_df.index:
351
- # Vary success/skipped counts (±20%)
352
- for col in ['success_amd', 'success_nvidia', 'skipped_amd', 'skipped_nvidia']:
353
- if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
354
- val = date_df.loc[idx, col]
355
- if val > 0:
356
- date_df.loc[idx, col] = max(0, int(val * random.uniform(0.8, 1.2)))
357
-
358
- # Vary failure counts more dramatically (±50-100%)
359
- for col in ['failed_multi_no_amd', 'failed_multi_no_nvidia', 'failed_single_no_amd', 'failed_single_no_nvidia']:
360
- if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
361
- val = date_df.loc[idx, col]
362
- date_df.loc[idx, col] = max(0, int(val * random.uniform(0.5, 2.0)))
363
-
364
- historical_data.append(date_df)
365
- current_dt += timedelta(days=1)
366
-
367
- if not historical_data:
368
- return pd.DataFrame()
369
-
370
- combined_df = pd.concat(historical_data, ignore_index=False)
371
- logger.info(f"Generated fake historical data: {len(combined_df)} records from {start_date} to {end_date}")
372
- return combined_df
373
-
374
- except Exception as e:
375
- logger.error(f"Error generating fake historical data: {e}")
376
- return pd.DataFrame()
377
-
378
  def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
379
  """Find the first date when a specific test failure appeared in historical data."""
380
  if historical_df is None or historical_df.empty:
@@ -510,48 +438,25 @@ class CIResults:
510
  self.available_dates = []
511
  self.historical_df = pd.DataFrame()
512
  self.all_historical_data = pd.DataFrame() # Store all historical data at startup
513
- self.sample_data = False
514
 
515
  def load_data(self) -> None:
516
  """Load data from the data source."""
517
- # Try loading the distant data, and fall back on sample data for local tinkering
518
- try:
519
- logger.info("Loading distant data...")
520
- new_df, latest_update_msg = get_distant_data()
521
- self.latest_update_msg = latest_update_msg
522
- self.sample_data = False
523
- except Exception as e:
524
- error_msg = [
525
- "Loading data failed:",
526
- "-" * 120,
527
- traceback.format_exc(),
528
- "-" * 120,
529
- "Falling back on sample data."
530
- ]
531
- logger.error("\n".join(error_msg))
532
- self.sample_data = True
533
- new_df, latest_update_msg = get_sample_data()
534
- self.latest_update_msg = latest_update_msg
535
 
536
- # Try to get available dates
537
  try:
538
- if not self.sample_data:
539
- self.available_dates = get_available_dates()
540
- logger.info(f"Available dates: {len(self.available_dates)} dates")
541
- if self.available_dates:
542
- logger.info(f"Date range: {self.available_dates[-1]} to {self.available_dates[0]}")
543
- else:
544
- logger.warning("No available dates found")
545
- self.available_dates = []
546
  else:
547
- # Generate fake dates for sample data historical functionality
548
- self.available_dates = generate_fake_dates()
549
  except Exception as e:
550
  logger.warning(f"Failed to get available dates: {e}")
551
- if self.sample_data:
552
- self.available_dates = generate_fake_dates()
553
- else:
554
- self.available_dates = []
555
 
556
  # Update attributes
557
  self.df = new_df
@@ -587,7 +492,7 @@ class CIResults:
587
 
588
  logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
589
  start_date, end_date = self.available_dates[-1], self.available_dates[0]
590
- self.all_historical_data = get_historical_data(start_date, end_date, self.sample_data)
591
  logger.info(f"All historical data loaded: {len(self.all_historical_data)} records")
592
  except Exception as e:
593
  logger.error(f"Error loading all historical data: {e}")
 
6
  import traceback
7
  import json
8
  import re
 
9
  from typing import List, Tuple, Optional, Dict
10
 
11
  # NOTE: if caching is an issue, try adding `use_listings_cache=False`
 
60
  # HELPER FUNCTIONS
61
  # ============================================================================
62
 
 
 
 
 
 
63
  def parse_json_field(value) -> dict:
64
  """Safely parse a JSON field that might be a string or dict."""
65
  if value is None or pd.isna(value):
 
100
  Adds the link to the dataset in the logs, modifies it to get a clickable link and then returns the date of the
101
  report.
102
  """
 
 
103
  logger.info(f"Reading df located at {link}")
104
  # Make sure the link starts with an http address
105
  if link.startswith("hf://"):
 
175
  return common_dates[:30] # Limit to last 30 days
176
 
177
  # No real dates available - log warning and return empty list
 
178
  logger.warning("No common dates found between AMD and NVIDIA datasets")
179
  return []
180
 
 
243
 
244
  except Exception as e:
245
  logger.error(f"Error getting data for date {target_date}: {e}")
 
246
  return pd.DataFrame(), target_date
247
 
248
 
249
+ def get_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
250
  """Get historical data for a date range."""
 
 
 
251
  try:
252
  start_dt = datetime.strptime(start_date, "%Y-%m-%d")
253
  end_dt = datetime.strptime(end_date, "%Y-%m-%d")
 
271
 
272
  except Exception as e:
273
  logger.error(f"Error getting historical data: {e}")
274
+ return pd.DataFrame()
275
 
276
 
277
  def get_distant_data() -> tuple[pd.DataFrame, str]:
 
303
  return filtered_joined, latest_update_msg
304
 
305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
307
  """Find the first date when a specific test failure appeared in historical data."""
308
  if historical_df is None or historical_df.empty:
 
438
  self.available_dates = []
439
  self.historical_df = pd.DataFrame()
440
  self.all_historical_data = pd.DataFrame() # Store all historical data at startup
 
441
 
442
  def load_data(self) -> None:
443
  """Load data from the data source."""
444
+ logger.info("Loading distant data...")
445
+ new_df, latest_update_msg = get_distant_data()
446
+ self.latest_update_msg = latest_update_msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
+ # Get available dates
449
  try:
450
+ self.available_dates = get_available_dates()
451
+ logger.info(f"Available dates: {len(self.available_dates)} dates")
452
+ if self.available_dates:
453
+ logger.info(f"Date range: {self.available_dates[-1]} to {self.available_dates[0]}")
 
 
 
 
454
  else:
455
+ logger.warning("No available dates found")
456
+ self.available_dates = []
457
  except Exception as e:
458
  logger.warning(f"Failed to get available dates: {e}")
459
+ self.available_dates = []
 
 
 
460
 
461
  # Update attributes
462
  self.df = new_df
 
492
 
493
  logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
494
  start_date, end_date = self.available_dates[-1], self.available_dates[0]
495
+ self.all_historical_data = get_historical_data(start_date, end_date)
496
  logger.info(f"All historical data loaded: {len(self.all_historical_data)} records")
497
  except Exception as e:
498
  logger.error(f"Error loading all historical data: {e}")