Tcid

Sleeping

App Files Files Community

badaoui HF Staff committed on about 1 month ago

Commit

b2f075d

1 Parent(s): 66be9e5

fix

Browse files

Files changed (1) hide show

data.py +14 -109

data.py CHANGED Viewed

@@ -6,7 +6,6 @@ import threading
 import traceback
 import json
 import re
-import random
 from typing import List, Tuple, Optional, Dict
 # NOTE: if caching is an issue, try adding `use_listings_cache=False`
@@ -61,11 +60,6 @@ KEYS_TO_KEEP = [
 # HELPER FUNCTIONS
 # ============================================================================
-def generate_fake_dates(num_days: int = 7) -> List[str]:
-    """Generate fake dates for the last N days."""
-    today = datetime.now()
-    return [(today - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_days)]
 def parse_json_field(value) -> dict:
     """Safely parse a JSON field that might be a string or dict."""
     if value is None or pd.isna(value):
@@ -106,8 +100,6 @@ def log_dataframe_link(link: str) -> str:
     Adds the link to the dataset in the logs, modifies it to get a clickable link and then returns the date of the
     report.
     """
-    if link.startswith("sample_"):
-        return "9999-99-99"
     logger.info(f"Reading df located at {link}")
     # Make sure the link starts with an http address
     if link.startswith("hf://"):
@@ -183,7 +175,6 @@ def get_available_dates() -> List[str]:
             return common_dates[:30]  # Limit to last 30 days
         # No real dates available - log warning and return empty list
-        # This will allow the system to fall back to sample data properly
         logger.warning("No common dates found between AMD and NVIDIA datasets")
         return []
@@ -252,15 +243,11 @@ def get_data_for_date(target_date: str) -> tuple[pd.DataFrame, str]:
     except Exception as e:
         logger.error(f"Error getting data for date {target_date}: {e}")
-        # Return empty dataframe instead of sample data for historical functionality
         return pd.DataFrame(), target_date
-def get_historical_data(start_date: str, end_date: str, sample_data = False) -> pd.DataFrame:
     """Get historical data for a date range."""
-    if sample_data:
-        return get_fake_historical_data(start_date, end_date)
     try:
         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
@@ -284,7 +271,7 @@ def get_historical_data(start_date: str, end_date: str, sample_data = False) ->
     except Exception as e:
         logger.error(f"Error getting historical data: {e}")
-        return get_fake_historical_data(start_date, end_date)
 def get_distant_data() -> tuple[pd.DataFrame, str]:
@@ -316,65 +303,6 @@ def get_distant_data() -> tuple[pd.DataFrame, str]:
     return filtered_joined, latest_update_msg
-def get_sample_data() -> tuple[pd.DataFrame, str]:
-    # Retrieve sample dataframes
-    df_amd, _ = read_one_dataframe("sample_amd.json", "amd")
-    df_nvidia, _ = read_one_dataframe("sample_nvidia.json", "nvidia")
-    # Join both dataframes
-    joined = df_amd.join(df_nvidia, rsuffix="_nvidia", lsuffix="_amd", how="outer")
-    joined = joined[KEYS_TO_KEEP]
-    joined.index = joined.index.str.replace("^models_", "", regex=True)
-    # Fitler out all but important models
-    important_models_lower = [model.lower() for model in IMPORTANT_MODELS]
-    filtered_joined = joined[joined.index.str.lower().isin(important_models_lower)]
-    # Prefix all model names with "sample_"
-    filtered_joined.index = "sample_" + filtered_joined.index
-    return filtered_joined, "sample data was loaded"
-def get_fake_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
-    """Generate fake historical data for a date range when real data loading fails."""
-    try:
-        start_dt = datetime.strptime(start_date, "%Y-%m-%d")
-        end_dt = datetime.strptime(end_date, "%Y-%m-%d")
-        sample_df, _ = get_sample_data()
-        historical_data = []
-        # Generate data for each date
-        current_dt = start_dt
-        while current_dt <= end_dt:
-            date_df = sample_df.copy()
-            date_df['date'] = current_dt.strftime("%Y-%m-%d")
-            # Add random variations to make it realistic
-            for idx in date_df.index:
-                # Vary success/skipped counts (±20%)
-                for col in ['success_amd', 'success_nvidia', 'skipped_amd', 'skipped_nvidia']:
-                    if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
-                        val = date_df.loc[idx, col]
-                        if val > 0:
-                            date_df.loc[idx, col] = max(0, int(val * random.uniform(0.8, 1.2)))
-                # Vary failure counts more dramatically (±50-100%)
-                for col in ['failed_multi_no_amd', 'failed_multi_no_nvidia', 'failed_single_no_amd', 'failed_single_no_nvidia']:
-                    if col in date_df.columns and pd.notna(date_df.loc[idx, col]):
-                        val = date_df.loc[idx, col]
-                        date_df.loc[idx, col] = max(0, int(val * random.uniform(0.5, 2.0)))
-            historical_data.append(date_df)
-            current_dt += timedelta(days=1)
-        if not historical_data:
-            return pd.DataFrame()
-        combined_df = pd.concat(historical_data, ignore_index=False)
-        logger.info(f"Generated fake historical data: {len(combined_df)} records from {start_date} to {end_date}")
-        return combined_df
-    except Exception as e:
-        logger.error(f"Error generating fake historical data: {e}")
-        return pd.DataFrame()
 def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
     """Find the first date when a specific test failure appeared in historical data."""
     if historical_df is None or historical_df.empty:
@@ -510,48 +438,25 @@ class CIResults:
         self.available_dates = []
         self.historical_df = pd.DataFrame()
         self.all_historical_data = pd.DataFrame()  # Store all historical data at startup
-        self.sample_data = False
     def load_data(self) -> None:
         """Load data from the data source."""
-        # Try loading the distant data, and fall back on sample data for local tinkering
-        try:
-            logger.info("Loading distant data...")
-            new_df, latest_update_msg = get_distant_data()
-            self.latest_update_msg = latest_update_msg
-            self.sample_data = False
-        except Exception as e:
-            error_msg = [
-                "Loading data failed:",
-                "-" * 120,
-                traceback.format_exc(),
-                "-" * 120,
-                "Falling back on sample data."
-            ]
-            logger.error("\n".join(error_msg))
-            self.sample_data = True
-            new_df, latest_update_msg = get_sample_data()
-            self.latest_update_msg = latest_update_msg
-        # Try to get available dates
         try:
-            if not self.sample_data:
-                self.available_dates = get_available_dates()
-                logger.info(f"Available dates: {len(self.available_dates)} dates")
-                if self.available_dates:
-                    logger.info(f"Date range: {self.available_dates[-1]} to {self.available_dates[0]}")
-                else:
-                    logger.warning("No available dates found")
-                    self.available_dates = []
             else:
-                # Generate fake dates for sample data historical functionality
-                self.available_dates = generate_fake_dates()
         except Exception as e:
             logger.warning(f"Failed to get available dates: {e}")
-            if self.sample_data:
-                self.available_dates = generate_fake_dates()
-            else:
-                self.available_dates = []
         # Update attributes
         self.df = new_df
@@ -587,7 +492,7 @@ class CIResults:
             logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
             start_date, end_date = self.available_dates[-1], self.available_dates[0]
-            self.all_historical_data = get_historical_data(start_date, end_date, self.sample_data)
             logger.info(f"All historical data loaded: {len(self.all_historical_data)} records")
         except Exception as e:
             logger.error(f"Error loading all historical data: {e}")

 import traceback
 import json
 import re
 from typing import List, Tuple, Optional, Dict
 # NOTE: if caching is an issue, try adding `use_listings_cache=False`
 # HELPER FUNCTIONS
 # ============================================================================
 def parse_json_field(value) -> dict:
     """Safely parse a JSON field that might be a string or dict."""
     if value is None or pd.isna(value):
     Adds the link to the dataset in the logs, modifies it to get a clickable link and then returns the date of the
     report.
     """
     logger.info(f"Reading df located at {link}")
     # Make sure the link starts with an http address
     if link.startswith("hf://"):
             return common_dates[:30]  # Limit to last 30 days
         # No real dates available - log warning and return empty list
         logger.warning("No common dates found between AMD and NVIDIA datasets")
         return []
     except Exception as e:
         logger.error(f"Error getting data for date {target_date}: {e}")
         return pd.DataFrame(), target_date
+def get_historical_data(start_date: str, end_date: str) -> pd.DataFrame:
     """Get historical data for a date range."""
     try:
         start_dt = datetime.strptime(start_date, "%Y-%m-%d")
         end_dt = datetime.strptime(end_date, "%Y-%m-%d")
     except Exception as e:
         logger.error(f"Error getting historical data: {e}")
+        return pd.DataFrame()
 def get_distant_data() -> tuple[pd.DataFrame, str]:
     return filtered_joined, latest_update_msg
 def find_failure_first_seen(historical_df: pd.DataFrame, model_name: str, test_name: str, device: str, gpu_type: str) -> Optional[str]:
     """Find the first date when a specific test failure appeared in historical data."""
     if historical_df is None or historical_df.empty:
         self.available_dates = []
         self.historical_df = pd.DataFrame()
         self.all_historical_data = pd.DataFrame()  # Store all historical data at startup
     def load_data(self) -> None:
         """Load data from the data source."""
+        logger.info("Loading distant data...")
+        new_df, latest_update_msg = get_distant_data()
+        self.latest_update_msg = latest_update_msg
+        # Get available dates
         try:
+            self.available_dates = get_available_dates()
+            logger.info(f"Available dates: {len(self.available_dates)} dates")
+            if self.available_dates:
+                logger.info(f"Date range: {self.available_dates[-1]} to {self.available_dates[0]}")
             else:
+                logger.warning("No available dates found")
+                self.available_dates = []
         except Exception as e:
             logger.warning(f"Failed to get available dates: {e}")
+            self.available_dates = []
         # Update attributes
         self.df = new_df
             logger.info(f"Loading all historical data for {len(self.available_dates)} dates...")
             start_date, end_date = self.available_dates[-1], self.available_dates[0]
+            self.all_historical_data = get_historical_data(start_date, end_date)
             logger.info(f"All historical data loaded: {len(self.all_historical_data)} records")
         except Exception as e:
             logger.error(f"Error loading all historical data: {e}")