Spaces:
Running
Running
| """Data preprocessing and feature engineering""" | |
| import pandas as pd | |
| import numpy as np | |
| from miner_specs import MINER_SPECS, ELECTRICITY_RATES | |
| from fetch_blockchain_data import get_days_since_halving | |
| from electricity_prices import get_electricity_rate | |
def engineer_features(blockchain_df):
    """Return a chronologically ordered copy of the blockchain data.

    No derived features are added here; downstream code picks and builds
    the columns it needs.

    Args:
        blockchain_df: DataFrame with a ``date`` column whose values
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame (the input is left untouched) with ``date``
        converted to datetime, rows sorted by ascending ``date`` and a
        fresh 0..n-1 index.
    """
    df = blockchain_df.copy()
    # Parse before sorting: ordering raw strings is lexicographic, which is
    # only chronological for zero-padded ISO dates.
    df["date"] = pd.to_datetime(df["date"])
    return df.sort_values("date").reset_index(drop=True)
def prepare_miner_features(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
    release_date=None,
):
    """Build the per-day feature frame for one miner.

    The result keeps ``date`` plus six blockchain columns and adds eight
    miner columns, i.e. 14 numeric features alongside ``date``.

    Args:
        blockchain_df: DataFrame containing at least ``date``,
            ``bitcoin_price``, ``difficulty``, ``fees``, ``hashrate``,
            ``revenue`` and ``block_reward``.
        miner_name: Key into ``MINER_SPECS``. Only consulted when one of
            the optional values below is missing.
        miner_price: Machine price; stored as float in ``machine_price``.
        region: Passed to ``get_electricity_rate`` when
            ``electricity_rate`` is not given.
        machine_hashrate: Miner hashrate in TH/s. Falls back to
            ``MINER_SPECS[miner_name]["hashrate"]``.
        power: Falls back to ``MINER_SPECS[miner_name]["power"]``.
        efficiency: Falls back to ``MINER_SPECS[miner_name]["efficiency"]``.
        electricity_rate: Constant rate for every row. When ``None`` the
            rate is looked up per day via ``get_electricity_rate``.
        release_date: Anything ``pd.to_datetime`` accepts. ``None`` or a
            blank string falls back to the spec's ``release_date`` and
            then to ``"2020-01-01"``.

    Returns:
        A new DataFrame; ``blockchain_df`` is not modified.

    Raises:
        KeyError: If a required column is missing, or if a fallback value
            is needed and ``miner_name`` (or the spec key) is not present
            in ``MINER_SPECS``.
    """

    def spec_fallback(key):
        # Resolved lazily so a miner that is absent from MINER_SPECS still
        # works when the caller supplies every value explicitly.
        return MINER_SPECS[miner_name][key]

    # Keep only the columns the model consumes (this also copies the data).
    df = blockchain_df[
        [
            "date",
            "bitcoin_price",
            "difficulty",
            "fees",
            "hashrate",
            "revenue",
            "block_reward",
        ]
    ].copy()
    df["date"] = pd.to_datetime(df["date"])

    # ---- Per-miner constants (the same value on every row) ----
    df["machine_price"] = float(miner_price)
    df["machine_hashrate"] = (
        float(machine_hashrate)
        if machine_hashrate is not None
        else spec_fallback("hashrate")
    )
    df["power"] = float(power) if power is not None else spec_fallback("power")
    df["efficiency"] = (
        float(efficiency) if efficiency is not None else spec_fallback("efficiency")
    )

    # ---- Age of the machine in days ----
    # A blank / whitespace-only release_date is treated like None.
    release_text = "" if release_date is None else str(release_date).strip()
    if not release_text:
        release_text = MINER_SPECS[miner_name].get("release_date", "2020-01-01")
    df["age_days"] = (df["date"] - pd.to_datetime(release_text)).dt.days

    # ---- Days since halving ----
    df["days_since_halving"] = df["date"].apply(get_days_since_halving)

    # ---- Revenue potential (value of the expected daily mining output) ----
    # TH/s -> H/s, times 86400 seconds per day, divided by the expected
    # number of hashes per block (difficulty * 2**32).
    # NOTE(review): fees / 144 presumably spreads a daily fee total over the
    # nominal 144 blocks per day -- confirm against the data source.
    hashes_per_second = df["machine_hashrate"] * 1e12
    btc_per_day = (
        (hashes_per_second * 86400)
        / (df["difficulty"] * (2**32))
        * (df["block_reward"] + (df["fees"] / 144))
    )
    df["Revenue_Potential"] = btc_per_day * df["bitcoin_price"]

    # ---- Electricity rate ----
    if electricity_rate is not None:
        df["electricity_rate"] = float(electricity_rate)
    else:
        df["electricity_rate"] = df["date"].dt.date.apply(
            lambda day: get_electricity_rate(region, day)
        )

    return df
def get_latest_sequence(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    window_size=30,
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
    release_date=None
):
    """Build the most recent model input window (14 features, fixed order).

    Args:
        blockchain_df: Raw blockchain DataFrame (see ``engineer_features``).
        miner_name, miner_price, region: Forwarded to
            ``prepare_miner_features``.
        window_size: Number of most recent complete days to return; must
            be at least 1.
        machine_hashrate, power, efficiency, electricity_rate,
        release_date: Optional overrides forwarded to
            ``prepare_miner_features``.

    Returns:
        A 3-tuple ``(sequence, df_window, pred_date)``:

        * ``sequence`` -- float array of shape ``(window_size, 14)`` whose
          columns follow ``feature_cols`` below.
        * ``df_window`` -- the DataFrame rows the array was built from
          (includes ``date``; NOT a list of feature names).
        * ``pred_date`` -- the ``date`` of the last row in the window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, or if fewer than
            ``window_size`` complete rows remain after dropping NaNs.
    """
    # Guard first: 0 would otherwise fail later with an opaque IndexError,
    # and a negative value makes DataFrame.tail() return the wrong slice.
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size!r}."
        )

    df_features = engineer_features(blockchain_df)
    df_miner = prepare_miner_features(
        df_features,
        miner_name,
        miner_price,
        region,
        machine_hashrate=machine_hashrate,
        power=power,
        efficiency=efficiency,
        electricity_rate=electricity_rate,
        release_date=release_date,
    )

    # CRITICAL: this order MUST match the training data CSV exactly.
    feature_cols = [
        "bitcoin_price",       # 1
        "difficulty",          # 2
        "fees",                # 3
        "hashrate",            # 4
        "revenue",             # 5
        "machine_price",       # 6
        "machine_hashrate",    # 7
        "power",               # 8
        "efficiency",          # 9
        "block_reward",        # 10
        "age_days",            # 11
        "days_since_halving",  # 12
        "Revenue_Potential",   # 13
        "electricity_rate",    # 14
    ]

    # Only use rows with complete core blockchain data. This avoids
    # including a "today" row where, e.g., difficulty or hashrate are still
    # NaN while the price has already been updated.
    core_cols = ["bitcoin_price", "difficulty", "fees", "hashrate", "revenue"]
    df_miner = df_miner.sort_values("date").reset_index(drop=True)
    df_miner = df_miner.dropna(subset=core_cols)

    if len(df_miner) < window_size:
        raise ValueError(
            f"Not enough data to build a {window_size}-day window after dropping NaNs. "
            f"Have {len(df_miner)} rows, need at least {window_size}."
        )

    # Take the last `window_size` fully populated days.
    df_window = df_miner.tail(window_size).reset_index(drop=True)
    sequence = df_window[feature_cols].values.astype(float)
    pred_date = df_window["date"].iloc[-1]
    return sequence, df_window, pred_date
# Manual smoke test: fetches live blockchain data and prints every stage of
# the preprocessing pipeline. Requires the project data-fetching modules.
if __name__ == "__main__":
    from fetch_blockchain_data import get_latest_blockchain_data

    print("\n" + "="*80)
    print("TESTING PREPROCESSING PIPELINE")
    print("="*80 + "\n")

    # Fetch blockchain data
    print("📡 Fetching blockchain data...")
    blockchain_df = get_latest_blockchain_data(days=90)
    if blockchain_df is None:
        print("❌ Failed to fetch blockchain data")
        # SystemExit instead of exit(): exit() is a helper injected by the
        # `site` module and is not guaranteed to exist.
        raise SystemExit(1)
    print(f"✅ Fetched {len(blockchain_df)} days of data\n")

    # Test configuration
    miner_name = 's19pro'
    miner_price = 2500
    region = 'texas'
    window_size = 30

    # Model feature order; must mirror the list in get_latest_sequence().
    # It is cross-checked against the returned matrix in step 3.
    feature_cols = [
        "bitcoin_price",
        "difficulty",
        "fees",
        "hashrate",
        "revenue",
        "machine_price",
        "machine_hashrate",
        "power",
        "efficiency",
        "block_reward",
        "age_days",
        "days_since_halving",
        "Revenue_Potential",
        "electricity_rate",
    ]

    print("⚙️ Test Configuration:")
    print(f" Miner: {MINER_SPECS[miner_name]['full_name']}")
    print(f" Price: ${miner_price:,}")
    print(f" Region: {region.title()}")
    print(f" Window: {window_size} days")
    print(f" Electricity: ${ELECTRICITY_RATES[region]}/kWh\n")

    # Step 1: Engineer features
    print("="*80)
    print("STEP 1: ENGINEER FEATURES")
    print("="*80)
    df_engineered = engineer_features(blockchain_df)
    print("✅ Engineered features")
    print(f" Shape: {df_engineered.shape}")
    print(f" Columns: {list(df_engineered.columns)}\n")
    print("First 3 rows:")
    print(df_engineered.head(3))
    print("\nLast 3 rows:")
    print(df_engineered.tail(3))

    # Step 2: Prepare miner features
    print("\n" + "="*80)
    print("STEP 2: PREPARE MINER FEATURES")
    print("="*80)
    df_miner = prepare_miner_features(df_engineered, miner_name, miner_price, region)
    print("✅ Added miner-specific features")
    print(f" Shape: {df_miner.shape}")
    print(f" Columns: {list(df_miner.columns)}\n")
    print("Miner-specific values (constant across all days):")
    print(f" machine_hashrate: {df_miner['machine_hashrate'].iloc[0]} TH/s")
    print(f" power: {df_miner['power'].iloc[0]} W")
    print(f" efficiency: {df_miner['efficiency'].iloc[0]} W/TH")
    print(f" machine_price: ${df_miner['machine_price'].iloc[0]:,.2f}")
    print(f" electricity_rate: ${df_miner['electricity_rate'].iloc[0]:.4f}/kWh")
    print("\nDynamic values (change over time):")
    print(f" age_days: {df_miner['age_days'].iloc[0]} → {df_miner['age_days'].iloc[-1]} days")
    print(f" days_since_halving: {df_miner['days_since_halving'].iloc[0]} → {df_miner['days_since_halving'].iloc[-1]} days")
    print(f" Revenue_Potential: ${df_miner['Revenue_Potential'].iloc[0]:.2f} → ${df_miner['Revenue_Potential'].iloc[-1]:.2f}/day")
    print("\nFirst 3 rows:")
    print(df_miner.head(3))
    print("\nLast 3 rows:")
    print(df_miner.tail(3))

    # Step 3: Get latest sequence
    print("\n" + "="*80)
    print("STEP 3: GET LATEST SEQUENCE")
    print("="*80)
    # The second return value is the window DataFrame, not the feature-name
    # list, so it must not be iterated as if it were the column labels.
    sequence, df_window, latest_date = get_latest_sequence(
        blockchain_df, miner_name, miner_price, region, window_size
    )

    # Make sure the labels printed below really describe the matrix columns.
    labels_match = sequence.shape[1] == len(feature_cols) and np.allclose(
        df_window[feature_cols].values.astype(float), sequence, equal_nan=True
    )
    if not labels_match:
        print("❌ feature_cols is out of sync with get_latest_sequence()")
        raise SystemExit(1)

    print("✅ Created sequence for model")
    print(f" Shape: {sequence.shape}")
    print(f" Expected: ({window_size}, 14)")
    print(f" Latest date: {latest_date.strftime('%Y-%m-%d')}\n")
    print("14 Features (in order):")
    for i, col in enumerate(feature_cols, 1):
        print(f" {i:2d}. {col:25s} → First: {sequence[0, i-1]:>15.2f} Last: {sequence[-1, i-1]:>15.2f}")

    print("\n" + "="*80)
    print("SEQUENCE STATISTICS")
    print("="*80)
    print("\nFirst day in sequence:")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[0, i]:>15.2f}")
    print(f"\nLast day in sequence (for prediction on {latest_date.strftime('%Y-%m-%d')}):")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[-1, i]:>15.2f}")

    # Show some statistics
    print("\n" + "="*80)
    print("FEATURE RANGES")
    print("="*80)
    for i, col in enumerate(feature_cols):
        min_val = sequence[:, i].min()
        max_val = sequence[:, i].max()
        mean_val = sequence[:, i].mean()
        print(f"{col:25s} → Min: {min_val:>12.2f} Max: {max_val:>12.2f} Mean: {mean_val:>12.2f}")

    print("\n" + "="*80)
    print("✅ PREPROCESSING PIPELINE TEST COMPLETE")
    print("="*80 + "\n")