"""Data preprocessing and feature engineering""" import pandas as pd import numpy as np from miner_specs import MINER_SPECS, ELECTRICITY_RATES from fetch_blockchain_data import get_days_since_halving from electricity_prices import get_electricity_rate def engineer_features(blockchain_df): """Engineer features from blockchain data - keep it simple""" df = blockchain_df.copy().sort_values('date').reset_index(drop=True) df['date'] = pd.to_datetime(df['date']) # Just return as is, we'll select features later return df def prepare_miner_features( blockchain_df, miner_name, miner_price, region="texas", machine_hashrate=None, power=None, efficiency=None, electricity_rate=None, release_date=None, ): """ Add miner-specific features - EXACTLY 14 features. Now uses user-specified: - machine_price - machine_hashrate - power - efficiency - electricity_rate If any of these are None, we fall back to MINER_SPECS / region, but for your app you will always pass explicit values. """ df = blockchain_df.copy() specs = MINER_SPECS[miner_name] # Keep only these columns from blockchain data df = df[[ "date", "bitcoin_price", "difficulty", "fees", "hashrate", "revenue", "block_reward", ]].copy() df["date"] = pd.to_datetime(df["date"]) # ---- user-provided constants (same value for all 30 days) ---- df["machine_price"] = float(miner_price) if machine_hashrate is not None: df["machine_hashrate"] = float(machine_hashrate) else: df["machine_hashrate"] = specs["hashrate"] if power is not None: df["power"] = float(power) else: df["power"] = specs["power"] if efficiency is not None: df["efficiency"] = float(efficiency) else: df["efficiency"] = specs["efficiency"] # ----- Age of the machine in days ----- # If the user supplied a release_date, use that; otherwise fall back # to the miner spec's release_date (S19 Pro default). if release_date is not None and str(release_date).strip() != "": release_str = str(release_date).strip() else: release_str = specs.get("release_date", "2020-01-01") release_dt = pd.to_datetime(release_str) df["age_days"] = (df["date"] - release_dt).dt.days # Days since halving df["days_since_halving"] = df["date"].apply(get_days_since_halving) # Revenue potential (same as your original code) hashrate_hs = df["machine_hashrate"] * 1e12 btc_per_day = ( (hashrate_hs * 86400) / (df["difficulty"] * (2**32)) * (df["block_reward"] + (df["fees"] / 144)) ) df["Revenue_Potential"] = btc_per_day * df["bitcoin_price"] # ---- electricity_rate constant across all rows ---- if electricity_rate is not None: df['electricity_rate'] = float(electricity_rate) else: df['electricity_rate'] = df['date'].dt.date.apply( lambda day: get_electricity_rate(region, day) ) return df def get_latest_sequence( blockchain_df, miner_name, miner_price, region="texas", window_size=30, machine_hashrate=None, power=None, efficiency=None, electricity_rate=None, release_date=None ): """ Get the most recent sequence for prediction - EXACTLY 14 features in CORRECT ORDER. Now also accepts user-specified: - machine_hashrate - power - efficiency - electricity_rate """ df_features = engineer_features(blockchain_df) df_miner = prepare_miner_features( df_features, miner_name, miner_price, region, machine_hashrate=machine_hashrate, power=power, efficiency=efficiency, electricity_rate=electricity_rate, release_date=release_date, ) # CRITICAL: This order MUST match your training data CSV exactly! feature_cols = [ "bitcoin_price", # 1 "difficulty", # 2 "fees", # 3 "hashrate", # 4 "revenue", # 5 "machine_price", # 6 "machine_hashrate", # 7 "power", # 8 "efficiency", # 9 "block_reward", # 10 "age_days", # 11 "days_since_halving", # 12 "Revenue_Potential", # 13 "electricity_rate", # 14 ] # --------------------------------------------------------- # Ensure we only use rows with complete core blockchain data. # This avoids including a "today" row where, e.g., difficulty or # hashrate are still NaN while price is already updated. # --------------------------------------------------------- df_miner = df_miner.sort_values("date").reset_index(drop=True) core_cols = ["bitcoin_price", "difficulty", "fees", "hashrate", "revenue"] df_miner = df_miner.dropna(subset=core_cols) if len(df_miner) < window_size: raise ValueError( f"Not enough data to build a {window_size}-day window after dropping NaNs. " f"Have {len(df_miner)} rows, need at least {window_size}." ) # Take the last `window_size` fully-populated days df_window = df_miner.tail(window_size).reset_index(drop=True) sequence = df_window[feature_cols].values.astype(float) pred_date = df_window["date"].iloc[-1] return sequence, df_window, pred_date if __name__ == "__main__": from fetch_blockchain_data import get_latest_blockchain_data print("\n" + "="*80) print("TESTING PREPROCESSING PIPELINE") print("="*80 + "\n") # Fetch blockchain data print("📡 Fetching blockchain data...") blockchain_df = get_latest_blockchain_data(days=90) if blockchain_df is None: print("❌ Failed to fetch blockchain data") exit(1) print(f"✅ Fetched {len(blockchain_df)} days of data\n") # Test configuration miner_name = 's19pro' miner_price = 2500 region = 'texas' window_size = 30 print("⚙️ Test Configuration:") print(f" Miner: {MINER_SPECS[miner_name]['full_name']}") print(f" Price: ${miner_price:,}") print(f" Region: {region.title()}") print(f" Window: {window_size} days") print(f" Electricity: ${ELECTRICITY_RATES[region]}/kWh\n") # Step 1: Engineer features print("="*80) print("STEP 1: ENGINEER FEATURES") print("="*80) df_engineered = engineer_features(blockchain_df) print(f"✅ Engineered features") print(f" Shape: {df_engineered.shape}") print(f" Columns: {list(df_engineered.columns)}\n") print("First 3 rows:") print(df_engineered.head(3)) print("\nLast 3 rows:") print(df_engineered.tail(3)) # Step 2: Prepare miner features print("\n" + "="*80) print("STEP 2: PREPARE MINER FEATURES") print("="*80) df_miner = prepare_miner_features(df_engineered, miner_name, miner_price, region) print(f"✅ Added miner-specific features") print(f" Shape: {df_miner.shape}") print(f" Columns: {list(df_miner.columns)}\n") print("Miner-specific values (constant across all days):") print(f" machine_hashrate: {df_miner['machine_hashrate'].iloc[0]} TH/s") print(f" power: {df_miner['power'].iloc[0]} W") print(f" efficiency: {df_miner['efficiency'].iloc[0]} W/TH") print(f" machine_price: ${df_miner['machine_price'].iloc[0]:,.2f}") print(f" electricity_rate: ${df_miner['electricity_rate'].iloc[0]:.4f}/kWh") print("\nDynamic values (change over time):") print(f" age_days: {df_miner['age_days'].iloc[0]} → {df_miner['age_days'].iloc[-1]} days") print(f" days_since_halving: {df_miner['days_since_halving'].iloc[0]} → {df_miner['days_since_halving'].iloc[-1]} days") print(f" Revenue_Potential: ${df_miner['Revenue_Potential'].iloc[0]:.2f} → ${df_miner['Revenue_Potential'].iloc[-1]:.2f}/day") print("\nFirst 3 rows:") print(df_miner.head(3)) print("\nLast 3 rows:") print(df_miner.tail(3)) # Step 3: Get latest sequence print("\n" + "="*80) print("STEP 3: GET LATEST SEQUENCE") print("="*80) sequence, feature_cols, latest_date = get_latest_sequence(blockchain_df, miner_name, miner_price, region, window_size) print(f"✅ Created sequence for model") print(f" Shape: {sequence.shape}") print(f" Expected: ({window_size}, 14)") print(f" Latest date: {latest_date.strftime('%Y-%m-%d')}\n") print("14 Features (in order):") for i, col in enumerate(feature_cols, 1): print(f" {i:2d}. {col:25s} → First: {sequence[0, i-1]:>15.2f} Last: {sequence[-1, i-1]:>15.2f}") print("\n" + "="*80) print("SEQUENCE STATISTICS") print("="*80) print("\nFirst day in sequence:") for i, col in enumerate(feature_cols): print(f" {col:25s} = {sequence[0, i]:>15.2f}") print(f"\nLast day in sequence (for prediction on {latest_date.strftime('%Y-%m-%d')}):") for i, col in enumerate(feature_cols): print(f" {col:25s} = {sequence[-1, i]:>15.2f}") # Show some statistics print("\n" + "="*80) print("FEATURE RANGES") print("="*80) for i, col in enumerate(feature_cols): min_val = sequence[:, i].min() max_val = sequence[:, i].max() mean_val = sequence[:, i].mean() print(f"{col:25s} → Min: {min_val:>12.2f} Max: {max_val:>12.2f} Mean: {mean_val:>12.2f}") print("\n" + "="*80) print("✅ PREPROCESSING PIPELINE TEST COMPLETE") print("="*80 + "\n")