Spaces:
Running
Running
File size: 7,781 Bytes
f481275 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
"""Data preprocessing and feature engineering"""
import pandas as pd
import numpy as np
from miner_specs import MINER_SPECS, ELECTRICITY_RATES
from fetch_blockchain_data import get_days_since_halving
from electricity_prices import get_electricity_rate
def engineer_features(blockchain_df):
    """Return a chronologically sorted copy of the blockchain data.

    The ``date`` column is converted to datetimes *before* sorting so rows
    are ordered by calendar date even when the incoming values are strings
    (a plain string sort would place ``'10/1/2023'`` ahead of ``'9/1/2023'``).
    No extra features are derived here; feature selection happens later.

    Args:
        blockchain_df: DataFrame with a ``date`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame sorted ascending by ``date`` with a fresh
        ``0..n-1`` index. The input frame is not modified.
    """
    df = blockchain_df.copy()
    # Parse first: sorting must compare timestamps, not raw strings.
    df['date'] = pd.to_datetime(df['date'])
    return df.sort_values('date').reset_index(drop=True)
def prepare_miner_features(blockchain_df, miner_name, miner_price, region='texas'):
    """Build the per-day feature table for a single miner model.

    Keeps seven blockchain columns (including ``date``) and appends eight
    miner/market columns, so the result has ``date`` plus 14 feature columns.

    Args:
        blockchain_df: DataFrame containing ``date``, ``bitcoin_price``,
            ``difficulty``, ``fees``, ``hashrate``, ``revenue`` and
            ``block_reward``.
        miner_name: Key into ``MINER_SPECS``.
        miner_price: Machine price, broadcast to every row.
        region: Region passed to ``get_electricity_rate`` for each day.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        KeyError: If ``miner_name`` is not in ``MINER_SPECS`` or a required
            column is missing from ``blockchain_df``.
    """
    miner = MINER_SPECS[miner_name]

    # Restrict the blockchain data to the columns used downstream.
    base_columns = [
        'date', 'bitcoin_price', 'difficulty', 'fees',
        'hashrate', 'revenue', 'block_reward',
    ]
    features = blockchain_df[base_columns].copy()
    features['date'] = pd.to_datetime(features['date'])

    # Constant per-miner attributes, repeated on every row.
    features['machine_price'] = miner_price
    features['machine_hashrate'] = miner['hashrate']
    features['power'] = miner['power']
    features['efficiency'] = miner['efficiency']

    # Whole days elapsed between the miner's release date and each row.
    released_on = pd.to_datetime(miner['release_date'])
    features['age_days'] = (features['date'] - released_on).dt.days

    features['days_since_halving'] = features['date'].apply(get_days_since_halving)

    # Expected daily revenue: the machine's share of block discovery times
    # the per-block payout, valued at the day's bitcoin price.
    # NOTE(review): the 1e12 factor presumably converts TH/s to H/s, and
    # fees/144 presumably spreads a daily fee total over ~144 blocks - confirm.
    hashes_per_day = (features['machine_hashrate'] * 1e12) * 86400
    hashes_per_block = features['difficulty'] * (2**32)
    payout_per_block = features['block_reward'] + (features['fees'] / 144)
    daily_btc = hashes_per_day / hashes_per_block * payout_per_block
    features['Revenue_Potential'] = daily_btc * features['bitcoin_price']

    # Electricity rate is looked up per calendar day for the given region.
    def rate_for(day):
        return get_electricity_rate(region, day)

    features['electricity_rate'] = features['date'].dt.date.apply(rate_for)

    return features
def get_latest_sequence(blockchain_df, miner_name, miner_price, region='texas', window_size=30):
    """Return the most recent ``window_size`` days as a model input sequence.

    Runs ``engineer_features`` and ``prepare_miner_features``, drops rows
    containing any missing value, and slices the last ``window_size`` rows of
    the 14 feature columns in the fixed order listed below.

    Args:
        blockchain_df: Raw blockchain DataFrame.
        miner_name: Miner identifier forwarded to ``prepare_miner_features``.
        miner_price: Machine price forwarded to ``prepare_miner_features``.
        region: Region forwarded to ``prepare_miner_features``.
        window_size: Number of trailing days to return; must be >= 1.

    Returns:
        Tuple ``(sequence, feature_cols, latest_date)`` where ``sequence`` is
        a ``(window_size, 14)`` array, ``feature_cols`` is the column order
        used, and ``latest_date`` is the date of the final row.

    Raises:
        ValueError: If ``window_size`` is not positive, if fewer than
            ``window_size`` complete rows are available, or if the sequence
            does not have 14 feature columns.
    """
    if window_size < 1:
        # A zero window would make the ``[-window_size:]`` slice below return
        # the entire history, and a negative one would slice from the front.
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    df_features = engineer_features(blockchain_df)
    df_miner = prepare_miner_features(df_features, miner_name, miner_price, region)

    # CRITICAL: this order must match the training data CSV exactly:
    # bitcoin_price,difficulty,fees,hashrate,revenue,machine_price,
    # machine_hashrate,power,efficiency,block_reward,age_days,
    # days_since_halving,Revenue_Potential,electricity_rate
    feature_cols = [
        'bitcoin_price',       # 1
        'difficulty',          # 2
        'fees',                # 3
        'hashrate',            # 4
        'revenue',             # 5
        'machine_price',       # 6
        'machine_hashrate',    # 7
        'power',               # 8
        'efficiency',          # 9
        'block_reward',        # 10
        'age_days',            # 11
        'days_since_halving',  # 12
        'Revenue_Potential',   # 13
        'electricity_rate',    # 14
    ]

    # Only fully populated days may enter the sequence.
    df_miner = df_miner.dropna().reset_index(drop=True)

    if len(df_miner) < window_size:
        raise ValueError(f"Not enough data: need {window_size} days, have {len(df_miner)}")

    # Last window_size days with exactly 14 features.
    sequence = df_miner[feature_cols].values[-window_size:]
    latest_date = df_miner['date'].iloc[-1]

    # Guard against a column-selection mismatch (e.g. duplicated columns).
    if sequence.shape[1] != 14:
        raise ValueError(f"Expected 14 features, got {sequence.shape[1]}")

    return sequence, feature_cols, latest_date
if __name__ == "__main__":
    # Manual smoke test: fetch recent data, run each preprocessing step and
    # print the intermediate results for inspection.
    # NOTE(review): the status glyphs in the messages below were restored
    # from mis-encoded text; the cross, arrow and box-drawing characters are
    # best guesses - confirm against the original file.
    from fetch_blockchain_data import get_latest_blockchain_data

    print("\n" + "="*80)
    print("TESTING PREPROCESSING PIPELINE")
    print("="*80 + "\n")

    # Fetch blockchain data
    print("📡 Fetching blockchain data...")
    blockchain_df = get_latest_blockchain_data(days=90)

    if blockchain_df is None:
        print("❌ Failed to fetch blockchain data")
        # ``exit`` is injected by the ``site`` module and is not guaranteed
        # to exist; raising SystemExit works everywhere.
        raise SystemExit(1)

    print(f"✅ Fetched {len(blockchain_df)} days of data\n")

    # Test configuration
    miner_name = 's19pro'
    miner_price = 2500
    region = 'texas'
    window_size = 30

    print("⚙️ Test Configuration:")
    print(f" Miner: {MINER_SPECS[miner_name]['full_name']}")
    print(f" Price: ${miner_price:,}")
    print(f" Region: {region.title()}")
    print(f" Window: {window_size} days")
    print(f" Electricity: ${ELECTRICITY_RATES[region]}/kWh\n")

    # Step 1: Engineer features
    print("="*80)
    print("STEP 1: ENGINEER FEATURES")
    print("="*80)
    df_engineered = engineer_features(blockchain_df)
    print(f"✅ Engineered features")
    print(f" Shape: {df_engineered.shape}")
    print(f" Columns: {list(df_engineered.columns)}\n")
    print("First 3 rows:")
    print(df_engineered.head(3))
    print("\nLast 3 rows:")
    print(df_engineered.tail(3))

    # Step 2: Prepare miner features
    print("\n" + "="*80)
    print("STEP 2: PREPARE MINER FEATURES")
    print("="*80)
    df_miner = prepare_miner_features(df_engineered, miner_name, miner_price, region)
    print(f"✅ Added miner-specific features")
    print(f" Shape: {df_miner.shape}")
    print(f" Columns: {list(df_miner.columns)}\n")
    print("Miner-specific values (constant across all days):")
    print(f" machine_hashrate: {df_miner['machine_hashrate'].iloc[0]} TH/s")
    print(f" power: {df_miner['power'].iloc[0]} W")
    print(f" efficiency: {df_miner['efficiency'].iloc[0]} W/TH")
    print(f" machine_price: ${df_miner['machine_price'].iloc[0]:,.2f}")
    print(f" electricity_rate: ${df_miner['electricity_rate'].iloc[0]:.4f}/kWh")
    print("\nDynamic values (change over time):")
    print(f" age_days: {df_miner['age_days'].iloc[0]} → {df_miner['age_days'].iloc[-1]} days")
    print(f" days_since_halving: {df_miner['days_since_halving'].iloc[0]} → {df_miner['days_since_halving'].iloc[-1]} days")
    print(f" Revenue_Potential: ${df_miner['Revenue_Potential'].iloc[0]:.2f} → ${df_miner['Revenue_Potential'].iloc[-1]:.2f}/day")
    print("\nFirst 3 rows:")
    print(df_miner.head(3))
    print("\nLast 3 rows:")
    print(df_miner.tail(3))

    # Step 3: Get latest sequence
    print("\n" + "="*80)
    print("STEP 3: GET LATEST SEQUENCE")
    print("="*80)
    sequence, feature_cols, latest_date = get_latest_sequence(blockchain_df, miner_name, miner_price, region, window_size)
    print(f"✅ Created sequence for model")
    print(f" Shape: {sequence.shape}")
    print(f" Expected: ({window_size}, 14)")
    print(f" Latest date: {latest_date.strftime('%Y-%m-%d')}\n")
    print("14 Features (in order):")
    for i, col in enumerate(feature_cols, 1):
        print(f" {i:2d}. {col:25s} │ First: {sequence[0, i-1]:>15.2f} Last: {sequence[-1, i-1]:>15.2f}")

    print("\n" + "="*80)
    print("SEQUENCE STATISTICS")
    print("="*80)
    print("\nFirst day in sequence:")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[0, i]:>15.2f}")
    print(f"\nLast day in sequence (for prediction on {latest_date.strftime('%Y-%m-%d')}):")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[-1, i]:>15.2f}")

    # Show per-feature min / max / mean over the window
    print("\n" + "="*80)
    print("FEATURE RANGES")
    print("="*80)
    for i, col in enumerate(feature_cols):
        min_val = sequence[:, i].min()
        max_val = sequence[:, i].max()
        mean_val = sequence[:, i].mean()
        print(f"{col:25s} │ Min: {min_val:>12.2f} Max: {max_val:>12.2f} Mean: {mean_val:>12.2f}")

    print("\n" + "="*80)
    print("✅ PREPROCESSING PIPELINE TEST COMPLETE")
    print("="*80 + "\n")