Spaces:
Running
Running
File size: 8,922 Bytes
f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 a8de46a a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 a8a9175 f481275 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 |
"""Data preprocessing and feature engineering"""
import pandas as pd
import numpy as np
from miner_specs import MINER_SPECS, ELECTRICITY_RATES
from fetch_blockchain_data import get_days_since_halving
from electricity_prices import get_electricity_rate
def engineer_features(blockchain_df):
    """Return a chronologically ordered copy of the blockchain data.

    The ``date`` column is parsed to datetimes *before* sorting, so rows are
    ordered by calendar date even when the input holds date strings whose
    lexicographic order differs from their chronological order
    (e.g. ``"12/09/2023"`` vs ``"01/10/2024"``).

    No derived columns are added here; feature selection happens downstream.

    Args:
        blockchain_df: DataFrame with a ``date`` column that
            ``pandas.to_datetime`` can parse. It is not modified.

    Returns:
        A new DataFrame sorted by ascending ``date`` with a fresh
        ``0..n-1`` index and ``date`` converted to datetimes.
    """
    df = blockchain_df.copy()
    # Parse first: sorting raw strings would compare them character by character.
    df["date"] = pd.to_datetime(df["date"])
    return df.sort_values("date").reset_index(drop=True)
def _override_or_spec(user_value, spec_value):
    """Return ``user_value`` as a float, or ``spec_value`` when no override was given."""
    return float(user_value) if user_value is not None else spec_value


def prepare_miner_features(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
):
    """Build the per-day feature table for one miner (``date`` + 14 features).

    Seven blockchain columns are kept from ``blockchain_df`` and combined with
    miner constants (repeated on every row) and three derived columns:
    ``age_days``, ``days_since_halving`` and ``Revenue_Potential``.

    Args:
        blockchain_df: DataFrame containing at least ``date``,
            ``bitcoin_price``, ``difficulty``, ``fees``, ``hashrate``,
            ``revenue`` and ``block_reward``. It is not modified.
        miner_name: Key into ``MINER_SPECS``. The spec entry is always
            needed for ``release_date``, even when every override is given.
        miner_price: Machine price; stored as float in ``machine_price``.
        region: Key into ``ELECTRICITY_RATES``. Only consulted when
            ``electricity_rate`` is None and the miner spec carries no
            ``electricity_rate`` of its own.
        machine_hashrate: Optional override for ``specs["hashrate"]``.
            Treated as TH/s (it is scaled by 1e12 to hashes per second).
        power: Optional override for ``specs["power"]``.
        efficiency: Optional override for ``specs["efficiency"]``.
        electricity_rate: Optional explicit electricity rate.

    Returns:
        A new DataFrame with columns, in order: ``date``, the six remaining
        blockchain columns, ``machine_price``, ``machine_hashrate``,
        ``power``, ``efficiency``, ``age_days``, ``days_since_halving``,
        ``Revenue_Potential``, ``electricity_rate``.

    Raises:
        KeyError: If ``miner_name`` is unknown, a required column or spec
            field is missing, or the regional fallback is needed and
            ``region`` is not in ``ELECTRICITY_RATES``.
    """
    specs = MINER_SPECS[miner_name]

    # Keep only the blockchain columns the model consumes.
    df = blockchain_df[[
        "date",
        "bitcoin_price",
        "difficulty",
        "fees",
        "hashrate",
        "revenue",
        "block_reward",
    ]].copy()
    df["date"] = pd.to_datetime(df["date"])

    # Miner constants: the same value on every row of the window.
    df["machine_price"] = float(miner_price)
    df["machine_hashrate"] = _override_or_spec(machine_hashrate, specs["hashrate"])
    df["power"] = _override_or_spec(power, specs["power"])
    df["efficiency"] = _override_or_spec(efficiency, specs["efficiency"])

    # Days elapsed between the miner's release and each row's date.
    release_date = pd.to_datetime(specs["release_date"])
    df["age_days"] = (df["date"] - release_date).dt.days

    df["days_since_halving"] = df["date"].apply(get_days_since_halving)

    # Expected BTC mined per day: hashes per day divided by the expected
    # hashes per block (difficulty * 2**32), times the per-block payout.
    # NOTE(review): fees / 144 presumably spreads a daily fee total over
    # ~144 blocks per day - confirm against the data source.
    hashrate_hs = df["machine_hashrate"] * 1e12
    btc_per_day = (
        (hashrate_hs * 86400)
        / (df["difficulty"] * (2**32))
        * (df["block_reward"] + (df["fees"] / 144))
    )
    df["Revenue_Potential"] = btc_per_day * df["bitcoin_price"]

    # Electricity rate precedence: explicit argument, then a rate stored on
    # the miner spec, then the regional default. The last step makes the
    # ``region`` argument effective instead of failing when the spec has
    # no rate of its own.
    if electricity_rate is not None:
        df["electricity_rate"] = float(electricity_rate)
    elif "electricity_rate" in specs:
        df["electricity_rate"] = specs["electricity_rate"]
    else:
        df["electricity_rate"] = ELECTRICITY_RATES[region]

    return df
def get_latest_sequence(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    window_size=30,
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
):
    """Build the most recent model input window for one miner.

    The blockchain data is ordered by date, enriched with the miner features,
    stripped of rows containing NaN, and the last ``window_size`` rows are
    returned as a float array whose 14 columns follow the fixed order below.

    Args:
        blockchain_df: Raw blockchain DataFrame (see ``engineer_features``).
        miner_name: Key into ``MINER_SPECS``.
        miner_price: Machine price.
        region: Region name, forwarded to ``prepare_miner_features``.
        window_size: Number of trailing rows in the window; must be >= 1.
        machine_hashrate: Optional override, forwarded unchanged.
        power: Optional override, forwarded unchanged.
        efficiency: Optional override, forwarded unchanged.
        electricity_rate: Optional override, forwarded unchanged.

    Returns:
        A tuple ``(sequence, df_window, pred_date)``:
        ``sequence`` is a float array of shape ``(window_size, 14)``,
        ``df_window`` is the DataFrame the array was taken from (it also
        holds ``date``), and ``pred_date`` is the last date in the window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, or fewer than
            ``window_size`` complete rows are available.
    """
    # tail(0) would yield an empty window and a negative value silently
    # selects the wrong rows, so reject both before doing any work.
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size}."
        )

    df_features = engineer_features(blockchain_df)
    df_miner = prepare_miner_features(
        df_features,
        miner_name,
        miner_price,
        region,
        machine_hashrate=machine_hashrate,
        power=power,
        efficiency=efficiency,
        electricity_rate=electricity_rate,
    )

    # CRITICAL: this order must match the training data CSV exactly.
    feature_cols = [
        "bitcoin_price",       # 1
        "difficulty",          # 2
        "fees",                # 3
        "hashrate",            # 4
        "revenue",             # 5
        "machine_price",       # 6
        "machine_hashrate",    # 7
        "power",               # 8
        "efficiency",          # 9
        "block_reward",        # 10
        "age_days",            # 11
        "days_since_halving",  # 12
        "Revenue_Potential",   # 13
        "electricity_rate",    # 14
    ]

    df_miner = df_miner.dropna().reset_index(drop=True)
    if len(df_miner) < window_size:
        raise ValueError(
            f"Not enough data to build a {window_size}-day window, got {len(df_miner)} rows."
        )

    df_window = df_miner.tail(window_size).reset_index(drop=True)
    sequence = df_window[feature_cols].values.astype(float)
    pred_date = df_window["date"].iloc[-1]
    return sequence, df_window, pred_date
if __name__ == "__main__":
    # Manual smoke test: fetch recent blockchain data and walk through the
    # three preprocessing stages, printing intermediate results.
    from fetch_blockchain_data import get_latest_blockchain_data

    print("\n" + "="*80)
    print("TESTING PREPROCESSING PIPELINE")
    print("="*80 + "\n")

    # Fetch blockchain data
    print("📡 Fetching blockchain data...")
    blockchain_df = get_latest_blockchain_data(days=90)
    if blockchain_df is None:
        print("❌ Failed to fetch blockchain data")
        # SystemExit works even where the site-provided exit() is unavailable.
        raise SystemExit(1)
    print(f"✅ Fetched {len(blockchain_df)} days of data\n")

    # Test configuration
    miner_name = 's19pro'
    miner_price = 2500
    region = 'texas'
    window_size = 30
    print("⚙️ Test Configuration:")
    print(f" Miner: {MINER_SPECS[miner_name]['full_name']}")
    print(f" Price: ${miner_price:,}")
    print(f" Region: {region.title()}")
    print(f" Window: {window_size} days")
    print(f" Electricity: ${ELECTRICITY_RATES[region]}/kWh\n")

    # Step 1: Engineer features
    print("="*80)
    print("STEP 1: ENGINEER FEATURES")
    print("="*80)
    df_engineered = engineer_features(blockchain_df)
    print("✅ Engineered features")
    print(f" Shape: {df_engineered.shape}")
    print(f" Columns: {list(df_engineered.columns)}\n")
    print("First 3 rows:")
    print(df_engineered.head(3))
    print("\nLast 3 rows:")
    print(df_engineered.tail(3))

    # Step 2: Prepare miner features
    print("\n" + "="*80)
    print("STEP 2: PREPARE MINER FEATURES")
    print("="*80)
    df_miner = prepare_miner_features(df_engineered, miner_name, miner_price, region)
    print("✅ Added miner-specific features")
    print(f" Shape: {df_miner.shape}")
    print(f" Columns: {list(df_miner.columns)}\n")
    print("Miner-specific values (constant across all days):")
    print(f" machine_hashrate: {df_miner['machine_hashrate'].iloc[0]} TH/s")
    print(f" power: {df_miner['power'].iloc[0]} W")
    print(f" efficiency: {df_miner['efficiency'].iloc[0]} W/TH")
    print(f" machine_price: ${df_miner['machine_price'].iloc[0]:,.2f}")
    print(f" electricity_rate: ${df_miner['electricity_rate'].iloc[0]:.4f}/kWh")
    print("\nDynamic values (change over time):")
    print(f" age_days: {df_miner['age_days'].iloc[0]} → {df_miner['age_days'].iloc[-1]} days")
    print(f" days_since_halving: {df_miner['days_since_halving'].iloc[0]} → {df_miner['days_since_halving'].iloc[-1]} days")
    print(f" Revenue_Potential: ${df_miner['Revenue_Potential'].iloc[0]:.2f} → ${df_miner['Revenue_Potential'].iloc[-1]:.2f}/day")
    print("\nFirst 3 rows:")
    print(df_miner.head(3))
    print("\nLast 3 rows:")
    print(df_miner.tail(3))

    # Step 3: Get latest sequence
    print("\n" + "="*80)
    print("STEP 3: GET LATEST SEQUENCE")
    print("="*80)
    # The second return value is the window DataFrame (it includes 'date'),
    # not a list of feature names, so it cannot be used to label the
    # columns of `sequence`.
    sequence, df_window, latest_date = get_latest_sequence(
        blockchain_df, miner_name, miner_price, region, window_size
    )
    # Labels for the columns of `sequence`, in the order in which
    # get_latest_sequence stacks them; keep the two lists in sync.
    feature_cols = [
        "bitcoin_price",
        "difficulty",
        "fees",
        "hashrate",
        "revenue",
        "machine_price",
        "machine_hashrate",
        "power",
        "efficiency",
        "block_reward",
        "age_days",
        "days_since_halving",
        "Revenue_Potential",
        "electricity_rate",
    ]
    print("✅ Created sequence for model")
    print(f" Shape: {sequence.shape}")
    print(f" Expected: ({window_size}, 14)")
    print(f" Latest date: {latest_date.strftime('%Y-%m-%d')}\n")
    print("14 Features (in order):")
    for i, col in enumerate(feature_cols, 1):
        print(f" {i:2d}. {col:25s} → First: {sequence[0, i-1]:>15.2f} Last: {sequence[-1, i-1]:>15.2f}")

    print("\n" + "="*80)
    print("SEQUENCE STATISTICS")
    print("="*80)
    print("\nFirst day in sequence:")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[0, i]:>15.2f}")
    print(f"\nLast day in sequence (for prediction on {latest_date.strftime('%Y-%m-%d')}):")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[-1, i]:>15.2f}")

    # Show some statistics
    print("\n" + "="*80)
    print("FEATURE RANGES")
    print("="*80)
    for i, col in enumerate(feature_cols):
        min_val = sequence[:, i].min()
        max_val = sequence[:, i].max()
        mean_val = sequence[:, i].mean()
        print(f"{col:25s} → Min: {min_val:>12.2f} Max: {max_val:>12.2f} Mean: {mean_val:>12.2f}")

    print("\n" + "="*80)
    print("✅ PREPROCESSING PIPELINE TEST COMPLETE")
    print("="*80 + "\n")