MineROI-Net / preprocessing.py
sithuWiki's picture
Update preprocessing.py
430a480 verified
"""Data preprocessing and feature engineering"""
import pandas as pd
import numpy as np
from miner_specs import MINER_SPECS, ELECTRICITY_RATES
from fetch_blockchain_data import get_days_since_halving
from electricity_prices import get_electricity_rate
def engineer_features(blockchain_df):
    """Return a chronologically ordered copy of the blockchain data.

    The ``date`` column is parsed with ``pd.to_datetime`` *before* sorting.
    Sorting first would order string dates lexically, which is wrong for any
    non-ISO format (e.g. "01/05/2024" would sort ahead of "12/25/2023").
    No extra feature columns are derived here.

    Args:
        blockchain_df: DataFrame with at least a ``date`` column holding
            datetimes or strings that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame (the input is left untouched) sorted ascending by
        ``date`` with a fresh 0..n-1 index.
    """
    df = blockchain_df.copy()
    # Parse first so the sort below compares real timestamps, not text.
    df['date'] = pd.to_datetime(df['date'])
    return df.sort_values('date').reset_index(drop=True)
def prepare_miner_features(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
    release_date=None,
):
    """
    Attach miner-specific and derived columns to the blockchain data.

    The returned frame holds the ``date`` column plus the 14 model features:
    6 blockchain columns (bitcoin_price, difficulty, fees, hashrate, revenue,
    block_reward) and 8 columns added here (machine_price, machine_hashrate,
    power, efficiency, age_days, days_since_halving, Revenue_Potential,
    electricity_rate).

    Args:
        blockchain_df: DataFrame containing at least the ``date`` column and
            the 6 blockchain columns listed above.
        miner_name: Key into ``MINER_SPECS``. It is only looked up when at
            least one of ``machine_hashrate``, ``power``, ``efficiency`` or
            ``release_date`` is not supplied, so a custom miner works as long
            as every spec is passed explicitly.
        miner_price: Machine price; stored as a float constant on every row.
        region: Region passed to ``get_electricity_rate`` when
            ``electricity_rate`` is not supplied.
        machine_hashrate: Machine hashrate override. The revenue formula
            multiplies it by 1e12 to get H/s, i.e. it is treated as TH/s.
        power: Power-draw override.
        efficiency: Efficiency override.
        electricity_rate: Electricity rate override applied to every row.
            When ``None`` the rate is looked up per row date.
        release_date: Release date override (anything ``pd.to_datetime``
            accepts). ``None`` or a blank string falls back to the spec's
            ``release_date``, or "2020-01-01" if the spec has none.

    Returns:
        A new DataFrame with the 15 columns described above; the input frame
        is not modified.

    Raises:
        KeyError: If a required blockchain column is missing, or if
            ``miner_name`` is not in ``MINER_SPECS`` while a fallback value
            is needed.
    """
    has_release_date = (
        release_date is not None and str(release_date).strip() != ""
    )
    # Only consult MINER_SPECS when a fallback is actually required, so fully
    # user-specified (custom) miners do not fail on an unknown name.
    needs_specs = (
        machine_hashrate is None
        or power is None
        or efficiency is None
        or not has_release_date
    )
    specs = MINER_SPECS[miner_name] if needs_specs else {}

    # Keep only these columns from blockchain data
    df = blockchain_df[[
        "date",
        "bitcoin_price",
        "difficulty",
        "fees",
        "hashrate",
        "revenue",
        "block_reward",
    ]].copy()
    df["date"] = pd.to_datetime(df["date"])

    # ---- machine constants (same value on every row) ----
    df["machine_price"] = float(miner_price)
    df["machine_hashrate"] = (
        float(machine_hashrate) if machine_hashrate is not None
        else specs["hashrate"]
    )
    df["power"] = float(power) if power is not None else specs["power"]
    df["efficiency"] = (
        float(efficiency) if efficiency is not None else specs["efficiency"]
    )

    # ----- Age of the machine in days -----
    # Prefer the caller's release date; otherwise use the spec's value, with
    # "2020-01-01" as the last-resort default.
    if has_release_date:
        release_str = str(release_date).strip()
    else:
        release_str = specs.get("release_date", "2020-01-01")
    release_dt = pd.to_datetime(release_str)
    df["age_days"] = (df["date"] - release_dt).dt.days

    # Days since halving
    df["days_since_halving"] = df["date"].apply(get_days_since_halving)

    # Revenue potential: expected BTC mined per day at the given difficulty,
    # valued at that day's bitcoin price.
    hashrate_hs = df["machine_hashrate"] * 1e12  # TH/s -> H/s
    btc_per_day = (
        (hashrate_hs * 86400)
        / (df["difficulty"] * (2**32))
        # NOTE(review): fees / 144 presumably spreads a daily fee total over
        # ~144 blocks per day -- confirm against the data source.
        * (df["block_reward"] + (df["fees"] / 144))
    )
    df["Revenue_Potential"] = btc_per_day * df["bitcoin_price"]

    # ---- electricity rate ----
    if electricity_rate is not None:
        # Explicit rate: constant across all rows.
        df['electricity_rate'] = float(electricity_rate)
    else:
        # No explicit rate: look it up per calendar day for the region.
        df['electricity_rate'] = df['date'].dt.date.apply(
            lambda day: get_electricity_rate(region, day)
        )
    return df
def get_latest_sequence(
    blockchain_df,
    miner_name,
    miner_price,
    region="texas",
    window_size=30,
    machine_hashrate=None,
    power=None,
    efficiency=None,
    electricity_rate=None,
    release_date=None
):
    """
    Build the most recent model input window: 14 features in training order.

    The blockchain data is run through ``engineer_features`` and
    ``prepare_miner_features``. Rows with a NaN in any of the 14 feature
    columns are then discarded and the last ``window_size`` remaining rows
    are returned.

    Args:
        blockchain_df: Raw blockchain DataFrame (must contain ``date`` and
            the blockchain columns required by ``prepare_miner_features``).
        miner_name: Miner key forwarded to ``prepare_miner_features``.
        miner_price: Machine price forwarded to ``prepare_miner_features``.
        region: Region used for the electricity-rate lookup when
            ``electricity_rate`` is not given.
        window_size: Number of rows (days) in the returned window; must be
            at least 1.
        machine_hashrate, power, efficiency, electricity_rate, release_date:
            Optional user overrides forwarded to ``prepare_miner_features``.

    Returns:
        A 3-tuple ``(sequence, df_window, pred_date)``:
        - ``sequence``: float ndarray of shape ``(window_size, 14)`` whose
          columns follow ``feature_cols`` below.
        - ``df_window``: the DataFrame rows the sequence was built from
          (includes the ``date`` column).
        - ``pred_date``: the ``date`` of the last row in the window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1, or fewer than
            ``window_size`` complete rows are available.
    """
    # tail(0) / tail(-n) would silently return an empty or wrong slice.
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size}."
        )

    df_features = engineer_features(blockchain_df)
    df_miner = prepare_miner_features(
        df_features,
        miner_name,
        miner_price,
        region,
        machine_hashrate=machine_hashrate,
        power=power,
        efficiency=efficiency,
        electricity_rate=electricity_rate,
        release_date=release_date,
    )

    # CRITICAL: This order MUST match your training data CSV exactly!
    feature_cols = [
        "bitcoin_price",       # 1
        "difficulty",          # 2
        "fees",                # 3
        "hashrate",            # 4
        "revenue",             # 5
        "machine_price",       # 6
        "machine_hashrate",    # 7
        "power",               # 8
        "efficiency",          # 9
        "block_reward",        # 10
        "age_days",            # 11
        "days_since_halving",  # 12
        "Revenue_Potential",   # 13
        "electricity_rate",    # 14
    ]

    # ---------------------------------------------------------
    # Use only rows where every model feature is populated. This skips a
    # "today" row where, e.g., difficulty or hashrate are still NaN while
    # price is already updated, and also keeps a missing block_reward
    # (hence NaN Revenue_Potential) or electricity rate out of the input.
    # ---------------------------------------------------------
    df_miner = df_miner.sort_values("date").reset_index(drop=True)
    df_miner = df_miner.dropna(subset=feature_cols)

    if len(df_miner) < window_size:
        raise ValueError(
            f"Not enough data to build a {window_size}-day window after dropping NaNs. "
            f"Have {len(df_miner)} rows, need at least {window_size}."
        )

    # Take the last `window_size` fully-populated days
    df_window = df_miner.tail(window_size).reset_index(drop=True)
    sequence = df_window[feature_cols].values.astype(float)
    pred_date = df_window["date"].iloc[-1]
    return sequence, df_window, pred_date
if __name__ == "__main__":
    # Manual smoke test of the preprocessing pipeline: fetches live
    # blockchain data, runs each stage and prints what it produced.
    import sys

    from fetch_blockchain_data import get_latest_blockchain_data

    # Column order of the array returned by get_latest_sequence(). That
    # function returns (sequence, df_window, pred_date) and does not expose
    # its feature names, so the list is repeated here for labelling only.
    feature_cols = [
        "bitcoin_price",
        "difficulty",
        "fees",
        "hashrate",
        "revenue",
        "machine_price",
        "machine_hashrate",
        "power",
        "efficiency",
        "block_reward",
        "age_days",
        "days_since_halving",
        "Revenue_Potential",
        "electricity_rate",
    ]

    print("\n" + "="*80)
    print("TESTING PREPROCESSING PIPELINE")
    print("="*80 + "\n")

    # Fetch blockchain data
    print("📑 Fetching blockchain data...")
    blockchain_df = get_latest_blockchain_data(days=90)
    if blockchain_df is None:
        print("❌ Failed to fetch blockchain data")
        # sys.exit is always available; the bare exit() helper comes from
        # the site module and is not guaranteed to exist.
        sys.exit(1)
    print(f"✅ Fetched {len(blockchain_df)} days of data\n")

    # Test configuration
    miner_name = 's19pro'
    miner_price = 2500
    region = 'texas'
    window_size = 30
    print("⚙️ Test Configuration:")
    print(f" Miner: {MINER_SPECS[miner_name]['full_name']}")
    print(f" Price: ${miner_price:,}")
    print(f" Region: {region.title()}")
    print(f" Window: {window_size} days")
    print(f" Electricity: ${ELECTRICITY_RATES[region]}/kWh\n")

    # Step 1: Engineer features
    print("="*80)
    print("STEP 1: ENGINEER FEATURES")
    print("="*80)
    df_engineered = engineer_features(blockchain_df)
    print("✅ Engineered features")
    print(f" Shape: {df_engineered.shape}")
    print(f" Columns: {list(df_engineered.columns)}\n")
    print("First 3 rows:")
    print(df_engineered.head(3))
    print("\nLast 3 rows:")
    print(df_engineered.tail(3))

    # Step 2: Prepare miner features
    print("\n" + "="*80)
    print("STEP 2: PREPARE MINER FEATURES")
    print("="*80)
    df_miner = prepare_miner_features(df_engineered, miner_name, miner_price, region)
    print("✅ Added miner-specific features")
    print(f" Shape: {df_miner.shape}")
    print(f" Columns: {list(df_miner.columns)}\n")
    print("Miner-specific values (constant across all days):")
    print(f" machine_hashrate: {df_miner['machine_hashrate'].iloc[0]} TH/s")
    print(f" power: {df_miner['power'].iloc[0]} W")
    print(f" efficiency: {df_miner['efficiency'].iloc[0]} W/TH")
    print(f" machine_price: ${df_miner['machine_price'].iloc[0]:,.2f}")
    print(f" electricity_rate: ${df_miner['electricity_rate'].iloc[0]:.4f}/kWh")
    print("\nDynamic values (change over time):")
    print(f" age_days: {df_miner['age_days'].iloc[0]} → {df_miner['age_days'].iloc[-1]} days")
    print(f" days_since_halving: {df_miner['days_since_halving'].iloc[0]} → {df_miner['days_since_halving'].iloc[-1]} days")
    print(f" Revenue_Potential: ${df_miner['Revenue_Potential'].iloc[0]:.2f} → ${df_miner['Revenue_Potential'].iloc[-1]:.2f}/day")
    print("\nFirst 3 rows:")
    print(df_miner.head(3))
    print("\nLast 3 rows:")
    print(df_miner.tail(3))

    # Step 3: Get latest sequence
    print("\n" + "="*80)
    print("STEP 3: GET LATEST SEQUENCE")
    print("="*80)
    # The second return value is the window DataFrame, not the feature names.
    sequence, df_window, latest_date = get_latest_sequence(
        blockchain_df, miner_name, miner_price, region, window_size
    )
    print("✅ Created sequence for model")
    print(f" Shape: {sequence.shape}")
    print(f" Expected: ({window_size}, 14)")
    print(f" Latest date: {latest_date.strftime('%Y-%m-%d')}\n")
    print("14 Features (in order):")
    for i, col in enumerate(feature_cols, 1):
        print(f" {i:2d}. {col:25s} → First: {sequence[0, i-1]:>15.2f} Last: {sequence[-1, i-1]:>15.2f}")

    print("\n" + "="*80)
    print("SEQUENCE STATISTICS")
    print("="*80)
    print("\nFirst day in sequence:")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[0, i]:>15.2f}")
    print(f"\nLast day in sequence (for prediction on {latest_date.strftime('%Y-%m-%d')}):")
    for i, col in enumerate(feature_cols):
        print(f" {col:25s} = {sequence[-1, i]:>15.2f}")

    # Show some statistics
    print("\n" + "="*80)
    print("FEATURE RANGES")
    print("="*80)
    for i, col in enumerate(feature_cols):
        min_val = sequence[:, i].min()
        max_val = sequence[:, i].max()
        mean_val = sequence[:, i].mean()
        print(f"{col:25s} → Min: {min_val:>12.2f} Max: {max_val:>12.2f} Mean: {mean_val:>12.2f}")

    print("\n" + "="*80)
    print("✅ PREPROCESSING PIPELINE TEST COMPLETE")
    print("="*80 + "\n")