Spaces:

amaai-lab
/

MineROI-Net

Running

App Files Files Community

MineROI-Net / preprocessing.py

sithuWiki

Update preprocessing.py

430a480 verified 1 day ago

raw

history blame

9.75 kB

	"""Data preprocessing and feature engineering"""

	import pandas as pd
	import numpy as np
	from miner_specs import MINER_SPECS, ELECTRICITY_RATES
	from fetch_blockchain_data import get_days_since_halving
	from electricity_prices import get_electricity_rate


	def engineer_features(blockchain_df):
	    """Return a date-typed, chronologically sorted copy of the blockchain data.

	    No derived features are added here; feature selection happens later in
	    the pipeline.

	    Args:
	        blockchain_df: DataFrame with a ``date`` column holding anything
	            ``pd.to_datetime`` can parse.

	    Returns:
	        A new DataFrame (the input is left untouched) whose ``date`` column
	        is datetime-typed, sorted ascending by ``date`` with a fresh
	        0..n-1 index.
	    """
	    df = blockchain_df.copy()

	    # Parse before sorting: sorting the raw column compares strings
	    # lexicographically, which puts e.g. "10/13/2023" ahead of "9/14/2023".
	    df["date"] = pd.to_datetime(df["date"])

	    return df.sort_values("date").reset_index(drop=True)


	def prepare_miner_features(
	    blockchain_df,
	    miner_name,
	    miner_price,
	    region="texas",
	    machine_hashrate=None,
	    power=None,
	    efficiency=None,
	    electricity_rate=None,
	    release_date=None,
	):
	    """Attach miner-specific columns to the daily blockchain data.

	    The result keeps ``date`` plus six blockchain columns and adds eight
	    miner columns, i.e. the 14 model features plus ``date``.

	    Args:
	        blockchain_df: DataFrame containing at least ``date``,
	            ``bitcoin_price``, ``difficulty``, ``fees``, ``hashrate``,
	            ``revenue`` and ``block_reward``.
	        miner_name: Key into ``MINER_SPECS``. Only needed for values that
	            are not passed explicitly.
	        miner_price: Machine price, stored as ``machine_price``.
	        region: Forwarded to ``get_electricity_rate`` when
	            ``electricity_rate`` is not given.
	        machine_hashrate: Machine hashrate. It is multiplied by 1e12 to get
	            hashes per second, so presumably TH/s. Falls back to
	            ``specs["hashrate"]``.
	        power: Falls back to ``specs["power"]``.
	        efficiency: Falls back to ``specs["efficiency"]``.
	        electricity_rate: Constant rate written to every row. When omitted,
	            the rate is looked up per day via ``get_electricity_rate``.
	        release_date: Date the machine's age is measured from. ``None`` or
	            a blank string falls back to the spec's ``release_date``, then
	            to ``"2020-01-01"``.

	    Returns:
	        A new DataFrame; ``blockchain_df`` is not modified.

	    Raises:
	        KeyError: If a required column is missing, or if ``miner_name`` is
	            not in ``MINER_SPECS`` and a spec value is needed as a fallback.
	    """
	    # Resolve the spec lazily: callers that pass every value explicitly
	    # (e.g. a custom machine) must not fail just because the name is unknown.
	    try:
	        specs = MINER_SPECS[miner_name]
	    except KeyError:
	        specs = None

	    def _explicit_or_spec(user_value, spec_key):
	        """Return the explicit value as float, else the MINER_SPECS entry."""
	        if user_value is not None:
	            return float(user_value)
	        if specs is None:
	            raise KeyError(
	                f"Unknown miner {miner_name!r}: no MINER_SPECS entry to "
	                f"supply {spec_key!r}; pass the value explicitly"
	            )
	        return specs[spec_key]

	    # Keep only these columns from blockchain data
	    df = blockchain_df[[
	        "date",
	        "bitcoin_price",
	        "difficulty",
	        "fees",
	        "hashrate",
	        "revenue",
	        "block_reward",
	    ]].copy()
	    df["date"] = pd.to_datetime(df["date"])

	    # ---- machine constants (same value on every row) ----
	    df["machine_price"] = float(miner_price)
	    df["machine_hashrate"] = _explicit_or_spec(machine_hashrate, "hashrate")
	    df["power"] = _explicit_or_spec(power, "power")
	    df["efficiency"] = _explicit_or_spec(efficiency, "efficiency")

	    # ---- age of the machine in days ----
	    # Prefer a non-blank user-supplied release date, then the spec's,
	    # then the hard-coded default.
	    if release_date is not None and str(release_date).strip() != "":
	        release_str = str(release_date).strip()
	    elif specs is not None:
	        release_str = specs.get("release_date", "2020-01-01")
	    else:
	        release_str = "2020-01-01"

	    release_dt = pd.to_datetime(release_str)
	    df["age_days"] = (df["date"] - release_dt).dt.days

	    # Days since halving
	    df["days_since_halving"] = df["date"].apply(get_days_since_halving)

	    # ---- revenue potential ----
	    # hashes/day divided by (difficulty * 2**32) is the expected number of
	    # blocks found per day; each is worth block_reward plus a per-block
	    # share of fees.
	    # NOTE(review): fees / 144 assumes `fees` is a daily total spread over
	    # 144 blocks per day — confirm against the data source.
	    hashrate_hs = df["machine_hashrate"] * 1e12
	    btc_per_day = (
	        (hashrate_hs * 86400)
	        / (df["difficulty"] * (2**32))
	        * (df["block_reward"] + (df["fees"] / 144))
	    )
	    df["Revenue_Potential"] = btc_per_day * df["bitcoin_price"]

	    # ---- electricity rate ----
	    # An explicit rate is constant across rows; the regional fallback is
	    # looked up once per calendar day.
	    if electricity_rate is not None:
	        df['electricity_rate'] = float(electricity_rate)
	    else:
	        df['electricity_rate'] = df['date'].dt.date.apply(
	            lambda day: get_electricity_rate(region, day)
	        )

	    return df



	def get_latest_sequence(
	    blockchain_df,
	    miner_name,
	    miner_price,
	    region="texas",
	    window_size=30,
	    machine_hashrate=None,
	    power=None,
	    efficiency=None,
	    electricity_rate=None,
	    release_date=None
	):
	    """Build the most recent model input window (14 features, fixed order).

	    Args:
	        blockchain_df: Daily blockchain data, passed through
	            ``engineer_features`` and ``prepare_miner_features``.
	        miner_name, miner_price, region: Forwarded to
	            ``prepare_miner_features``.
	        window_size: Number of most recent fully-populated days to return.
	            Must be at least 1.
	        machine_hashrate, power, efficiency, electricity_rate, release_date:
	            Optional overrides forwarded to ``prepare_miner_features``.

	    Returns:
	        A 3-tuple ``(sequence, df_window, pred_date)``:

	        * ``sequence`` - float array of shape ``(window_size, 14)`` whose
	          columns follow ``feature_cols`` below.
	        * ``df_window`` - the DataFrame rows the sequence was built from
	          (the 14 feature columns plus ``date``).
	        * ``pred_date`` - the ``date`` of the last row in the window.

	    Raises:
	        ValueError: If ``window_size`` is smaller than 1, or if fewer than
	            ``window_size`` complete rows are available.
	    """
	    # tail(0) would yield an empty window (and an IndexError on iloc[-1]);
	    # a negative value makes tail() return "all but the first n" rows.
	    if window_size < 1:
	        raise ValueError(f"window_size must be at least 1, got {window_size}.")

	    df_features = engineer_features(blockchain_df)
	    df_miner = prepare_miner_features(
	        df_features,
	        miner_name,
	        miner_price,
	        region,
	        machine_hashrate=machine_hashrate,
	        power=power,
	        efficiency=efficiency,
	        electricity_rate=electricity_rate,
	        release_date=release_date,
	    )

	    # CRITICAL: This order MUST match the training data CSV exactly!
	    feature_cols = [
	        "bitcoin_price",       # 1
	        "difficulty",          # 2
	        "fees",                # 3
	        "hashrate",            # 4
	        "revenue",             # 5
	        "machine_price",       # 6
	        "machine_hashrate",    # 7
	        "power",               # 8
	        "efficiency",          # 9
	        "block_reward",        # 10
	        "age_days",            # 11
	        "days_since_halving",  # 12
	        "Revenue_Potential",   # 13
	        "electricity_rate",    # 14
	    ]

	    # ---------------------------------------------------------
	    # Ensure we only use rows with complete core blockchain data.
	    # This avoids including a "today" row where, e.g., difficulty or
	    # hashrate are still NaN while price is already updated.
	    # ---------------------------------------------------------
	    df_miner = df_miner.sort_values("date").reset_index(drop=True)

	    core_cols = ["bitcoin_price", "difficulty", "fees", "hashrate", "revenue"]
	    df_miner = df_miner.dropna(subset=core_cols)

	    if len(df_miner) < window_size:
	        raise ValueError(
	            f"Not enough data to build a {window_size}-day window after dropping NaNs. "
	            f"Have {len(df_miner)} rows, need at least {window_size}."
	        )

	    # Take the last `window_size` fully-populated days
	    df_window = df_miner.tail(window_size).reset_index(drop=True)

	    sequence = df_window[feature_cols].values.astype(float)
	    pred_date = df_window["date"].iloc[-1]

	    return sequence, df_window, pred_date




	if __name__ == "__main__":
	    # Manual smoke test: fetch live data and print every stage of the
	    # preprocessing pipeline for one sample miner configuration.
	    from fetch_blockchain_data import get_latest_blockchain_data

	    print("\n" + "="*80)
	    print("TESTING PREPROCESSING PIPELINE")
	    print("="*80 + "\n")

	    # Fetch blockchain data
	    print("📡 Fetching blockchain data...")
	    blockchain_df = get_latest_blockchain_data(days=90)

	    if blockchain_df is None:
	        print("❌ Failed to fetch blockchain data")
	        # `exit` is injected by the `site` module and is not guaranteed to
	        # exist (e.g. under `python -S`); SystemExit always is.
	        raise SystemExit(1)

	    print(f"✅ Fetched {len(blockchain_df)} days of data\n")

	    # Test configuration
	    miner_name = 's19pro'
	    miner_price = 2500
	    region = 'texas'
	    window_size = 30

	    print("⚙️ Test Configuration:")
	    print(f" Miner: {MINER_SPECS[miner_name]['full_name']}")
	    print(f" Price: ${miner_price:,}")
	    print(f" Region: {region.title()}")
	    print(f" Window: {window_size} days")
	    print(f" Electricity: ${ELECTRICITY_RATES[region]}/kWh\n")

	    # Step 1: Engineer features
	    print("="*80)
	    print("STEP 1: ENGINEER FEATURES")
	    print("="*80)
	    df_engineered = engineer_features(blockchain_df)
	    print("✅ Engineered features")
	    print(f" Shape: {df_engineered.shape}")
	    print(f" Columns: {list(df_engineered.columns)}\n")
	    print("First 3 rows:")
	    print(df_engineered.head(3))
	    print("\nLast 3 rows:")
	    print(df_engineered.tail(3))

	    # Step 2: Prepare miner features
	    print("\n" + "="*80)
	    print("STEP 2: PREPARE MINER FEATURES")
	    print("="*80)
	    df_miner = prepare_miner_features(df_engineered, miner_name, miner_price, region)
	    print("✅ Added miner-specific features")
	    print(f" Shape: {df_miner.shape}")
	    print(f" Columns: {list(df_miner.columns)}\n")

	    print("Miner-specific values (constant across all days):")
	    print(f" machine_hashrate: {df_miner['machine_hashrate'].iloc[0]} TH/s")
	    print(f" power: {df_miner['power'].iloc[0]} W")
	    print(f" efficiency: {df_miner['efficiency'].iloc[0]} W/TH")
	    print(f" machine_price: ${df_miner['machine_price'].iloc[0]:,.2f}")
	    print(f" electricity_rate: ${df_miner['electricity_rate'].iloc[0]:.4f}/kWh")

	    print("\nDynamic values (change over time):")
	    print(f" age_days: {df_miner['age_days'].iloc[0]} → {df_miner['age_days'].iloc[-1]} days")
	    print(f" days_since_halving: {df_miner['days_since_halving'].iloc[0]} → {df_miner['days_since_halving'].iloc[-1]} days")
	    print(f" Revenue_Potential: ${df_miner['Revenue_Potential'].iloc[0]:.2f} → ${df_miner['Revenue_Potential'].iloc[-1]:.2f}/day")

	    print("\nFirst 3 rows:")
	    print(df_miner.head(3))
	    print("\nLast 3 rows:")
	    print(df_miner.tail(3))

	    # Step 3: Get latest sequence
	    print("\n" + "="*80)
	    print("STEP 3: GET LATEST SEQUENCE")
	    print("="*80)
	    # get_latest_sequence returns (sequence, df_window, pred_date). The
	    # second item is the window DataFrame, NOT the list of feature names:
	    # iterating it would yield "date" plus the features in a different
	    # order and run past the 14 sequence columns.
	    sequence, df_window, latest_date = get_latest_sequence(blockchain_df, miner_name, miner_price, region, window_size)

	    # Column order of `sequence`; must mirror the feature order used inside
	    # get_latest_sequence.
	    feature_cols = [
	        "bitcoin_price",
	        "difficulty",
	        "fees",
	        "hashrate",
	        "revenue",
	        "machine_price",
	        "machine_hashrate",
	        "power",
	        "efficiency",
	        "block_reward",
	        "age_days",
	        "days_since_halving",
	        "Revenue_Potential",
	        "electricity_rate",
	    ]

	    print("✅ Created sequence for model")
	    print(f" Shape: {sequence.shape}")
	    print(f" Expected: ({window_size}, 14)")
	    print(f" Latest date: {latest_date.strftime('%Y-%m-%d')}\n")

	    print("14 Features (in order):")
	    for i, col in enumerate(feature_cols, 1):
	        print(f" {i:2d}. {col:25s} → First: {sequence[0, i-1]:>15.2f} Last: {sequence[-1, i-1]:>15.2f}")

	    print("\n" + "="*80)
	    print("SEQUENCE STATISTICS")
	    print("="*80)
	    print("\nFirst day in sequence:")
	    for i, col in enumerate(feature_cols):
	        print(f" {col:25s} = {sequence[0, i]:>15.2f}")

	    print(f"\nLast day in sequence (for prediction on {latest_date.strftime('%Y-%m-%d')}):")
	    for i, col in enumerate(feature_cols):
	        print(f" {col:25s} = {sequence[-1, i]:>15.2f}")

	    # Show some statistics
	    print("\n" + "="*80)
	    print("FEATURE RANGES")
	    print("="*80)
	    for i, col in enumerate(feature_cols):
	        min_val = sequence[:, i].min()
	        max_val = sequence[:, i].max()
	        mean_val = sequence[:, i].mean()
	        print(f"{col:25s} → Min: {min_val:>12.2f} Max: {max_val:>12.2f} Mean: {mean_val:>12.2f}")

	    print("\n" + "="*80)
	    print("✅ PREPROCESSING PIPELINE TEST COMPLETE")
	    print("="*80 + "\n")