SECTION 1: SETUP AND WHOLE-BUILDING DATA PREPARATION¶

In [1]:
# =============================================================================
# SECTION 1: SETUP AND WHOLE-BUILDING DATA PREPARATION
# =============================================================================
#
# OBJECTIVE:
# To load the raw, minute-level energy data from all seven floors and
# consolidate it into a single, clean, hourly time series representing the
# total energy consumption of the entire building. This series serves as the
# foundational dataset for our probabilistic forecasting task.
#
# =============================================================================


# --- 1.1. Import Core Libraries ---
#
# Import the essential libraries for data manipulation, numerical operations,
# and visualization. We set a consistent style to improve readability.
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style for consistency across all plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 7)

print("Libraries imported successfully.")


# --- 1.2. Load All Floor Datasets ---
#
# We will load the CSV file for each of the seven floors into a dictionary.
# This approach is scalable and keeps the raw data organized. A try-except
# block ensures the code handles missing files gracefully.
#
floor_files = {f'Floor{i}': f'Floor{i}.csv' for i in range(1, 8)}
all_floors_raw = {}
print("--- Loading Raw Floor Data ---")
for floor_name, file_name in floor_files.items():
    try:
        all_floors_raw[floor_name] = pd.read_csv(file_name)
        print(f"[SUCCESS] Loaded {file_name} ({len(all_floors_raw[floor_name])} rows)")
    except FileNotFoundError:
        print(f"[ERROR] {file_name} not found. Please check the file path.")
print("--------------------------------")


# --- 1.3. Consolidate and Aggregate Building-Wide Data ---
#
# This is the core data preparation step. We will loop through each raw floor
# dataframe, clean it, resample it to hourly energy (kWh), and then
# combine all floors to create a single, building-wide time series.
#
print("\n--- Processing and Aggregating Data for All Floors ---")
all_floors_hourly_kwh = []

for floor_name, df_raw in all_floors_raw.items():
    print(f"Processing {floor_name}...")
    
    # Create a copy to avoid modifying the original raw dataframe
    df_proc = df_raw.copy()
    
    # Step 1: Standardize 'Date' column and set as index
    df_proc['Date'] = pd.to_datetime(df_proc['Date'])
    df_proc.set_index('Date', inplace=True)
    
    # Step 2: Isolate only the power consumption (kW) columns
    energy_cols = [col for col in df_proc.columns if col.endswith('(kW)')]
    df_energy = df_proc[energy_cols].copy()
    
    # Step 3: Handle missing values within each floor's data
    # Forward-fill propagates the last observation through interior gaps; a
    # trailing backward-fill then covers any NaNs at the very start of the series.
    initial_missing = df_energy.isnull().sum().sum()
    df_energy.ffill(inplace=True)
    df_energy.bfill(inplace=True)
    final_missing = df_energy.isnull().sum().sum()
    print(f"  > Missing values handled: {initial_missing} -> {final_missing}")
    
    # Step 4: Sum all kW columns to get the total power for the floor
    df_energy['Total_kW'] = df_energy.sum(axis=1)
    
    # Step 5: Resample from minute-level power (kW) to hourly energy (kWh).
    # The mean power (kW) over one hour, multiplied by 1 h, gives the energy in
    # kWh, so the hourly mean of the kW readings equals the hourly energy.
    df_hourly = df_energy[['Total_kW']].resample('h').mean()
    df_hourly.rename(columns={'Total_kW': f'{floor_name}_kWh'}, inplace=True)
    
    all_floors_hourly_kwh.append(df_hourly)

# Step 6: Concatenate all processed floor dataframes into one
df_building = pd.concat(all_floors_hourly_kwh, axis=1)

# Step 7: Create the final building-wide target variable
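# Note: DataFrame.sum(axis=1) skips NaNs by default, so hours where an
# individual floor has no data (e.g., Floor6's shorter record) simply
# contribute 0 kWh from that floor rather than producing a NaN total.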
df_building['Building_Total_kWh'] = df_building.sum(axis=1)

# Step 8: Final check for any remaining missing values at the building level
if df_building['Building_Total_kWh'].isnull().any():
    print("\n[WARNING] Missing values detected after aggregation. Applying forward/backward fill.")
    df_building.ffill(inplace=True)
    df_building.bfill(inplace=True)

print("\n[SUCCESS] Building-wide hourly energy consumption profile created.")
print(f"Final DataFrame shape: {df_building.shape}")
print("Final DataFrame columns:", df_building.columns.tolist())


# --- 1.4. Initial Visualization of the Target Variable ---
#
# Before proceeding, we must visualize the final time series to understand
# its overall characteristics, such as trends, seasonality, and any
# obvious anomalies.
#
plt.figure(figsize=(18, 8))
plt.plot(df_building.index, df_building['Building_Total_kWh'], label='Total Building kWh', color='navy')
plt.title('Total Hourly Energy Consumption - Full Building (July 2018 - Dec 2019)', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Energy Consumption (kWh)', fontsize=12)
plt.legend()
plt.show()

# Display the first few rows of the final dataframe to confirm the structure
print("\n--- First 5 rows of the final building-wide dataframe ---")
print(df_building.head())
print("\n---------------------------------------------------------")
Libraries imported successfully.
--- Loading Raw Floor Data ---
[SUCCESS] Loaded Floor1.csv (790560 rows)
[SUCCESS] Loaded Floor2.csv (790560 rows)
[SUCCESS] Loaded Floor3.csv (790560 rows)
[SUCCESS] Loaded Floor4.csv (790560 rows)
[SUCCESS] Loaded Floor5.csv (790560 rows)
[SUCCESS] Loaded Floor6.csv (689128 rows)
[SUCCESS] Loaded Floor7.csv (790560 rows)
--------------------------------

--- Processing and Aggregating Data for All Floors ---
Processing Floor1...
  > Missing values handled: 49456 -> 0
Processing Floor2...
  > Missing values handled: 120755 -> 0
Processing Floor3...
  > Missing values handled: 1232820 -> 0
Processing Floor4...
  > Missing values handled: 1154770 -> 0
Processing Floor5...
  > Missing values handled: 341071 -> 0
Processing Floor6...
  > Missing values handled: 27558 -> 0
Processing Floor7...
  > Missing values handled: 174160 -> 0

[SUCCESS] Building-wide hourly energy consumption profile created.
Final DataFrame shape: (13176, 8)
Final DataFrame columns: ['Floor1_kWh', 'Floor2_kWh', 'Floor3_kWh', 'Floor4_kWh', 'Floor5_kWh', 'Floor6_kWh', 'Floor7_kWh', 'Building_Total_kWh']
[Figure: Total Hourly Energy Consumption - Full Building (July 2018 - Dec 2019)]
--- First 5 rows of the final building-wide dataframe ---
                     Floor1_kWh  Floor2_kWh  Floor3_kWh  Floor4_kWh  \
Date                                                                  
2018-07-01 00:00:00  155.625000    7.811667    0.643167    9.820167   
2018-07-01 01:00:00  156.687000    7.862333    0.557167    9.882667   
2018-07-01 02:00:00  156.282500    7.842500    0.578167    9.843000   
2018-07-01 03:00:00  154.346500    7.874833    0.563000    9.913000   
2018-07-01 04:00:00  162.414833    7.957833    0.580833    9.797167   

                     Floor5_kWh  Floor6_kWh  Floor7_kWh  Building_Total_kWh  
Date                                                                         
2018-07-01 00:00:00    1.066667    0.758167    0.540333          176.265167  
2018-07-01 01:00:00    1.030333    0.695167    0.587333          177.302000  
2018-07-01 02:00:00    1.100167    0.764667    0.638500          177.049500  
2018-07-01 03:00:00    1.159000    0.697833    0.575000          175.129167  
2018-07-01 04:00:00    1.128167    0.758500    0.537667          183.175000  

---------------------------------------------------------

PROJECT UTILITIES AND CUSTOM FUNCTIONS¶

In [2]:
# =============================================================================
# PROJECT UTILITIES AND CUSTOM FUNCTIONS
# =============================================================================
#
# OBJECTIVE:
# To define reusable functions and set global configurations to ensure
# consistency and reproducibility throughout the entire workflow.
#
# =============================================================================


# --- Utility: Global Seeds for Reproducibility ---
#
# Setting the random seeds for TensorFlow and NumPy at the start makes the
# stochastic components (e.g., model weight initialization, dropout, tuner
# search patterns) reproducible across runs, up to any residual GPU-level
# non-determinism. This is important for the scientific validity of our results.
#
import tensorflow as tf
import numpy as np

tf.random.set_seed(42)
np.random.seed(42)

print("Global random seeds for TensorFlow and NumPy have been set.")


# --- Utility: Sequence Creation Function ---
#
# This function converts our flat dataframe into sequences suitable for
# time-series models like LSTMs and Transformers.
#
def create_sequences(features, target, time_steps):
    """Converts arrays of features and a target into sequences."""
    X, y = [], []
    for i in range(len(features) - time_steps):
        X.append(features[i:(i + time_steps)])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

print("Sequence creation utility function defined.")


# --- Utility: Custom Keras Layer for Positional Encoding ---
#
# This layer is essential for the Transformer model. Since Transformers do not
# have a recurrent structure, this layer explicitly injects information
# about the position of each "patch" in the sequence.
#
class PositionalEncoding(tf.keras.layers.Layer):
    """Injects positional information into the input embeddings."""
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Pre-calculates the positional encoding matrix
        pos_encoding = self.create_positional_encoding(max_len, d_model)
        self.pos_encoding = tf.cast(pos_encoding, dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * angle_rates

    def create_positional_encoding(self, position, d_model):
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis],
                                     np.arange(d_model)[np.newaxis, :],
                                     d_model)
        # Apply sin to even indices
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        # Apply cos to odd indices
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        pos_encoding = angle_rads[np.newaxis, ...]
        return pos_encoding

    def call(self, inputs):
        # Adds the positional encoding to the input tensor
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

print("PositionalEncoding custom layer defined.")
Global random seeds for TensorFlow and NumPy have been set.
Sequence creation utility function defined.
PositionalEncoding custom layer defined.

SECTION 2: PREPROCESSING AND FEATURE ENGINEERING¶

In [3]:
# =============================================================================
# SECTION 2: PREPROCESSING AND FEATURE ENGINEERING
# =============================================================================
#
# OBJECTIVE:
# To prepare the aggregated, building-wide dataset for a time-series model.
# This involves a chronological train-test split, robust outlier handling,
# comprehensive feature engineering to capture temporal patterns, and finally,
# data scaling and sequencing. This pipeline is intentionally designed to
# prevent data leakage.
#
# =============================================================================

# --- 2.1. Chronological Train-Test Split ---
#
# It is critical to split time-series data chronologically to simulate a
# real-world scenario where we train on the past to predict the future.
#
df_model_input = df_building[['Building_Total_kWh']].copy()

split_date = '2019-07-18'
train_df = df_model_input.loc[df_model_input.index < split_date].copy()
test_df = df_model_input.loc[df_model_input.index >= split_date].copy()

print("--- Data Splitting ---")
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")
print("------------------------")


# --- 2.2. Advanced Outlier Detection and Capping ---
#
# To prevent data leakage, the IsolationForest is fitted ONLY on the training
# data. The learned patterns of "normal" consumption are then used to
# identify outliers in both the training and test sets.
#
from sklearn.ensemble import IsolationForest

iso_forest = IsolationForest(contamination=0.01, random_state=42)
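# Note: contamination=0.01 asks IsolationForest to flag roughly 1% of the
# training hours as outliers when setting its decision threshold.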

# Fit on training data
train_df['outlier'] = iso_forest.fit_predict(train_df[['Building_Total_kWh']])

# Predict on test data
test_df['outlier'] = iso_forest.predict(test_df[['Building_Total_kWh']])

# Define the capping value based ONLY on the training data's distribution
capping_value = train_df['Building_Total_kWh'].quantile(0.995)

print(f"\n--- Outlier Handling ---")
print(f"Identified outliers will be capped at: {capping_value:.2f} kWh")

# Apply the capping to both sets
train_df['Building_Total_kWh'] = np.where(
    train_df['outlier'] == -1, capping_value, train_df['Building_Total_kWh']
)
test_df['Building_Total_kWh'] = np.where(
    test_df['outlier'] == -1, capping_value, test_df['Building_Total_kWh']
)

# Clean up by dropping the helper column
train_df.drop(columns=['outlier'], inplace=True)
test_df.drop(columns=['outlier'], inplace=True)

print("Outlier handling complete.")
print("--------------------------")


# --- 2.3. Feature Engineering ---
#
# We engineer a rich set of temporal features to provide the model with
# explicit context about the time of a given energy reading.
#
def create_temporal_features(df):
    """Creates a suite of time-series features from a datetime index."""
    df = df.copy() # Use .copy() to avoid SettingWithCopyWarning
    df['Hour'] = df.index.hour
    df['DayOfWeek'] = df.index.dayofweek
    df['Month'] = df.index.month
    df['WeekOfYear'] = df.index.isocalendar().week.astype(int)
    df['IsWeekend'] = (df.index.dayofweek >= 5).astype(int)
    return df

def encode_cyclical(df, col, max_val):
    """Encodes a cyclical feature into sin and cos components."""
    df = df.copy() # Use .copy() to avoid SettingWithCopyWarning
    df[col + '_sin'] = np.sin(2 * np.pi * df[col] / max_val)
    df[col + '_cos'] = np.cos(2 * np.pi * df[col] / max_val)
    return df
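
# Brief illustration (toy dataframe with hypothetical hour values) of why the
# cyclical encoding matters: hours 23 and 0 are adjacent in time, and their
# (sin, cos) coordinates are correspondingly close on the unit circle, unlike
# the raw integer values 23 and 0.
_demo_cyc = pd.DataFrame({'Hour': [0, 6, 12, 23]})
_demo_cyc = encode_cyclical(_demo_cyc, 'Hour', 24)
print("Cyclical encoding example:\n", _demo_cyc.round(3))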

# Define the list of holidays
holidays = pd.to_datetime([
    '2018-07-27', '2018-07-30', '2018-08-13', '2018-10-23', '2018-12-05',
    '2018-12-10', '2018-12-31', '2019-01-01', '2019-02-19', '2019-04-08',
    '2019-04-13', '2019-04-14', '2019-04-15', '2019-05-06', '2019-05-18',
    '2019-07-16', '2019-07-28', '2019-08-12', '2019-10-14', '2019-10-23',
    '2019-12-05', '2019-12-10', '2019-12-31'
])

# Apply every feature engineering step to BOTH dataframes so the train and
# test sets share an identical feature schema.

print("\n--- Feature Engineering ---")

# Apply to training set
train_df = create_temporal_features(train_df)
train_df['IsHoliday'] = train_df.index.normalize().isin(holidays).astype(int)
train_df = encode_cyclical(train_df, 'Hour', 24)
train_df = encode_cyclical(train_df, 'DayOfWeek', 7)
train_df = encode_cyclical(train_df, 'Month', 12)
train_df = encode_cyclical(train_df, 'WeekOfYear', 52)
print("Features created for the training set.")

# Apply the EXACT SAME steps to the test set
test_df = create_temporal_features(test_df)
test_df['IsHoliday'] = test_df.index.normalize().isin(holidays).astype(int)
test_df = encode_cyclical(test_df, 'Hour', 24)
test_df = encode_cyclical(test_df, 'DayOfWeek', 7)
test_df = encode_cyclical(test_df, 'Month', 12)
test_df = encode_cyclical(test_df, 'WeekOfYear', 52)
print("Features created for the test set.")


print("\nFinal training dataframe columns:", train_df.columns.tolist())
print("Final testing dataframe columns:", test_df.columns.tolist())
print("-----------------------------")


# --- 2.4. Data Scaling and Sequencing ---
#
# We scale all features and the target variable to a [0, 1] range.
# Crucially, the scalers are fitted ONLY on the training data.
# Then, we create the final input sequences for the Transformer model.
#
from sklearn.preprocessing import MinMaxScaler

feature_columns = [
    'Hour_sin', 'Hour_cos',
    'DayOfWeek_sin', 'DayOfWeek_cos',
    'Month_sin', 'Month_cos',
    'WeekOfYear_sin', 'WeekOfYear_cos',
    'IsWeekend', 'IsHoliday',
    'Building_Total_kWh'  # Past kWh is an autoregressive feature
]
target_column = 'Building_Total_kWh'

# Initialize separate scalers for features and target
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

# Fit and transform the training data
X_train_scaled = scaler_features.fit_transform(train_df[feature_columns])
y_train_scaled = scaler_target.fit_transform(train_df[[target_column]])

# Only transform the test data using the already-fitted scalers
X_test_scaled = scaler_features.transform(test_df[feature_columns])
y_test_scaled = scaler_target.transform(test_df[[target_column]])

print("\n--- Data Scaling & Sequencing ---")
print("Data scaled correctly without leakage.")

# Define the look-back window (context for the model)
TIME_STEPS = 24

# Create supervised learning sequences (this re-defines the create_sequences
# utility from the utilities cell above).
def create_sequences(features, target, time_steps):
    """Converts arrays of features and a target into sequences for time-series models."""
    X, y = [], []
    for i in range(len(features) - time_steps):
        X.append(features[i:(i + time_steps)])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

X_train, y_train = create_sequences(X_train_scaled, y_train_scaled.ravel(), TIME_STEPS)
X_test, y_test = create_sequences(X_test_scaled, y_test_scaled.ravel(), TIME_STEPS)

print(f"Created sequences with a look-back window of {TIME_STEPS} hours.")
print(f'Final X_train shape: {X_train.shape}')
print(f'Final y_train shape: {y_train.shape}')
print(f'Final X_test shape: {X_test.shape}')
print(f'Final y_test shape: {y_test.shape}')
print("---------------------------------")
--- Data Splitting ---
Training set shape: (9168, 1)
Testing set shape: (4008, 1)
------------------------

--- Outlier Handling ---
Identified outliers will be capped at: 760.63 kWh
Outlier handling complete.
--------------------------

--- Feature Engineering ---
Features created for the training set.
Features created for the test set.

Final training dataframe columns: ['Building_Total_kWh', 'Hour', 'DayOfWeek', 'Month', 'WeekOfYear', 'IsWeekend', 'IsHoliday', 'Hour_sin', 'Hour_cos', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Month_sin', 'Month_cos', 'WeekOfYear_sin', 'WeekOfYear_cos']
Final testing dataframe columns: ['Building_Total_kWh', 'Hour', 'DayOfWeek', 'Month', 'WeekOfYear', 'IsWeekend', 'IsHoliday', 'Hour_sin', 'Hour_cos', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Month_sin', 'Month_cos', 'WeekOfYear_sin', 'WeekOfYear_cos']
-----------------------------

--- Data Scaling & Sequencing ---
Data scaled correctly without leakage.
Created sequences with a look-back window of 24 hours.
Final X_train shape: (9144, 24, 11)
Final y_train shape: (9144,)
Final X_test shape: (3984, 24, 11)
Final y_test shape: (3984,)
---------------------------------

SECTION 3: DEFINING THE PROBABILISTIC HYPERMODEL¶

In [4]:
# =============================================================================
# SECTION 3: DEFINING THE PROBABILISTIC HYPERMODEL
# =============================================================================
#
# OBJECTIVE:
# To define the Quantile Regression Patch Transformer (QR-PatchTST) as a
# tunable HyperModel. This involves creating a custom loss function and then
# building a class that allows KerasTuner to systematically search for the
# optimal model architecture and learning rate.
#
# =============================================================================


# --- 3.1. Import Libraries and Define Probabilistic Parameters ---
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D, Add, Reshape
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt

# Define the target quantiles for our 80% prediction interval and the median
QUANTILES = [0.1, 0.5, 0.9]
NUM_QUANTILES = len(QUANTILES)
print(f"Model will be trained to predict {NUM_QUANTILES} quantiles: {QUANTILES}")


# --- 3.2. Implement the Custom Pinball Loss Function ---
# This is the objective function for quantile regression. It must be defined
# globally so the tuner can access it.
def pinball_loss(y_true, y_pred):
    """Calculates the pinball loss for quantile regression."""
    loss = 0.0
    for i, q in enumerate(QUANTILES):
        error = y_true - y_pred[:, i]
        loss += tf.reduce_mean(tf.maximum(q * error, (q - 1) * error), axis=-1)
    return loss

print("Custom pinball loss function defined successfully.")


# --- 3.3. Define the QR-PatchTST HyperModel ---
# We encapsulate the entire model-building logic within a KerasTuner HyperModel class.
# This allows us to define search spaces for key architectural parameters.
#
class QRPatchTSTHyperModel(kt.HyperModel):
    def __init__(self, input_shape, time_steps):
        self.input_shape = input_shape
        self.time_steps = time_steps

    def build(self, hp):
        """Builds a tunable QR-PatchTST model."""

        # --- Tunable Hyperparameters ---
        patch_len = hp.Choice('patch_len', values=[2, 4, 6])
        d_model = hp.Int('d_model', min_value=32, max_value=128, step=32)
        num_heads = hp.Choice('num_heads', values=[2, 4, 8])
        num_blocks = hp.Int('num_blocks', min_value=1, max_value=3, step=1)
        dropout_rate = hp.Float('dropout', min_value=0.1, max_value=0.3, step=0.1)
        learning_rate = hp.Choice('learning_rate', values=[1e-3, 5e-4])
        
        # Calculate number of patches based on the chosen patch_len
        num_patches = (self.time_steps - patch_len) // patch_len + 1
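        # e.g., with time_steps=24 and patch_len=4 this gives (24-4)//4 + 1 = 6
        # non-overlapping patches; the Reshape below assumes time_steps is an
        # exact multiple of patch_len (which holds for the choices 2, 4 and 6).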
        num_features = self.input_shape[1]

        # --- Model Architecture ---
        inputs = Input(shape=self.input_shape)
        
        # Patching and Embedding
        patches = Reshape((num_patches, patch_len * num_features))(inputs)
        x = Dense(d_model)(patches)
        
        # Positional Encoding (using the utility class)
        # We need to ensure the PositionalEncoding class is defined in a utility cell.
        try:
            x = PositionalEncoding(d_model, max_len=num_patches)(x)
        except NameError:
            raise RuntimeError("The 'PositionalEncoding' class must be defined in a preceding utility cell.")

        # Transformer Encoder Blocks
        for _ in range(num_blocks):
            attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(x, x)
            attn_output = Dropout(dropout_rate)(attn_output)
            x = Add()([x, attn_output])
            x = LayerNormalization(epsilon=1e-6)(x)
            
            ffn_output = Dense(d_model * 2, activation="relu")(x) # ff_dim is often 2x d_model
            ffn_output = Dense(d_model)(ffn_output)
            ffn_output = Dropout(dropout_rate)(ffn_output)
            x = Add()([x, ffn_output])
            x = LayerNormalization(epsilon=1e-6)(x)
            
        # Pooling and Final Head
        x = GlobalAveragePooling1D()(x)
        
        # Quantile Regression Head
        outputs = Dense(NUM_QUANTILES)(x)
        
        model = Model(inputs=inputs, outputs=outputs, name='QR_PatchTST_HyperModel')
        
        # Compile the model
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss=pinball_loss
        )
        
        return model

print("\nQR-PatchTST HyperModel class defined successfully.")
Model will be trained to predict 3 quantiles: [0.1, 0.5, 0.9]
Custom pinball loss function defined successfully.

QR-PatchTST HyperModel class defined successfully.

SECTION 4: HYPERPARAMETER TUNING AND FINAL MODEL TRAINING¶

In [ ]:
# # =============================================================================
# # SECTION 4: HYPERPARAMETER TUNING AND FINAL MODEL TRAINING
# # =============================================================================
# #
# # OBJECTIVE:
# # To systematically find the optimal set of hyperparameters for our QR-PatchTST
# # model using KerasTuner with a Bayesian Optimization strategy. After the search
# # is complete, we will build the best model and train it on the full training
# # dataset to create our final, optimized probabilistic forecasting model.
# #
# # =============================================================================

# # --- 4.1. Import Additional Libraries and Instantiate the Tuner ---
# #
# # We need the 'time' module to measure the duration of the tuning process.
# #
# import time
# import keras_tuner as kt

# # Instantiate our custom hypermodel
# # (Assumes the QRPatchTSTHyperModel class from Section 3 is in memory)
# input_shape = (X_train.shape[1], X_train.shape[2])
# hypermodel = QRPatchTSTHyperModel(input_shape=input_shape, time_steps=TIME_STEPS)

# # Configure the tuner
# tuner = kt.BayesianOptimization(
#     hypermodel,
#     objective='val_loss',
#     max_trials=15,  # The number of different hyperparameter combinations to test
#     executions_per_trial=1, # How many times to train each model for robustness
#     directory='conference_paper_tuning',
#     project_name='qr_patch_tst_tuning',
#     overwrite=True,
#     seed=42 # Ensures the search process is reproducible
# )

# # Display a summary of the search space
# tuner.search_space_summary()


# # --- 4.2. Run the Hyperparameter Search ---
# #
# # This is the most computationally intensive step. The tuner will build and
# # train 'max_trials' different versions of the model to find the one that
# # performs best on the validation data. We use an EarlyStopping callback
# # to prevent wasting time on unpromising trials.
# #
# print("\n--- Starting Hyperparameter Search for QR-PatchTST Model ---")
# start_time_tuning = time.time()

# tuner.search(
#     X_train, y_train,
#     epochs=50,  # A sufficient number of epochs for each trial
#     batch_size=64,
#     validation_split=0.2, # Use the last 20% of training data for validation during the search
#     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min')],
#     verbose=1
# )

# end_time_tuning = time.time()
# print(f"\nHyperparameter search completed in {(end_time_tuning - start_time_tuning)/60:.2f} minutes.")
# print("---------------------------------------------------------------")


# # --- 4.3. Retrieve the Best Model and Print Optimal Hyperparameters ---
# #
# # After the search, we can retrieve the best hyperparameters and build the
# # final model.
# #
# best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# print("\n--- Optimal Hyperparameters Found ---")
# print(f"Patch Length: {best_hps.get('patch_len')} hours")
# print(f"Model Dimension (d_model): {best_hps.get('d_model')}")
# print(f"Number of Attention Heads: {best_hps.get('num_heads')}")
# print(f"Number of Transformer Blocks: {best_hps.get('num_blocks')}")
# print(f"Dropout Rate: {best_hps.get('dropout'):.2f}")
# print(f"Learning Rate: {best_hps.get('learning_rate')}")
# print("-------------------------------------")

# # Build the final model with the optimal hyperparameters
# final_model = tuner.hypermodel.build(best_hps)


# # --- 4.4. Train the Final Model ---
# #
# # We now train this single, best-configured model on the entire training
# # dataset for a longer duration to achieve its best possible performance.
# # We will use the test set as our final, unseen validation set.
# #
# print("\n--- Training the Final, Optimized QR-PatchTST Model ---")
# start_time_final_train = time.time()

# history = final_model.fit(
#     X_train, y_train,
#     epochs=100, # Train for more epochs on the final model
#     batch_size=64,
#     validation_data=(X_test, y_test),
#     callbacks=[
#         tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='min', restore_best_weights=True),
#         tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
#     ],
#     verbose=1
# )

# end_time_final_train = time.time()
# print(f"\nFinal model training completed in {(end_time_final_train - start_time_final_train)/60:.2f} minutes.")
# print("------------------------------------------------------")
Trial 15 Complete [00h 03m 10s]
val_loss: 0.10098206251859665

Best val_loss So Far: 0.052302710711956024
Total elapsed time: 00h 55m 57s

Hyperparameter search completed in 55.94 minutes.
---------------------------------------------------------------

--- Optimal Hyperparameters Found ---
Patch Length: 2 hours
Model Dimension (d_model): 128
Number of Attention Heads: 8
Number of Transformer Blocks: 1
Dropout Rate: 0.20
Learning Rate: 0.0005
-------------------------------------

--- Training the Final, Optimized QR-PatchTST Model ---
Epoch 1/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 20s 102ms/step - loss: 0.2391 - val_loss: 0.1664 - learning_rate: 5.0000e-04
Epoch 2/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.1162 - val_loss: 0.1111 - learning_rate: 5.0000e-04
Epoch 3/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.1007 - val_loss: 0.0777 - learning_rate: 5.0000e-04
Epoch 4/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 12s 84ms/step - loss: 0.0858 - val_loss: 0.0740 - learning_rate: 5.0000e-04
Epoch 5/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 12s 81ms/step - loss: 0.0745 - val_loss: 0.0657 - learning_rate: 5.0000e-04
Epoch 6/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 12s 84ms/step - loss: 0.0691 - val_loss: 0.0662 - learning_rate: 5.0000e-04
Epoch 7/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 75ms/step - loss: 0.0623 - val_loss: 0.0753 - learning_rate: 5.0000e-04
Epoch 8/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 73ms/step - loss: 0.0623 - val_loss: 0.0720 - learning_rate: 5.0000e-04
Epoch 9/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 76ms/step - loss: 0.0571 - val_loss: 0.0924 - learning_rate: 5.0000e-04
Epoch 10/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 79ms/step - loss: 0.0555 - val_loss: 0.0786 - learning_rate: 5.0000e-04
Epoch 11/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 73ms/step - loss: 0.0468 - val_loss: 0.0423 - learning_rate: 1.0000e-04
Epoch 12/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 13s 92ms/step - loss: 0.0462 - val_loss: 0.0403 - learning_rate: 1.0000e-04
Epoch 13/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - loss: 0.0455 - val_loss: 0.0447 - learning_rate: 1.0000e-04
Epoch 14/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 79ms/step - loss: 0.0457 - val_loss: 0.0406 - learning_rate: 1.0000e-04
Epoch 15/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 69ms/step - loss: 0.0457 - val_loss: 0.0391 - learning_rate: 1.0000e-04
Epoch 16/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 67ms/step - loss: 0.0450 - val_loss: 0.0456 - learning_rate: 1.0000e-04
Epoch 17/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - loss: 0.0452 - val_loss: 0.0458 - learning_rate: 1.0000e-04
Epoch 18/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 73ms/step - loss: 0.0460 - val_loss: 0.0404 - learning_rate: 1.0000e-04
Epoch 19/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 70ms/step - loss: 0.0459 - val_loss: 0.0420 - learning_rate: 1.0000e-04
Epoch 20/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 69ms/step - loss: 0.0460 - val_loss: 0.0479 - learning_rate: 1.0000e-04
Epoch 21/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 68ms/step - loss: 0.0437 - val_loss: 0.0461 - learning_rate: 2.0000e-05
Epoch 22/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 66ms/step - loss: 0.0437 - val_loss: 0.0462 - learning_rate: 2.0000e-05
Epoch 23/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 68ms/step - loss: 0.0438 - val_loss: 0.0465 - learning_rate: 2.0000e-05
Epoch 24/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 66ms/step - loss: 0.0440 - val_loss: 0.0437 - learning_rate: 2.0000e-05
Epoch 25/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 67ms/step - loss: 0.0436 - val_loss: 0.0468 - learning_rate: 2.0000e-05

Final model training completed in 4.56 minutes.
------------------------------------------------------
In [5]:
# =============================================================================
# SECTION 4: FINAL MODEL DEFINITION AND TRAINING
# =============================================================================
#
# OBJECTIVE:
# To build and train the final, optimized QR-PatchTST model using the best
# hyperparameters discovered in the systematic search. This section allows for
# rapid re-runs of the workflow without repeating the time-intensive
# hyperparameter tuning process.
#
# =============================================================================


# --- 4.1. Import Libraries and Define Best Hyperparameters ---
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D, Add, Reshape
from tensorflow.keras.optimizers import Adam
import time

# These are the optimal hyperparameters found by the BayesianOptimization search.
# We are hardcoding them here for reproducibility and efficiency.
BEST_HPS = {
    'patch_len': 2,
    'd_model': 128,
    'num_heads': 8,
    'num_blocks': 1,
    'dropout': 0.2,
    'learning_rate': 0.0005
}

print("--- Using Pre-defined Optimal Hyperparameters ---")
for key, value in BEST_HPS.items():
    print(f"{key}: {value}")
print("---------------------------------------------")


# --- 4.2. Build the Final, Optimized Model ---
#
# We use the same model-building function from Section 3 but pass the
# hardcoded optimal hyperparameters directly.
#
input_shape = (X_train.shape[1], X_train.shape[2])

# Recalculate number of patches based on the optimal patch_len
num_patches = (TIME_STEPS - BEST_HPS['patch_len']) // BEST_HPS['patch_len'] + 1
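# With TIME_STEPS = 24 and patch_len = 2 this gives 12 patches of 2 * 11 = 22
# values each, matching the (None, 12, 22) Reshape output in the model summary
# printed below.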

# Instantiate the final model using the best hyperparameter configuration
# NOTE: Since we are no longer using the HyperModel class directly, we define a
# standalone build function (build_final_qr_patch_tst_model) below for clarity.
# It still relies on the PositionalEncoding utility class defined earlier.

def build_final_qr_patch_tst_model(input_shape, patch_len, num_patches, hps):
    """Builds the final Quantile Regression Patch Transformer model."""
    num_features = input_shape[1]
    inputs = Input(shape=input_shape)
    patches = Reshape((num_patches, patch_len * num_features))(inputs)
    x = Dense(hps['d_model'])(patches)
    x = PositionalEncoding(hps['d_model'], max_len=num_patches)(x)
    
    for _ in range(hps['num_blocks']):
        attn_output = MultiHeadAttention(num_heads=hps['num_heads'], key_dim=hps['d_model'])(x, x)
        attn_output = Dropout(hps['dropout'])(attn_output)
        x = Add()([x, attn_output])
        x = LayerNormalization(epsilon=1e-6)(x)
        ffn_output = Dense(hps['d_model'] * 2, activation="relu")(x)
        ffn_output = Dense(hps['d_model'])(ffn_output)
        ffn_output = Dropout(hps['dropout'])(ffn_output)
        x = Add()([x, ffn_output])
        x = LayerNormalization(epsilon=1e-6)(x)
        
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.2)(x)
    outputs = Dense(NUM_QUANTILES)(x)
    model = Model(inputs=inputs, outputs=outputs, name='Final_QR_PatchTST')
    return model

# Build the final model
final_model = build_final_qr_patch_tst_model(
    input_shape=input_shape,
    patch_len=BEST_HPS['patch_len'],
    num_patches=num_patches,
    hps=BEST_HPS
)

# Compile the model
final_model.compile(
    optimizer=Adam(learning_rate=BEST_HPS['learning_rate']),
    loss=pinball_loss
)

print("\n--- Final QR-PatchTST Model Summary ---")
final_model.summary()
print("---------------------------------------")


# --- 4.3. Train the Final Model ---
#
# We now train this single, best-configured model on the entire training
# dataset to achieve its best possible performance.
#
print("\n--- Training the Final, Optimized QR-PatchTST Model ---")
start_time_final_train = time.time()

history = final_model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='min', restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)
    ],
    verbose=1
)

end_time_final_train = time.time()
print(f"\nFinal model training completed in {(end_time_final_train - start_time_final_train)/60:.2f} minutes.")
print("------------------------------------------------------")
--- Using Pre-defined Optimal Hyperparameters ---
patch_len: 2
d_model: 128
num_heads: 8
num_blocks: 1
dropout: 0.2
learning_rate: 0.0005
---------------------------------------------
2025-08-04 14:03:42.419548: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-08-04 14:03:42.419620: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-08-04 14:03:42.419629: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-08-04 14:03:42.419823: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-08-04 14:03:42.419844: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
--- Final QR-PatchTST Model Summary ---
Model: "Final_QR_PatchTST"
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)        ┃ Output Shape      ┃    Param # ┃ Connected to      ┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ input_layer         │ (None, 24, 11)    │          0 │ -                 │
│ (InputLayer)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ reshape (Reshape)   │ (None, 12, 22)    │          0 │ input_layer[0][0] │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense (Dense)       │ (None, 12, 128)   │      2,944 │ reshape[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ positional_encoding │ (None, 12, 128)   │          0 │ dense[0][0]       │
│ (PositionalEncodin… │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ multi_head_attenti… │ (None, 12, 128)   │    527,488 │ positional_encod… │
│ (MultiHeadAttentio… │                   │            │ positional_encod… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_1 (Dropout) │ (None, 12, 128)   │          0 │ multi_head_atten… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ add (Add)           │ (None, 12, 128)   │          0 │ positional_encod… │
│                     │                   │            │ dropout_1[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ layer_normalization │ (None, 12, 128)   │        256 │ add[0][0]         │
│ (LayerNormalizatio… │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_1 (Dense)     │ (None, 12, 256)   │     33,024 │ layer_normalizat… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_2 (Dense)     │ (None, 12, 128)   │     32,896 │ dense_1[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_2 (Dropout) │ (None, 12, 128)   │          0 │ dense_2[0][0]     │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ add_1 (Add)         │ (None, 12, 128)   │          0 │ layer_normalizat… │
│                     │                   │            │ dropout_2[0][0]   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ layer_normalizatio… │ (None, 12, 128)   │        256 │ add_1[0][0]       │
│ (LayerNormalizatio… │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ global_average_poo… │ (None, 128)       │          0 │ layer_normalizat… │
│ (GlobalAveragePool… │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dropout_3 (Dropout) │ (None, 128)       │          0 │ global_average_p… │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ dense_3 (Dense)     │ (None, 3)         │        387 │ dropout_3[0][0]   │
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
 Total params: 597,251 (2.28 MB)
 Trainable params: 597,251 (2.28 MB)
 Non-trainable params: 0 (0.00 B)
---------------------------------------

--- Training the Final, Optimized QR-PatchTST Model ---
Epoch 1/100
2025-08-04 14:03:44.257522: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
143/143 ━━━━━━━━━━━━━━━━━━━━ 16s 76ms/step - loss: 0.3712 - val_loss: 0.1316 - learning_rate: 5.0000e-04
Epoch 2/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 13s 90ms/step - loss: 0.1412 - val_loss: 0.0963 - learning_rate: 5.0000e-04
Epoch 3/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - loss: 0.1080 - val_loss: 0.0769 - learning_rate: 5.0000e-04
Epoch 4/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 73ms/step - loss: 0.0900 - val_loss: 0.1279 - learning_rate: 5.0000e-04
Epoch 5/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 69ms/step - loss: 0.0806 - val_loss: 0.0794 - learning_rate: 5.0000e-04
Epoch 6/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 61ms/step - loss: 0.0702 - val_loss: 0.0711 - learning_rate: 5.0000e-04
Epoch 7/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 61ms/step - loss: 0.0656 - val_loss: 0.0937 - learning_rate: 5.0000e-04
Epoch 8/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 63ms/step - loss: 0.0600 - val_loss: 0.0662 - learning_rate: 5.0000e-04
Epoch 9/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 61ms/step - loss: 0.0594 - val_loss: 0.0509 - learning_rate: 5.0000e-04
Epoch 10/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 61ms/step - loss: 0.0550 - val_loss: 0.0615 - learning_rate: 5.0000e-04
Epoch 11/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 62ms/step - loss: 0.0533 - val_loss: 0.0492 - learning_rate: 5.0000e-04
Epoch 12/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 62ms/step - loss: 0.0541 - val_loss: 0.0440 - learning_rate: 5.0000e-04
Epoch 13/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 64ms/step - loss: 0.0498 - val_loss: 0.0436 - learning_rate: 5.0000e-04
Epoch 14/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 70ms/step - loss: 0.0503 - val_loss: 0.0540 - learning_rate: 5.0000e-04
Epoch 15/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 69ms/step - loss: 0.0489 - val_loss: 0.0454 - learning_rate: 5.0000e-04
Epoch 16/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 15s 103ms/step - loss: 0.0455 - val_loss: 0.0459 - learning_rate: 5.0000e-04
Epoch 17/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 16s 112ms/step - loss: 0.0450 - val_loss: 0.0536 - learning_rate: 5.0000e-04
Epoch 18/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 13s 93ms/step - loss: 0.0447 - val_loss: 0.0400 - learning_rate: 5.0000e-04
Epoch 19/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 13s 88ms/step - loss: 0.0431 - val_loss: 0.0512 - learning_rate: 5.0000e-04
Epoch 20/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - loss: 0.0444 - val_loss: 0.0439 - learning_rate: 5.0000e-04
Epoch 21/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 17s 119ms/step - loss: 0.0430 - val_loss: 0.0419 - learning_rate: 5.0000e-04
Epoch 22/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 14s 98ms/step - loss: 0.0461 - val_loss: 0.0420 - learning_rate: 5.0000e-04
Epoch 23/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 73ms/step - loss: 0.0468 - val_loss: 0.0420 - learning_rate: 5.0000e-04
Epoch 24/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 78ms/step - loss: 0.0411 - val_loss: 0.0399 - learning_rate: 1.0000e-04
Epoch 25/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 77ms/step - loss: 0.0399 - val_loss: 0.0412 - learning_rate: 1.0000e-04
Epoch 26/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 72ms/step - loss: 0.0389 - val_loss: 0.0400 - learning_rate: 1.0000e-04
Epoch 27/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 78ms/step - loss: 0.0391 - val_loss: 0.0390 - learning_rate: 1.0000e-04
Epoch 28/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.0386 - val_loss: 0.0380 - learning_rate: 1.0000e-04
Epoch 29/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 70ms/step - loss: 0.0385 - val_loss: 0.0380 - learning_rate: 1.0000e-04
Epoch 30/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.0385 - val_loss: 0.0385 - learning_rate: 1.0000e-04
Epoch 31/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - loss: 0.0382 - val_loss: 0.0388 - learning_rate: 1.0000e-04
Epoch 32/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.0380 - val_loss: 0.0371 - learning_rate: 1.0000e-04
Epoch 33/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 78ms/step - loss: 0.0381 - val_loss: 0.0400 - learning_rate: 1.0000e-04
Epoch 34/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 69ms/step - loss: 0.0380 - val_loss: 0.0390 - learning_rate: 1.0000e-04
Epoch 35/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 63ms/step - loss: 0.0380 - val_loss: 0.0398 - learning_rate: 1.0000e-04
Epoch 36/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 62ms/step - loss: 0.0379 - val_loss: 0.0373 - learning_rate: 1.0000e-04
Epoch 37/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 63ms/step - loss: 0.0377 - val_loss: 0.0398 - learning_rate: 1.0000e-04
Epoch 38/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 62ms/step - loss: 0.0363 - val_loss: 0.0368 - learning_rate: 2.0000e-05
Epoch 39/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 63ms/step - loss: 0.0359 - val_loss: 0.0369 - learning_rate: 2.0000e-05
Epoch 40/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 9s 65ms/step - loss: 0.0357 - val_loss: 0.0363 - learning_rate: 2.0000e-05
Epoch 41/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 79ms/step - loss: 0.0358 - val_loss: 0.0369 - learning_rate: 2.0000e-05
Epoch 42/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.0361 - val_loss: 0.0376 - learning_rate: 2.0000e-05
Epoch 43/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 79ms/step - loss: 0.0356 - val_loss: 0.0360 - learning_rate: 2.0000e-05
Epoch 44/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 69ms/step - loss: 0.0355 - val_loss: 0.0366 - learning_rate: 2.0000e-05
Epoch 45/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 74ms/step - loss: 0.0355 - val_loss: 0.0372 - learning_rate: 2.0000e-05
Epoch 46/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 70ms/step - loss: 0.0357 - val_loss: 0.0369 - learning_rate: 2.0000e-05
Epoch 47/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 78ms/step - loss: 0.0358 - val_loss: 0.0367 - learning_rate: 2.0000e-05
Epoch 48/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 68ms/step - loss: 0.0359 - val_loss: 0.0370 - learning_rate: 2.0000e-05
Epoch 49/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 72ms/step - loss: 0.0355 - val_loss: 0.0371 - learning_rate: 4.0000e-06
Epoch 50/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 76ms/step - loss: 0.0353 - val_loss: 0.0371 - learning_rate: 4.0000e-06
Epoch 51/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 72ms/step - loss: 0.0350 - val_loss: 0.0373 - learning_rate: 4.0000e-06
Epoch 52/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 11s 76ms/step - loss: 0.0353 - val_loss: 0.0371 - learning_rate: 4.0000e-06
Epoch 53/100
143/143 ━━━━━━━━━━━━━━━━━━━━ 10s 71ms/step - loss: 0.0352 - val_loss: 0.0371 - learning_rate: 4.0000e-06

Final model training completed in 9.43 minutes.
------------------------------------------------------

SECTION 5: EVALUATION AND VISUALIZATION OF UNCERTAINTY¶

In [7]:
# =============================================================================
# SECTION 5: EVALUATION AND VISUALIZATION OF UNCERTAINTY
# =============================================================================
#
# OBJECTIVE:
# To use our final, trained QR-PatchTST model to generate probabilistic
# forecasts on the unseen test data. We will then visualize these forecasts
# by plotting the prediction intervals and evaluate their quality using
# specialized probabilistic metrics.
#
# =============================================================================


# --- 5.1. Import Evaluation Libraries ---
#
# Import the point-forecast error metrics (RMSE and MAE) from sklearn.metrics.
#
from sklearn.metrics import mean_squared_error, mean_absolute_error


# --- 5.2. Generate Quantile Predictions on the Test Set ---
#
# We use our trained 'final_model' to make predictions. Since the model was
# designed with three output neurons, it will directly output the three
# predicted quantiles in a single forward pass.
#
print("--- Generating Probabilistic Forecasts on Test Data ---")

# Predict the scaled quantiles
y_pred_scaled_quantiles = final_model.predict(X_test)

# Inverse transform the predictions to the original kWh scale
y_pred_quantiles = scaler_target.inverse_transform(y_pred_scaled_quantiles)

# Also, inverse transform the actual test values for comparison
y_actual = scaler_target.inverse_transform(y_test.reshape(-1, 1))

print("Predictions generated and inverse-scaled successfully.")
print(f"Shape of predicted quantiles array: {y_pred_quantiles.shape}")
print("-------------------------------------------------------")


# --- 5.3. Organize Results into a DataFrame ---
#
# For easier analysis and plotting, we'll place our results into a pandas DataFrame.
#
results_df = pd.DataFrame({
    'Actual_kWh': y_actual.flatten(),
    'Pred_Q10': y_pred_quantiles[:, 0], # Lower bound (10th percentile)
    'Pred_Q50': y_pred_quantiles[:, 1], # Median (50th percentile)
    'Pred_Q90': y_pred_quantiles[:, 2]  # Upper bound (90th percentile)
})

print("\n--- Results DataFrame ---")
print(results_df.head())
print("-------------------------")


# --- 5.4. Visualize the Probabilistic Forecast ---
#
# This is the primary visualization for the paper. It shows not just the point
# forecast (the median) but also the uncertainty range (the prediction interval).
# We will plot a smaller slice of the test set for clarity.
#
plot_slice = 720 # Plot approximately one month of data

plt.figure(figsize=(20, 10))
plt.plot(results_df.index[:plot_slice], results_df['Actual_kWh'][:plot_slice], label='Actual kWh', color='black', linewidth=1.5)
plt.plot(results_df.index[:plot_slice], results_df['Pred_Q50'][:plot_slice], label='Median Prediction (Q50)', color='red', linestyle='--', linewidth=1.5)

# Add the shaded prediction interval
plt.fill_between(
    results_df.index[:plot_slice],
    results_df['Pred_Q10'][:plot_slice],
    results_df['Pred_Q90'][:plot_slice],
    color='red',
    alpha=0.2,
    label='80% Prediction Interval (Q10-Q90)'
)

plt.title('Probabilistic Forecast of Building Energy Consumption', fontsize=18)
plt.xlabel('Time Step (Hour) in Test Set', fontsize=14)
plt.ylabel('Total Building Energy (kWh)', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.show()


# --- 5.5. Evaluate Probabilistic Metrics ---
#
# We now calculate the key metrics to quantitatively assess the quality of
# our probabilistic forecast: PICP and MPIW.
#
# 1. Prediction Interval Coverage Probability (PICP)
#    What percentage of actual values fall within the prediction interval?
#    For an 80% interval, the ideal PICP is 80%.
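#    e.g., if 3,200 of 4,000 test hours fell inside the Q10-Q90 band,
#    the PICP would be 80%.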
is_covered = (results_df['Actual_kWh'] >= results_df['Pred_Q10']) & (results_df['Actual_kWh'] <= results_df['Pred_Q90'])
picp = is_covered.mean() * 100

# 2. Mean Prediction Interval Width (MPIW)
#    What is the average width of the prediction intervals? A narrower
#    interval (for a given coverage) is better.
mpiw = (results_df['Pred_Q90'] - results_df['Pred_Q10']).mean()

# 3. Standard metrics for the median (point) forecast for reference
rmse = np.sqrt(mean_squared_error(results_df['Actual_kWh'], results_df['Pred_Q50']))
mae = mean_absolute_error(results_df['Actual_kWh'], results_df['Pred_Q50'])

# --- 5.6. Display Final Performance Metrics in a Table ---
performance_metrics = {
    'Metric': [
        'Point Forecast RMSE (kWh)',
        'Point Forecast MAE (kWh)',
        'Prediction Interval Coverage Probability (PICP %)',
        'Mean Prediction Interval Width (MPIW kWh)'
    ],
    'Value': [
        rmse,
        mae,
        picp,
        mpiw
    ]
}
performance_df = pd.DataFrame(performance_metrics)

print("\n--- Final Probabilistic Model Performance ---")
print(performance_df.round(4))
print("---------------------------------------------")
--- Generating Probabilistic Forecasts on Test Data ---
125/125 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
Predictions generated and inverse-scaled successfully.
Shape of predicted quantiles array: (3984, 3)
-------------------------------------------------------

--- Results DataFrame ---
   Actual_kWh    Pred_Q10    Pred_Q50    Pred_Q90
0  147.802167  122.166359  146.660095  168.431580
1  144.898500  109.010002  130.758270  150.440247
2  145.300667  111.531509  133.085175  152.397842
3  145.206667  116.998795  137.830627  156.548019
4  139.561833  112.017097  134.456635  155.676468
-------------------------
[Figure: Probabilistic Forecast of Building Energy Consumption - actual kWh, median prediction (Q50), and the 80% prediction interval (Q10-Q90) over the first month of the test set]
--- Final Probabilistic Model Performance ---
                                              Metric    Value
0                          Point Forecast RMSE (kWh)  55.1785
1                           Point Forecast MAE (kWh)  24.0603
2  Prediction Interval Coverage Probability (PICP %)  83.2580
3          Mean Prediction Interval Width (MPIW kWh)  70.2831
---------------------------------------------

SECTION 6: APPLICATION TO RISK-AWARE SUPERVISORY CONTROL¶

In [8]:
# =============================================================================
# SECTION 6: APPLICATION TO RISK-AWARE SUPERVISORY CONTROL
# =============================================================================
#
# OBJECTIVE:
# To demonstrate the practical, real-world value of our probabilistic
# forecasts. We will simulate a simple, risk-aware control strategy for
# managing peak load and compare its behavior to a standard controller that
# relies only on a single-point forecast.
#
# =============================================================================


# --- 6.1. Define the Control Scenario and Threshold ---
#
# Let's assume a scenario where the building operator wants to curtail load
# (e.g., by adjusting HVAC setpoints) whenever the hourly consumption is
# predicted to exceed a certain peak demand or high-cost tariff threshold.
#
PEAK_THRESHOLD_KWH = 650.0

print(f"--- Risk-Aware Control Simulation ---")
print(f"Operational Peak Threshold set to: {PEAK_THRESHOLD_KWH} kWh")
print("-------------------------------------")


# --- 6.2. Simulate and Compare Controller Actions ---
#
# We will compare two types of automated controllers based on our model's output.
#
# Controller 1: Standard (Median-Based)
# This controller acts only based on the most likely outcome (the median forecast).
results_df['Standard_Alert'] = results_df['Pred_Q50'] > PEAK_THRESHOLD_KWH

# Controller 2: Risk-Aware (Upper-Bound-Based)
# This controller is more cautious. It acts if there is a plausible risk (at least
# a 10% chance) that the load will exceed the threshold.
results_df['Risk_Aware_Alert'] = results_df['Pred_Q90'] > PEAK_THRESHOLD_KWH
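# Illustrative example (numbers consistent with the critical events listed in
# the output below): at an hour with Pred_Q50 of about 634 kWh and Pred_Q90 of
# about 698 kWh, the standard controller stays idle (634 < 650) while the
# risk-aware controller raises a curtailment alert (698 > 650).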

# Calculate the number of alerts triggered by each controller
standard_alerts_count = results_df['Standard_Alert'].sum()
risk_aware_alerts_count = results_df['Risk_Aware_Alert'].sum()

print(f"\nSimulation Results:")
print(f"Standard Controller triggered {standard_alerts_count} curtailment alerts.")
print(f"Risk-Aware Controller triggered {risk_aware_alerts_count} curtailment alerts.")


# --- 6.3. Identify and Analyze Critical Disagreements ---
#
# The most interesting cases are when the Risk-Aware controller triggers an alert,
# but the Standard one does not. These represent events where uncertainty was high,
# and a standard forecast would have missed the potential peak.
#
critical_events = results_df[
    (results_df['Risk_Aware_Alert'] == True) & (results_df['Standard_Alert'] == False)
]

num_critical_events = len(critical_events)
print(f"\nFound {num_critical_events} critical events where the Risk-Aware controller")
print("acted, but the Standard controller did not.")

# Display the first few identified critical events for inspection
if not critical_events.empty:
    print("\nExamples of critical events (where standard controller failed to act):")
    print(critical_events[['Actual_kWh', 'Pred_Q50', 'Pred_Q90']].head().round(2))
else:
    print("No critical disagreement events found in the test set.")
print("--------------------------------------------------------------------")


# --- 6.4. Visualize a Case Study of a Critical Event ---
#
# We will create a focused plot to visually demonstrate the value of the
# risk-aware approach for the paper.
#
if not critical_events.empty:
    # Get the index of the first critical event for a clear plot
    event_index = critical_events.index[0]
    
    # Define a plotting window around the event for context
    plot_window_start = max(0, event_index - 12)
    plot_window_end = min(len(results_df), event_index + 12)
    
    plt.figure(figsize=(16, 8))
    
    # Plot the series data
    plt.plot(
        results_df.index[plot_window_start:plot_window_end],
        results_df['Actual_kWh'][plot_window_start:plot_window_end],
        label='Actual kWh', color='black', marker='o', linestyle='-'
    )
    plt.plot(
        results_df.index[plot_window_start:plot_window_end],
        results_df['Pred_Q50'][plot_window_start:plot_window_end],
        label='Median Prediction (Q50)', color='red', linestyle='--'
    )
    plt.fill_between(
        results_df.index[plot_window_start:plot_window_end],
        results_df['Pred_Q10'][plot_window_start:plot_window_end],
        results_df['Pred_Q90'][plot_window_start:plot_window_end],
        color='red', alpha=0.2, label='80% Prediction Interval'
    )
    
    # Highlight the operational threshold
    plt.axhline(y=PEAK_THRESHOLD_KWH, color='darkgreen', linestyle=':', linewidth=2, label=f'Peak Threshold ({PEAK_THRESHOLD_KWH} kWh)')
    
    # Highlight the specific critical event point
    plt.scatter(event_index, results_df.loc[event_index, 'Actual_kWh'], color='magenta', s=200, zorder=10, edgecolor='black', label=f'Critical Event (Actual: {results_df.loc[event_index, "Actual_kWh"]:.2f})')
    
    # Add an annotation to explain why this event is critical
    plt.annotate(
        'Risk-Aware Alert Triggered!\nMedian forecast is below threshold,\nbut the upper bound (risk) is above.',
        xy=(event_index, results_df.loc[event_index, 'Pred_Q90']),
        xytext=(event_index - 10, results_df.loc[event_index, 'Pred_Q90'] - 200),
        arrowprops=dict(facecolor='magenta', shrink=0.05, width=2),
        fontsize=12,
        bbox=dict(boxstyle="round,pad=0.5", fc="yellow", ec="black", lw=1, alpha=0.9)
    )
    
    plt.title('Case Study: Risk-Aware Control vs. Standard Control', fontsize=18)
    plt.xlabel('Time Step (Hour) in Test Set', fontsize=14)
    plt.ylabel('Total Building Energy (kWh)', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()
--- Risk-Aware Control Simulation ---
Operational Peak Threshold set to: 650.0 kWh
-------------------------------------

Simulation Results:
Standard Controller triggered 76 curtailment alerts.
Risk-Aware Controller triggered 304 curtailment alerts.

Found 228 critical events where the Risk-Aware controller
acted, but the Standard controller did not.

Examples of critical events (where standard controller failed to act):
    Actual_kWh    Pred_Q50    Pred_Q90
9       691.49  633.830017  697.760010
10      669.13  643.559998  706.239990
14      661.47  640.500000  706.469971
83      614.12  600.530029  687.679993
88      584.68  587.750000  659.219971
--------------------------------------------------------------------
[Figure: Case Study: Risk-Aware Control vs. Standard Control]

SECTION 7: ADVANCED VISUALIZATION¶

In [9]:
# =============================================================================
# SECTION 7: ADVANCED VISUALIZATION
# =============================================================================

# --- 7.1. Plotting Forecast Uncertainty as a Time Series ---
#
# Calculate the width of the 80% prediction interval for each time step.
# This represents the model's uncertainty at that point in time.
results_df['Interval_Width'] = results_df['Pred_Q90'] - results_df['Pred_Q10']

plt.figure(figsize=(18, 6))
results_df['Interval_Width'].plot(
    color='purple',
    label='80% Prediction Interval Width (Uncertainty)'
)

plt.title('Model Forecast Uncertainty (Interval Width) Over Time', fontsize=16)
plt.xlabel('Time Step (Hour) in Test Set', fontsize=12)
plt.ylabel('Uncertainty (kWh)', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

# Display summary statistics of the uncertainty
print("\n--- Summary Statistics for Forecast Uncertainty ---")
print(results_df['Interval_Width'].describe())
print("---------------------------------------------")
[Figure: Model Forecast Uncertainty (Interval Width) Over Time]
--- Summary Statistics for Forecast Uncertainty ---
count    3984.000000
mean       70.283127
std        41.855949
min        22.470947
25%        39.051180
50%        56.601711
75%        91.550957
max       354.965881
Name: Interval_Width, dtype: float64
---------------------------------------------
In [10]:
# --- 7.2. Analyzing Uncertainty by Hour of Day ---
#
# To understand the daily patterns in forecast uncertainty, we can create a
# boxplot of the interval width for each hour of the day.
#
# First, we reconstruct hourly timestamps for the test set to recover the
# hour of day. This assumes the test predictions are contiguous hourly steps
# beginning one hour after the last training timestamp.
test_start_index = train_df.index[-1] + pd.Timedelta(hours=1)
test_indices = pd.date_range(start=test_start_index, periods=len(results_df), freq='h')
results_df['Hour'] = test_indices.hour

plt.figure(figsize=(14, 7))
sns.boxplot(
    data=results_df,
    x='Hour',
    y='Interval_Width',
    color='skyblue'
)

plt.title('Forecast Uncertainty (Interval Width) by Hour of Day', fontsize=16)
plt.xlabel('Hour of Day', fontsize=12)
plt.ylabel('Prediction Interval Width (kWh)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
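
# --- Aside: tabular summary of uncertainty by hour ---
# A small complement to the boxplot (illustrative only): the mean and 90th
# percentile of the interval width for each hour of day.
hourly_uncertainty = results_df.groupby('Hour')['Interval_Width'].agg(
    Mean_Width_kWh='mean',
    P90_Width_kWh=lambda s: s.quantile(0.90)
)
print(hourly_uncertainty.round(1))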
[Figure: Forecast Uncertainty (Interval Width) by Hour of Day]

SECTION 8: VALIDATION OF CONTROL STRATEGIES (CONTINGENCY ANALYSIS)¶

In [11]:
# =============================================================================
# SECTION 8: VALIDATION OF CONTROL STRATEGIES (CONTINGENCY ANALYSIS)
# =============================================================================
#
# OBJECTIVE:
# To rigorously validate the performance of the Standard vs. Risk-Aware
# control strategies. This involves comparing their alert triggers against the
# ground truth (actual peak exceedances) to quantify their effectiveness
# in terms of true positives, false positives, and critical misses.
#
# =============================================================================


# --- 8.1. Import Additional Libraries ---
#
# We need the confusion matrix from sklearn.metrics for the contingency
# analysis (numpy, pandas, and seaborn were already imported in Section 1).
#
from sklearn.metrics import confusion_matrix


# --- 8.2. Define the "Ground Truth" for Peak Events ---
#
# The ground truth is whether the actual energy consumption for a given hour
# truly exceeded our operational threshold.
#
results_df['Actual_Exceedance'] = results_df['Actual_kWh'] > PEAK_THRESHOLD_KWH

# --- 8.3. Calculate Contingency Metrics for Both Controllers ---
#
# We will now analyze the performance of each controller's alerts against
# the ground truth. A confusion matrix gives us:
#   - True Negatives (TN): Correctly did not alert.
#   - False Positives (FP): Incorrectly alerted (False Alarm).
#   - False Negatives (FN): Incorrectly did not alert (Critical Miss).
#   - True Positives (TP): Correctly alerted.
#

# --- For the Standard (Median-Based) Controller ---
tn_std, fp_std, fn_std, tp_std = confusion_matrix(
    results_df['Actual_Exceedance'],
    results_df['Standard_Alert']
).ravel()

# --- For the Risk-Aware (Upper-Bound-Based) Controller ---
tn_risk, fp_risk, fn_risk, tp_risk = confusion_matrix(
    results_df['Actual_Exceedance'],
    results_df['Risk_Aware_Alert']
).ravel()
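
# --- Aside: derived rates from the contingency counts ---
# A hedged sketch computing the two rates that summarize the trade-off:
# recall (share of true peak hours that triggered an alert) and false-alarm
# rate (share of non-peak hours that triggered an alert). The helper and
# variable names are ours, for illustration only.
def alert_rates(tp, fp, fn, tn):
    recall = tp / (tp + fn) if (tp + fn) else float('nan')
    false_alarm_rate = fp / (fp + tn) if (fp + tn) else float('nan')
    return recall, false_alarm_rate

rec_std, far_std = alert_rates(tp_std, fp_std, fn_std, tn_std)
rec_risk, far_risk = alert_rates(tp_risk, fp_risk, fn_risk, tn_risk)
print(f"Standard:   recall = {rec_std:.1%}, false-alarm rate = {far_std:.1%}")
print(f"Risk-Aware: recall = {rec_risk:.1%}, false-alarm rate = {far_risk:.1%}")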


# --- 8.4. Create the Summary Performance Table ---
#
# This table will be a key result in the paper's discussion, directly
# comparing the outcomes of the two strategies.
#
contingency_data = {
    'Controller Type': ['Standard (Median-Based)', 'Risk-Aware (Upper Bound)'],
    'True Positives (Correct Alerts)': [tp_std, tp_risk],
    'False Positives (False Alarms)': [fp_std, fp_risk],
    'False Negatives (Critical Misses)': [fn_std, fn_risk],
    'Total Alerts Triggered': [tp_std + fp_std, tp_risk + fp_risk]
}

contingency_df = pd.DataFrame(contingency_data).set_index('Controller Type')

print("--- Contingency Analysis of Control Alert Performance ---")
print(contingency_df)
print("-------------------------------------------------------")


# --- 8.5. Visualize Normalized Confusion Matrices ---
#
# A normalized confusion matrix makes the trade-off between the controllers
# easy to see: it shows, for each actual class (Peak vs. No Peak), the
# percentage of hours classified correctly and incorrectly.
#
def plot_normalized_confusion_matrix(y_true, y_pred, title):
    """Generates and plots a normalized confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    
    # Normalize each row so the percentages sum to 100% for each actual class
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    plt.figure(figsize=(7, 6))
    sns.heatmap(
        cm_normalized,
        annot=True,
        fmt='.2%', # Format as percentage with two decimals
        cmap='Blues',
        xticklabels=['No Peak', 'Peak'],
        yticklabels=['No Peak', 'Peak']
    )
    plt.title(title, fontsize=16)
    plt.ylabel('Actual Condition', fontsize=12)
    plt.xlabel('Predicted Condition', fontsize=12)
    plt.show()

# --- Generate the plots for the paper ---

# Plot for the Standard Controller
plot_normalized_confusion_matrix(
    results_df['Actual_Exceedance'],
    results_df['Standard_Alert'],
    title='Normalized Confusion Matrix - Standard Controller'
)

# Plot for the Risk-Aware Controller
plot_normalized_confusion_matrix(
    results_df['Actual_Exceedance'],
    results_df['Risk_Aware_Alert'],
    title='Normalized Confusion Matrix - Risk-Aware Controller'
)
--- Contingency Analysis of Control Alert Performance ---
Controller Type           True Positives    False Positives   False Negatives     Total Alerts
                          (Correct Alerts)  (False Alarms)    (Critical Misses)   Triggered
Standard (Median-Based)                 44                32                 60             76
Risk-Aware (Upper Bound)                88               216                 16            304
-------------------------------------------------------
[Figure: Normalized Confusion Matrix - Standard Controller]
[Figure: Normalized Confusion Matrix - Risk-Aware Controller]