# Core data manipulation libraries
import numpy as np
import pandas as pd
import os
import time
import random
import warnings
import h5py

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report

# Deep learning libraries
import tensorflow as tf
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Conv2D, LeakyReLU, MaxPooling2D, Flatten
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical, plot_model

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print(tf.__version__)

2.19.0

def save_model(model, model_name, save_format='keras'):
    """
    Save a trained model to disk with proper versioning and metadata.
    
    Parameters:
    -----------
    model : Keras Model
        The trained model to save
    model_name : str
        Base name for the saved model
    save_format : str, default='keras'
        Format to save the model ('keras' or 'h5')
    
    Returns:
    --------
    str : Path to the saved model
    """
    from datetime import datetime
    
    # Create models directory if it doesn't exist
    os.makedirs('models', exist_ok=True)
    
    # Generate timestamp for versioning
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Create filename with timestamp for versioning
    filename = f"models/{model_name}_{timestamp}"
    
    if save_format == 'h5':
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            filepath = f"{filename}.h5"
            model.save(filepath)
        print(f"Model saved to {filepath} (legacy HDF5 format)")
    else:
        # Recommended Keras format
        filepath = f"{filename}.keras"
        model.save(filepath)
        print(f"Model saved to {filepath} (recommended Keras format)")
    
    # Save model architecture visualization
    try:
        from tensorflow.keras.utils import plot_model
        diagram_path = f"{filename}_architecture.png"
        plot_model(model, to_file=diagram_path, show_shapes=True, show_dtype=True)
        print(f"Model architecture diagram saved to {diagram_path}")
    except Exception as e:
        print(f"Could not save model diagram: {e}")
    
    return filepath

def load_model(model_path):
    """
    Load a trained model from disk with proper error handling.
    
    Parameters:
    -----------
    model_path : str
        Path to the saved model
        
    Returns:
    --------
    model : Keras Model
        The loaded model
    """
    from tensorflow.keras.models import load_model as keras_load_model
    
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"No model found at {model_path}")
    
    try:
        model = keras_load_model(model_path)
        print(f"Model successfully loaded from {model_path}")
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Define helper function for directory size calculation
def get_dir_size(path):
    """Calculate total size of a directory including subdirectories"""
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

def create_styled_plot(title, xlabel, ylabel, figsize=(10, 6)):
    """Create a consistently styled matplotlib figure with improved aesthetics"""
    plt.figure(figsize=figsize)
    
    # Set professional font styles
    plt.rcParams['font.family'] = 'sans-serif'
    plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans']
    
    # Add subtle grid for readability
    plt.grid(alpha=0.3, linestyle='--')
    
    # Set title and labels with enhanced formatting
    plt.title(title, fontsize=16, fontweight='bold', pad=20)
    plt.xlabel(xlabel, fontsize=13, labelpad=10)
    plt.ylabel(ylabel, fontsize=13, labelpad=10)
    
    # Improve tick label visibility
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    
    return plt

def display_experiment_results(experiment_name, parameter_name, parameter_values, metrics):
    """Display experiment results as a styled pandas DataFrame"""
    
    # Create DataFrame from results
    results = pd.DataFrame({parameter_name: parameter_values})
    
    # Add all metrics
    for metric_name, values in metrics.items():
        results[metric_name] = values
    
    # Style the table
    styled_results = results.style.set_caption(f"<h3>{experiment_name}</h3>") \
                           .set_table_styles([
                               {'selector': 'caption', 'props': [('text-align', 'center')]},
                               {'selector': 'th', 'props': [('font-size', '12pt'), 
                                                         ('text-align', 'center'),
                                                         ('background-color', '#f0f0f0')]},
                               {'selector': 'td', 'props': [('text-align', 'center'),
                                                         ('font-size', '11pt')]}
                           ]) \
                           .highlight_max(subset=[col for col in metrics.keys() if 'accuracy' in col.lower()], 
                                        color='#d4f1d4') \
                           .highlight_min(subset=[col for col in metrics.keys() if 'time' in col.lower()], 
                                        color='#d4f1d4')
    
    display(HTML("<style>.dataframe th, .dataframe td {padding: 8px !important;}</style>"))
    display(styled_results)

# Open the file as read only

h5f = h5py.File('/Users/mohitpammu/Desktop/MIT-ADSP/Elective Project/Deep Learning/SVHN_single_grey1.h5', 'r')

# Load and split the data into train and test datasets

X_train = h5f['X_train'][:]

y_train = h5f['y_train'][:]

X_test = h5f['X_test'][:]

y_test = h5f['y_test'][:]

# Close the file

h5f.close()

len(X_train), len(X_test)

(42000, 18000)

# Visualize the first 10 images in the X_train dataset to understand the data
plt.figure(figsize=(10,1))

for i in range(10):

  plt.subplot(1,10,i+1) # Create a subplot with 1 row and 10 columns, and plot the i-th image in the i-th column

  plt.imshow(X_train[i], cmap = "gray") # Display the i-th image in grayscale

  plt.axis('off')

plt.show()

print('label for each of the above image, respectively: %s' % (y_train[0:10]))

label for each of the above image, respectively: [2 6 7 4 4 0 3 0 7 3]

# To visualize the average images properly, we need to reshape the flattened data back to 2D
# First, let's create copies of our data to avoid modifying the originals
X_train_images = X_train.copy()

# Ensure labels are in integer format (not one-hot encoded)
if len(y_train.shape) > 1 and y_train.shape[1] > 1:
    # Labels are one-hot encoded, convert back to integers
    y_train_labels = np.argmax(y_train, axis=1)
else:
    # Labels are already integers
    y_train_labels = y_train.copy()

# Reshape the flattened images back to 32x32 for visualization
X_train_images = X_train_images.reshape(-1, 32, 32)

def display_average_digits(images, labels, figsize=(15, 6)):
    """
    Calculate and display the average image for each digit class
    
    Parameters:
    -----------
    images: numpy array of shape (n_samples, 32, 32)
        The images to analyze
    labels: numpy array of shape (n_samples,)
        The corresponding labels (0-9)
    figsize: tuple
        Figure size for the plot
    """
    fig, axes = plt.subplots(2, 5, figsize=figsize)
    axes = axes.flatten()
    
    for digit in range(10):
        # Find all images of this digit
        digit_indices = np.where(labels == digit)[0]
        
        if len(digit_indices) > 0:
            # Calculate average image by taking the mean of all images with this label
            avg_image = np.mean(images[digit_indices], axis=0)
            
            # Display average image
            im = axes[digit].imshow(avg_image, cmap='gray')
            axes[digit].set_title(f"Digit {digit}")
            axes[digit].axis('off')
            
            # Add colorbar to show intensity scale
            plt.colorbar(im, ax=axes[digit], fraction=0.046, pad=0.04)
    
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()

# Display the average digit images
display_average_digits(X_train_images, y_train_labels)

# Let's also analyze the number of images per digit to understand our dataset better
digit_counts = np.bincount(y_train_labels)
for digit, count in enumerate(digit_counts):
    print(f"Digit {digit}: {count} images")

Digit 0: 4186 images
Digit 1: 4172 images
Digit 2: 4197 images
Digit 3: 4281 images
Digit 4: 4188 images
Digit 5: 4232 images
Digit 6: 4168 images
Digit 7: 4192 images
Digit 8: 4188 images
Digit 9: 4196 images

def display_digit_examples(images, labels, num_per_class=5, figsize=(14, 12)):
    """Display examples of each digit class with improved formatting and individual labels"""
    # Create figure with extra space at top for the title
    fig = plt.figure(figsize=figsize)
    
    # Create the GridSpec layout with space at top for title
    gs = fig.add_gridspec(11, num_per_class, height_ratios=[0.5] + [1]*10)
    
    # Add overall title at the top of the figure using the reserved space
    fig.text(0.5, 0.98, "Examples of Each Digit Class", 
             fontsize=16, ha='center', va='top')
    
    axes = []
    for digit in range(10):
        digit_axes = []
        for i in range(num_per_class):
            # Create axes in rows 1-10 (row 0 is reserved for title)
            ax = fig.add_subplot(gs[digit+1, i])
            digit_axes.append(ax)
        axes.append(digit_axes)
    
    for digit in range(10):
        digit_indices = np.where(labels == digit)[0]
        sample_size = min(num_per_class, len(digit_indices))
        sample_indices = np.random.choice(digit_indices, size=sample_size, replace=False)
        
        for i, idx in enumerate(sample_indices):
            if i < num_per_class:
                # Increase figure size and DPI for higher quality
                axes[digit][i].imshow(images[idx], cmap='gray', interpolation='lanczos')
                axes[digit][i].axis('off')
                
                # Add a title with the true label instead of "Ex #"
                axes[digit][i].set_title(f"Label: {digit}", fontsize=8, pad=3)
        
        # Add row labels
        axes[digit][0].set_ylabel(f"Digit {digit}", size=12, rotation=0, labelpad=35)
    
    plt.tight_layout()
    plt.subplots_adjust(left=0.1, wspace=0.1, hspace=0.2, top=0.95)  # Increased top margin
    
    # Save with high DPI
    plt.savefig('digit_examples.png', dpi=300, bbox_inches='tight')
    plt.show()

# Display examples of each digit class
display_digit_examples(X_train_images, y_train_labels, num_per_class=5)

# Display class distribution with improved formatting
def display_class_distribution(labels):
    """Display class distribution with improved formatting"""
    plt.figure(figsize=(12, 6))
    
    # Use a better color palette
    ax = sns.countplot(x=labels, palette='viridis')

    plt.title("Class Distribution of Digits", fontsize=16, fontweight='bold', pad=20)
    plt.xlabel("Digit", fontsize=14)
    plt.ylabel("Count", fontsize=14)
    plt.xticks(range(10), fontsize=12)
    plt.yticks(fontsize=12)
    
    # Add count labels on top of bars
    for p in ax.patches:
        ax.annotate(f'{p.get_height():,}', 
                   (p.get_x() + p.get_width()/2., p.get_height()), 
                   ha='center', va='bottom', fontsize=11)
    
    plt.grid(axis='y', alpha=0.3)
    

print("Class Distribution Analysis")
display_class_distribution(y_train_labels)

Class Distribution Analysis

print("Shape of first image:", X_train[0].shape)

print('-'*60)

print("Array of pixels:\n", X_train[0])

Shape of first image: (32, 32)
------------------------------------------------------------
Array of pixels:
 [[ 33.0704  30.2601  26.852  ...  71.4471  58.2204  42.9939]
 [ 25.2283  25.5533  29.9765 ... 113.0209 103.3639  84.2949]
 [ 26.2775  22.6137  40.4763 ... 113.3028 121.775  115.4228]
 ...
 [ 28.5502  36.212   45.0801 ...  24.1359  25.0927  26.0603]
 [ 38.4352  26.4733  23.2717 ...  28.1094  29.4683  30.0661]
 [ 50.2984  26.0773  24.0389 ...  49.6682  50.853   53.0377]]

# Reshape input data from 2D images (32x32) to 1D feature vectors
X_train = X_train.reshape(X_train.shape[0], 1024)

X_test = X_test.reshape(X_test.shape[0], 1024)

# Normalize pixel values to [0,1] range to improve gradient flow

X_train = X_train.astype('float32')/255.0

X_test = X_test.astype('float32')/255.0

print('Training Set:', X_train.shape, y_train.shape)

print('Test Set:', X_test.shape, y_test.shape)

Training Set: (42000, 1024) (42000,)
Test Set: (18000, 1024) (18000,)

# One-hot encode the labels
y_train = to_categorical(y_train)

y_test = to_categorical(y_test)

# Check the shape of the one-hot encoded labels
y_test

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.]])

# Set the random seed for numpy to ensure reproducibility of results
np.random.seed(24)

import random

random.seed(24)

tf.random.set_seed(24)

def nn_model_1():
    """
    Baseline ANN architecture for SVHN digit recognition.
    
    Architecture:
    - Input layer (1024 units) → Hidden layer 1 (64 units) → Hidden layer 2 (32 units) → Output (10 units)
    - Simple feed-forward network with minimal complexity for baseline performance
    
    Returns:
        model: Compiled Keras Sequential model
    """
    model = Sequential()
    
    # Input processing layer with dimensionality reduction (1024→64)
    model.add(Dense(64, activation='relu', input_shape=(1024,)))
    
    # Feature abstraction layer with further dimensionality reduction (64→32)
    model.add(Dense(32, activation='relu'))
    
    # Classification layer with softmax activation for 10-class probability distribution
    model.add(Dense(10, activation='softmax'))
    
    # Configure optimizer with standard learning rate for stable convergence
    model.compile(loss='categorical_crossentropy', 
                  optimizer=Adam(learning_rate=0.001), 
                  metrics=['accuracy'])
    
    return model

# Assign the model function to a variable for further use
model_1 = nn_model_1()

# Print the model summary to check the layers and parameters
model_1.summary()

Model: "sequential"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 64)             │        65,600 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 32)             │         2,080 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 10)             │           330 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 68,010 (265.66 KB)

 Trainable params: 68,010 (265.66 KB)

 Non-trainable params: 0 (0.00 B)

# Fit the model to the training data
history_model_1 = model_1.fit(X_train, y_train, validation_split = 0.2, batch_size = 128, verbose = 1, epochs = 20)

Epoch 1/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step - accuracy: 0.1034 - loss: 2.3158 - val_accuracy: 0.1332 - val_loss: 2.2731
Epoch 2/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.1547 - loss: 2.2320 - val_accuracy: 0.2156 - val_loss: 2.0881
Epoch 3/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.2490 - loss: 2.0493 - val_accuracy: 0.3004 - val_loss: 1.9352
Epoch 4/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.3345 - loss: 1.9059 - val_accuracy: 0.4098 - val_loss: 1.7457
Epoch 5/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4291 - loss: 1.6995 - val_accuracy: 0.4705 - val_loss: 1.5775
Epoch 6/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.4884 - loss: 1.5504 - val_accuracy: 0.5105 - val_loss: 1.4857
Epoch 7/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5229 - loss: 1.4688 - val_accuracy: 0.5367 - val_loss: 1.4293
Epoch 8/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5400 - loss: 1.4162 - val_accuracy: 0.5513 - val_loss: 1.3828
Epoch 9/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5512 - loss: 1.3801 - val_accuracy: 0.5614 - val_loss: 1.3555
Epoch 10/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5603 - loss: 1.3574 - val_accuracy: 0.5718 - val_loss: 1.3292
Epoch 11/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5674 - loss: 1.3378 - val_accuracy: 0.5792 - val_loss: 1.3094
Epoch 12/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5734 - loss: 1.3229 - val_accuracy: 0.5843 - val_loss: 1.2944
Epoch 13/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5790 - loss: 1.3108 - val_accuracy: 0.5893 - val_loss: 1.2847
Epoch 14/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5822 - loss: 1.3012 - val_accuracy: 0.5930 - val_loss: 1.2744
Epoch 15/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5849 - loss: 1.2932 - val_accuracy: 0.5957 - val_loss: 1.2681
Epoch 16/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5873 - loss: 1.2866 - val_accuracy: 0.5963 - val_loss: 1.2638
Epoch 17/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5903 - loss: 1.2805 - val_accuracy: 0.5979 - val_loss: 1.2606
Epoch 18/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5925 - loss: 1.2750 - val_accuracy: 0.5990 - val_loss: 1.2565
Epoch 19/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5940 - loss: 1.2695 - val_accuracy: 0.5987 - val_loss: 1.2550
Epoch 20/20
263/263 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - accuracy: 0.5942 - loss: 1.2667 - val_accuracy: 0.5990 - val_loss: 1.2476

# Save the trained model
ann_model1_path = save_model(model_1, "ANN_Model1")

Model saved to models/ANN_Model1_20250522_172102.keras (recommended Keras format)
Model architecture diagram saved to models/ANN_Model1_20250522_172102_architecture.png

# Extract the history of the model training for plotting
dict_hist = history_model_1.history
list_ep = [i for i in range(1, 21)]

# Create styled plot for ANN Model 1
plot = create_styled_plot(
    title='ANN Model 1: Training and Validation Accuracy',
    xlabel='Epochs',
    ylabel='Accuracy',
    figsize=(10, 6)
)

# Add data series with enhanced styling
plot.plot(list_ep, dict_hist['accuracy'], linewidth=2.5, label='Training')
plot.plot(list_ep, dict_hist['val_accuracy'], linewidth=2.5, label='Validation')

# Add legend and display
plot.legend(fontsize=12, frameon=True)
plt.tight_layout()
plt.show()

# Clear the backend session to free up resources and reset the state of the model
backend.clear_session()

# Fix the random seed again for reproducibility after clearing the session
np.random.seed(24)

random.seed(24)

tf.random.set_seed(24)

def nn_model_2():
    """
    Enhanced ANN architecture with regularization and normalization techniques.
    
    Architecture:
    - Deeper network (5 hidden layers) with width progression: 256→128→64→64→32
    - Incorporates dropout and batch normalization for improved generalization
    - Uses reduced learning rate for more stable training
    
    Returns:
        model: Compiled Keras Sequential model with regularization
    """
    model = Sequential()
    
    # High-capacity initial representation layer
    model.add(Dense(256, activation='relu', input_shape=(1024,)))
    
    # Secondary representation with dimensionality reduction
    model.add(Dense(128, activation='relu'))
    
    # Apply dropout regularization to prevent co-adaptation of features
    model.add(Dropout(0.2))
    
    # Mid-level feature abstraction layers
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    
    # Final feature refinement layer
    model.add(Dense(32, activation='relu'))
    
    # Normalize activations to stabilize training dynamics
    model.add(BatchNormalization())
    
    # Output layer for 10-class classification
    model.add(Dense(10, activation='softmax'))
    
    # Compile with reduced learning rate for fine-grained optimization
    model.compile(loss='categorical_crossentropy', 
                  optimizer=Adam(learning_rate=0.001), 
                  metrics=['accuracy'])
    
    return model

# Assig the model function to a variable for further use
model_2 = nn_model_2()

# Print the model summary to check the layers and parameters
model_2.summary()

Model: "sequential"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 256)            │       262,400 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 128)            │        32,896 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 128)            │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 64)             │         8,256 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_3 (Dense)                 │ (None, 64)             │         4,160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_4 (Dense)                 │ (None, 32)             │         2,080 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ batch_normalization             │ (None, 32)             │           128 │
│ (BatchNormalization)            │                        │               │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_5 (Dense)                 │ (None, 10)             │           330 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 310,250 (1.18 MB)

 Trainable params: 310,186 (1.18 MB)

 Non-trainable params: 64 (256.00 B)

# Fit the second ANN model to the training data
history_model_2 = model_2.fit(X_train, y_train, validation_split = 0.2, batch_size = 128, verbose = 1, epochs = 30)

Epoch 1/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.0987 - loss: 2.3884 - val_accuracy: 0.1011 - val_loss: 2.3099
Epoch 2/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.1019 - loss: 2.3041 - val_accuracy: 0.1155 - val_loss: 2.3020
Epoch 3/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.1723 - loss: 2.1866 - val_accuracy: 0.2736 - val_loss: 2.0183
Epoch 4/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.3883 - loss: 1.7011 - val_accuracy: 0.4469 - val_loss: 1.5867
Epoch 5/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.4730 - loss: 1.5155 - val_accuracy: 0.5396 - val_loss: 1.3712
Epoch 6/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5182 - loss: 1.4047 - val_accuracy: 0.5662 - val_loss: 1.3047
Epoch 7/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5434 - loss: 1.3437 - val_accuracy: 0.5762 - val_loss: 1.2687
Epoch 8/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5697 - loss: 1.2875 - val_accuracy: 0.5783 - val_loss: 1.2913
Epoch 9/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5783 - loss: 1.2545 - val_accuracy: 0.5762 - val_loss: 1.2658
Epoch 10/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.5940 - loss: 1.2361 - val_accuracy: 0.6302 - val_loss: 1.1465
Epoch 11/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6145 - loss: 1.1803 - val_accuracy: 0.6120 - val_loss: 1.1878
Epoch 12/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6194 - loss: 1.1624 - val_accuracy: 0.6383 - val_loss: 1.1153
Epoch 13/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6310 - loss: 1.1395 - val_accuracy: 0.6362 - val_loss: 1.1373
Epoch 14/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6396 - loss: 1.1171 - val_accuracy: 0.6529 - val_loss: 1.0909
Epoch 15/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6471 - loss: 1.0972 - val_accuracy: 0.6651 - val_loss: 1.0533
Epoch 16/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6563 - loss: 1.0807 - val_accuracy: 0.6688 - val_loss: 1.0323
Epoch 17/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6542 - loss: 1.0691 - val_accuracy: 0.6745 - val_loss: 1.0310
Epoch 18/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6604 - loss: 1.0513 - val_accuracy: 0.6668 - val_loss: 1.0314
Epoch 19/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6612 - loss: 1.0544 - val_accuracy: 0.6864 - val_loss: 0.9890
Epoch 20/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6695 - loss: 1.0319 - val_accuracy: 0.6723 - val_loss: 1.0226
Epoch 21/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6711 - loss: 1.0335 - val_accuracy: 0.6842 - val_loss: 0.9884
Epoch 22/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6738 - loss: 1.0193 - val_accuracy: 0.6852 - val_loss: 0.9894
Epoch 23/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6768 - loss: 1.0035 - val_accuracy: 0.6876 - val_loss: 0.9792
Epoch 24/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6812 - loss: 0.9967 - val_accuracy: 0.6936 - val_loss: 0.9656
Epoch 25/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6885 - loss: 0.9798 - val_accuracy: 0.6923 - val_loss: 0.9740
Epoch 26/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6919 - loss: 0.9739 - val_accuracy: 0.6969 - val_loss: 0.9573
Epoch 27/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6847 - loss: 0.9768 - val_accuracy: 0.6999 - val_loss: 0.9582
Epoch 28/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6902 - loss: 0.9802 - val_accuracy: 0.7040 - val_loss: 0.9476
Epoch 29/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6914 - loss: 0.9687 - val_accuracy: 0.7004 - val_loss: 0.9559
Epoch 30/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.6925 - loss: 0.9609 - val_accuracy: 0.7032 - val_loss: 0.9466

# Save the trained model
ann_model2_path = save_model(model_2, "ANN_Model2")

Model saved to models/ANN_Model2_20250522_172127.keras (recommended Keras format)
Model architecture diagram saved to models/ANN_Model2_20250522_172127_architecture.png

# Extract the history of the model training for plotting
dict_hist = history_model_2.history
list_ep = [i for i in range(1, 31)]

# Create styled plot for ANN Model 2
plot = create_styled_plot(
    title='ANN Model 2: Training and Validation Accuracy',
    xlabel='Epochs',
    ylabel='Accuracy',
    figsize=(10, 6)
)

# Add data series with enhanced styling
plot.plot(list_ep, dict_hist['accuracy'], linewidth=2.5, label='Training')
plot.plot(list_ep, dict_hist['val_accuracy'], linewidth=2.5, label='Validation')

# Add legend and display
plot.legend(fontsize=12, frameon=True)
plt.tight_layout()
plt.show()

# Predict the classes for the test set using the second trained ANN model
test_pred = model_2.predict(X_test)

# Convert the predicted probabilities to class labels by taking the index of the maximum probability
test_pred = np.argmax(test_pred, axis = -1)

563/563 ━━━━━━━━━━━━━━━━━━━━ 0s 441us/step

# Convert the one-hot encoded test labels back to class labels for comparison
y_test = np.argmax(y_test, axis = -1)

# Import confusion matrix from sklearn

def plot_enhanced_confusion_matrix(y_true, y_pred, title="Enhanced Confusion Matrix", figsize=(12, 10)):
    """
    Plot an enhanced confusion matrix with row-normalized values, better visual cues,
    and improved labeling for digit classification tasks.
    
    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    title : str
        Title for the confusion matrix plot
    figsize : tuple
        Figure size (width, height)
    """
    
    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Create figure
    plt.figure(figsize=figsize)
    
    # Plot raw counts
    plt.subplot(1, 2, 1)
    ax1 = plt.gca()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, square=True, ax=ax1)
    
    # Improved labeling for raw counts
    ax1.set_xlabel('Predicted Digit', fontsize=12, labelpad=10)
    ax1.set_ylabel('True Digit', fontsize=12, labelpad=10)
    ax1.set_title('Absolute Counts', fontsize=14, pad=20)
    
    # Add digit labels to rows and columns
    ax1.set_xticklabels([f"{i}" for i in range(10)])
    ax1.set_yticklabels([f"{i}" for i in range(10)])
    
    # Plot normalized confusion matrix (by row)
    plt.subplot(1, 2, 2)
    ax2 = plt.gca()
    
    # Normalize by row (true labels)
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_norm = np.round(cm_norm * 100, 1)  # Convert to percentages
    
    # Create a mask for the diagonal elements to highlight them
    mask = np.zeros_like(cm_norm)
    np.fill_diagonal(mask, 1)
    
    # Plot normalized matrix with custom formatting
    sns.heatmap(cm_norm, annot=True, fmt='.1f', cmap='Blues', cbar=True, 
                cbar_kws={'label': 'Percentage (%)'}, square=True, ax=ax2)
    
    # Highlight the diagonal with a different color or pattern
    sns.heatmap(mask * cm_norm, annot=False, cmap='Greens', alpha=0.3, 
                cbar=False, square=True, ax=ax2)
    
    # Improved labeling for percentages
    ax2.set_xlabel('Predicted Digit', fontsize=12, labelpad=10)
    ax2.set_ylabel('True Digit', fontsize=12, labelpad=10)
    ax2.set_title('Normalized by True Class (%)', fontsize=14, pad=20)
    
    # Add digit labels to rows and columns
    ax2.set_xticklabels([f"{i}" for i in range(10)])
    ax2.set_yticklabels([f"{i}" for i in range(10)])
    
    # Add main title
    plt.suptitle(title, fontsize=16, fontweight='bold', y=0.98)
    
    # Add interpretation guide
    plt.figtext(0.5, 0.01, "Reading guide: Row values sum to 100%. Each cell shows the % of actual digits (rows)\nthat were classified as predicted digits (columns). Diagonal shows correct classifications.",
               ha='center', fontsize=11, bbox=dict(facecolor='whitesmoke', alpha=0.7))
    
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
    
    # Calculate and display key error statistics
    misclassification_stats = {}
    for i in range(10):
        # Find top 2 misclassifications for each digit
        row = cm_norm[i].copy()
        row[i] = 0  # Zero out the correct classification
        top_errors = np.argsort(-row)[:2]  # Get indices of top 2 errors
        
        misclassification_stats[i] = {
            'digit': i,
            'correct_rate': cm_norm[i, i],
            'top_confusions': [(j, cm_norm[i, j]) for j in top_errors if cm_norm[i, j] > 0]
        }
    
    print("Key Misclassification Patterns:")
    print("-------------------------------")
    for digit, stats in misclassification_stats.items():
        confusion_str = ", ".join([f"{error[0]} ({error[1]:.1f}%)" for error in stats['top_confusions']])
        print(f"Digit {digit}: {stats['correct_rate']:.1f}% correct, commonly confused with: {confusion_str}")

# Import classification report sklearn

# Function to create an enhanced classification report
def display_enhanced_classification_report(y_true, y_pred, title="Model Performance Analysis"):
    """
    Display an enhanced classification report with improved formatting and visualization.
    
    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    title : str
        Title for the report
    """
    # Get the regular classification report as text
    report = classification_report(y_true, y_pred)
    print(f"{title}\n")
    print(report)
    
    # Generate classification report as dict
    report_dict = classification_report(y_true, y_pred, output_dict=True)
    
    # Convert to DataFrame for better visualization (excluding averages)
    df = pd.DataFrame(report_dict).T
    df_classes = df.iloc[:-3]  # Remove avg rows
    
    # Format the values
    df_styled = df_classes.style.set_caption(title).format({
        'precision': '{:.3f}',
        'recall': '{:.3f}',
        'f1-score': '{:.3f}',
        'support': '{:.0f}'
    })
    
    # Highlight the cells based on their values (higher is better for precision, recall, f1)
    df_styled = df_styled.background_gradient(subset=['precision'], cmap='Blues')
    df_styled = df_styled.background_gradient(subset=['recall'], cmap='Greens')
    df_styled = df_styled.background_gradient(subset=['f1-score'], cmap='Oranges')
    
    # Display the enhanced report
    display(df_styled)

    # Calculate and display digit-specific error rates
    error_rates = [(label, 1 - report_dict[str(label)]['precision']) for label in range(10)]
    error_rates.sort(key=lambda x: x[1], reverse=True)
    
    print("\nDigit Error Analysis (Ordered by Error Rate):")
    print("--------------------------------------------")
    for digit, error_rate in error_rates:
        print(f"Digit {digit}: {error_rate*100:.1f}% error rate")

# After generating predictions
test_pred = model_2.predict(X_test)
test_pred = np.argmax(test_pred, axis=-1)

# Make sure y_test is in the correct format
if len(y_test.shape) > 1 and y_test.shape[1] > 1:
    y_test_labels = np.argmax(y_test, axis=-1)
else:
    y_test_labels = y_test

# Display both enhanced visualizations
plot_enhanced_confusion_matrix(
    y_test_labels, 
    test_pred, 
    title="ANN Model 2: Confusion Matrix Analysis",
    figsize=(14, 10)
)

display_enhanced_classification_report(
    y_test_labels, 
    test_pred,
    title="ANN Model 2: Classification Performance Analysis"
)

563/563 ━━━━━━━━━━━━━━━━━━━━ 0s 392us/step

Key Misclassification Patterns:
-------------------------------
Digit 0: 72.2% correct, commonly confused with: 9 (5.8%), 1 (4.6%)
Digit 1: 73.6% correct, commonly confused with: 4 (4.3%), 7 (3.6%)
Digit 2: 75.2% correct, commonly confused with: 7 (6.5%), 1 (3.6%)
Digit 3: 58.9% correct, commonly confused with: 5 (13.9%), 1 (6.4%)
Digit 4: 75.8% correct, commonly confused with: 1 (5.3%), 6 (4.0%)
Digit 5: 70.8% correct, commonly confused with: 6 (5.1%), 1 (4.6%)
Digit 6: 71.3% correct, commonly confused with: 0 (5.1%), 4 (5.0%)
Digit 7: 75.1% correct, commonly confused with: 2 (8.5%), 1 (5.7%)
Digit 8: 64.3% correct, commonly confused with: 6 (8.9%), 5 (5.4%)
Digit 9: 68.2% correct, commonly confused with: 1 (5.7%), 0 (5.4%)
ANN Model 2: Classification Performance Analysis

              precision    recall  f1-score   support

           0       0.73      0.72      0.73      1814
           1       0.63      0.74      0.68      1828
           2       0.73      0.75      0.74      1803
           3       0.73      0.59      0.65      1719
           4       0.77      0.76      0.76      1812
           5       0.64      0.71      0.67      1768
           6       0.71      0.71      0.71      1832
           7       0.77      0.75      0.76      1808
           8       0.67      0.64      0.65      1812
           9       0.70      0.68      0.69      1804

    accuracy                           0.71     18000
   macro avg       0.71      0.71      0.71     18000
weighted avg       0.71      0.71      0.71     18000

Digit Error Analysis (Ordered by Error Rate):
--------------------------------------------
Digit 1: 37.2% error rate
Digit 5: 35.7% error rate
Digit 8: 33.5% error rate
Digit 9: 29.9% error rate
Digit 6: 28.6% error rate
Digit 0: 27.2% error rate
Digit 3: 26.8% error rate
Digit 2: 26.5% error rate
Digit 7: 23.2% error rate
Digit 4: 23.0% error rate

# Load the h5 file again to ensure the data is loaded correctly for further use
h5f = h5py.File('/Users/mohitpammu/Desktop/MIT-ADSP/Elective Project/Deep Learning/SVHN_single_grey1.h5', 'r')

# Split the data into train and test datasets again to ensure consistency
X_train = h5f['X_train'][:]

y_train = h5f['y_train'][:]

X_test = h5f['X_test'][:]

y_test = h5f['y_test'][:]

h5f.close()

len(X_train), len(X_test)

(42000, 18000)

print('Shape of the first image:', X_train[0].shape)

print('-'*60)

print('Array of pixels:\n', X_train[0])

Shape of the first image: (32, 32)
------------------------------------------------------------
Array of pixels:
 [[ 33.0704  30.2601  26.852  ...  71.4471  58.2204  42.9939]
 [ 25.2283  25.5533  29.9765 ... 113.0209 103.3639  84.2949]
 [ 26.2775  22.6137  40.4763 ... 113.3028 121.775  115.4228]
 ...
 [ 28.5502  36.212   45.0801 ...  24.1359  25.0927  26.0603]
 [ 38.4352  26.4733  23.2717 ...  28.1094  29.4683  30.0661]
 [ 50.2984  26.0773  24.0389 ...  49.6682  50.853   53.0377]]

# Reshape the training and test dataset into 4D arrays for CNN input (samples, height, width, channels)
X_train = X_train.reshape(X_train.shape[0], 32, 32, 1)

X_test = X_test.reshape(X_test.shape[0], 32, 32, 1)

# Normalize pixel values to [0,1] range to improve gradient flow
X_train = X_train.astype('float32')/255.0

X_test = X_test.astype('float32')/255.0

print('Training Set:', X_train.shape, y_train.shape)

print('Test Set:', X_test.shape, y_test.shape)

Training Set: (42000, 32, 32, 1) (42000,)
Test Set: (18000, 32, 32, 1) (18000,)

# One-hot encode the labels for the CNN model
y_train = to_categorical(y_train)

y_test = to_categorical(y_test)

# Check the shape of the one-hot encoded labels for the test set
y_test

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.]])

# Set the random seed for numpy to ensure reproducibility of results
np.random.seed(24)

random.seed(24)

tf.random.set_seed(24)

def cnn_model_1():
    """
    Foundational CNN architecture for digit recognition with spatial feature extraction.
    
    Architecture:
    - Dual convolutional layers [16→32 filters] with 3×3 kernels
    - Single downsampling via max pooling
    - LeakyReLU activation to prevent dying neuron problem
    - 32-unit fully connected layer for final feature integration
    
    Returns:
        model: Compiled Keras Sequential CNN model
    """
    model = Sequential()
    
    # Initial feature extraction with edge and pattern detection
    model.add(Conv2D(16, (3,3), padding='same', input_shape=(32, 32, 1)))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # Secondary feature extraction with increased filter count
    model.add(Conv2D(32, (3,3), padding='same'))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # Spatial dimensionality reduction while preserving feature information
    model.add(MaxPooling2D(pool_size=(2,2)))
    
    # Convert 2D spatial features to 1D feature vector
    model.add(Flatten())
    
    # Final feature integration layer
    model.add(Dense(32))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # Multi-class classification output layer
    model.add(Dense(10, activation='softmax'))
    
    # Configure optimizer for effective feature learning
    model.compile(loss='categorical_crossentropy', 
                  optimizer=Adam(learning_rate=0.001), 
                  metrics=['accuracy'])
    
    return model

# Assign the CNN model function to a variable for further use
cnn_model_1 = cnn_model_1()

# Print the model summary which will provide a detailed overview of the layers, output shapes, and number of parameters in the model.
cnn_model_1.summary()

Model: "sequential_1"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ conv2d (Conv2D)                 │ (None, 32, 32, 16)     │           160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu (LeakyReLU)         │ (None, 32, 32, 16)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_1 (Conv2D)               │ (None, 32, 32, 32)     │         4,640 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu_1 (LeakyReLU)       │ (None, 32, 32, 32)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d (MaxPooling2D)    │ (None, 16, 16, 32)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten (Flatten)               │ (None, 8192)           │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_6 (Dense)                 │ (None, 32)             │       262,176 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu_2 (LeakyReLU)       │ (None, 32)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_7 (Dense)                 │ (None, 10)             │           330 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 267,306 (1.02 MB)

 Trainable params: 267,306 (1.02 MB)

 Non-trainable params: 0 (0.00 B)

# Fit the CNN model to the training data
history_cnn_model_1 = cnn_model_1.fit(X_train, y_train, validation_split = 0.2, batch_size = 32, verbose = 1, epochs = 20)

Epoch 1/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 8ms/step - accuracy: 0.5043 - loss: 1.4562 - val_accuracy: 0.8200 - val_loss: 0.6394
Epoch 2/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 8ms/step - accuracy: 0.8277 - loss: 0.5879 - val_accuracy: 0.8417 - val_loss: 0.5517
Epoch 3/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8584 - loss: 0.4846 - val_accuracy: 0.8470 - val_loss: 0.5245
Epoch 4/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8775 - loss: 0.4169 - val_accuracy: 0.8529 - val_loss: 0.5062
Epoch 5/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8929 - loss: 0.3682 - val_accuracy: 0.8638 - val_loss: 0.4882
Epoch 6/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9031 - loss: 0.3238 - val_accuracy: 0.8613 - val_loss: 0.4959
Epoch 7/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9176 - loss: 0.2867 - val_accuracy: 0.8620 - val_loss: 0.5092
Epoch 8/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9252 - loss: 0.2577 - val_accuracy: 0.8626 - val_loss: 0.5247
Epoch 9/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9329 - loss: 0.2296 - val_accuracy: 0.8564 - val_loss: 0.5747
Epoch 10/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9386 - loss: 0.2083 - val_accuracy: 0.8557 - val_loss: 0.6038
Epoch 11/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9445 - loss: 0.1872 - val_accuracy: 0.8530 - val_loss: 0.6424
Epoch 12/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9466 - loss: 0.1723 - val_accuracy: 0.8580 - val_loss: 0.6382
Epoch 13/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9499 - loss: 0.1607 - val_accuracy: 0.8570 - val_loss: 0.6865
Epoch 14/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9565 - loss: 0.1445 - val_accuracy: 0.8570 - val_loss: 0.7050
Epoch 15/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9619 - loss: 0.1261 - val_accuracy: 0.8589 - val_loss: 0.7535
Epoch 16/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9634 - loss: 0.1188 - val_accuracy: 0.8580 - val_loss: 0.7521
Epoch 17/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9675 - loss: 0.1063 - val_accuracy: 0.8527 - val_loss: 0.8320
Epoch 18/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 8ms/step - accuracy: 0.9669 - loss: 0.1039 - val_accuracy: 0.8625 - val_loss: 0.7806
Epoch 19/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 8ms/step - accuracy: 0.9693 - loss: 0.0984 - val_accuracy: 0.8569 - val_loss: 0.8801
Epoch 20/20
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 8ms/step - accuracy: 0.9707 - loss: 0.0938 - val_accuracy: 0.8589 - val_loss: 0.9026

# Save the trained model
cnn_model1_path = save_model(cnn_model_1, "CNN_Model1")

Model saved to models/CNN_Model1_20250522_172430.keras (recommended Keras format)
Model architecture diagram saved to models/CNN_Model1_20250522_172430_architecture.png

# Extract the history of the model training for plotting
dict_hist = history_cnn_model_1.history
list_ep = [i for i in range(1, 21)]

# Create styled plot for CNN Model 1
plot = create_styled_plot(
    title='CNN Model 1: Training and Validation Accuracy',
    xlabel='Epochs',
    ylabel='Accuracy',
    figsize=(10, 6)
)

# Add data series with enhanced styling
plot.plot(list_ep, dict_hist['accuracy'], linewidth=2.5, label='Training')
plot.plot(list_ep, dict_hist['val_accuracy'], linewidth=2.5, label='Validation')

# Add legend and display
plot.legend(fontsize=12, frameon=True)
plt.tight_layout()
plt.show()

# Clear the backend session to free up resources and reset the state of the model
backend.clear_session()

# Fix the random seed again for reproducibility after clearing the session
np.random.seed(24)

random.seed(24)

tf.random.set_seed(24)

def cnn_model_2():
    """
    Advanced CNN architecture with hierarchical feature extraction and regularization.
    
    Architecture:
    - Four convolutional layers with filter progression [16→32→32→64]
    - Multi-stage downsampling with batch normalization for training stability
    - Aggressive dropout (0.5) for strong regularization
    - Parameter-efficient design with improved feature learning capacity
    
    Returns:
        model: Compiled Keras Sequential CNN model with regularization
    """
    model = Sequential()
    
    # Low-level feature extraction (edges, simple textures)
    model.add(Conv2D(16, (3,3), padding='same', input_shape=(32, 32, 1)))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # Mid-level feature extraction (corners, contours)
    model.add(Conv2D(32, (3,3), padding='same'))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # First dimensionality reduction and feature selection
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())  # Stabilize training through normalization
    
    # Higher-level feature extraction (shape components)
    model.add(Conv2D(32, (3,3), padding='same'))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # High-level feature extraction (digit-specific patterns)
    model.add(Conv2D(64, (3,3), padding='same'))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # Final dimensionality reduction with training stabilization
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())
    
    # Convert spatial features to vector representation
    model.add(Flatten())
    
    # Feature integration and abstraction
    model.add(Dense(32))
    model.add(LeakyReLU(negative_slope=0.1))
    
    # Apply strong regularization to prevent overfitting
    model.add(Dropout(0.5))
    
    # Output classification layer
    model.add(Dense(10, activation='softmax'))
    
    # Configure optimizer for effective backpropagation
    model.compile(loss='categorical_crossentropy', 
                  optimizer=Adam(learning_rate=0.001), 
                  metrics=['accuracy'])
    
    return model

# Assign the CNN model function to a variable for further use
cnn_model_2 = cnn_model_2()

# Print the model summary which will provide a detailed overview of the layers, output shapes, and number of parameters in the model.
cnn_model_2.summary()

Model: "sequential"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ conv2d (Conv2D)                 │ (None, 32, 32, 16)     │           160 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu (LeakyReLU)         │ (None, 32, 32, 16)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_1 (Conv2D)               │ (None, 32, 32, 32)     │         4,640 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu_1 (LeakyReLU)       │ (None, 32, 32, 32)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d (MaxPooling2D)    │ (None, 16, 16, 32)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ batch_normalization             │ (None, 16, 16, 32)     │           128 │
│ (BatchNormalization)            │                        │               │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_2 (Conv2D)               │ (None, 16, 16, 32)     │         9,248 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu_2 (LeakyReLU)       │ (None, 16, 16, 32)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_3 (Conv2D)               │ (None, 16, 16, 64)     │        18,496 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu_3 (LeakyReLU)       │ (None, 16, 16, 64)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d_1 (MaxPooling2D)  │ (None, 8, 8, 64)       │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ batch_normalization_1           │ (None, 8, 8, 64)       │           256 │
│ (BatchNormalization)            │                        │               │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten (Flatten)               │ (None, 4096)           │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ (None, 32)             │       131,104 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ leaky_re_lu_4 (LeakyReLU)       │ (None, 32)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ (None, 32)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 10)             │           330 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 164,362 (642.04 KB)

 Trainable params: 164,170 (641.29 KB)

 Non-trainable params: 192 (768.00 B)

# Fit the second CNN model to the training data
history_cnn_model_2 = cnn_model_2.fit(X_train, y_train, validation_split = 0.2, batch_size = 128, verbose = 1, epochs = 30)

Epoch 1/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 51ms/step - accuracy: 0.3136 - loss: 1.9737 - val_accuracy: 0.1032 - val_loss: 2.9916
Epoch 2/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 51ms/step - accuracy: 0.7764 - loss: 0.7298 - val_accuracy: 0.8496 - val_loss: 0.5335
Epoch 3/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.8220 - loss: 0.5886 - val_accuracy: 0.8574 - val_loss: 0.4828
Epoch 4/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 15s 56ms/step - accuracy: 0.8484 - loss: 0.4960 - val_accuracy: 0.8861 - val_loss: 0.3970
Epoch 5/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.8589 - loss: 0.4572 - val_accuracy: 0.8940 - val_loss: 0.3698
Epoch 6/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.8730 - loss: 0.4154 - val_accuracy: 0.8958 - val_loss: 0.3762
Epoch 7/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.8848 - loss: 0.3846 - val_accuracy: 0.8942 - val_loss: 0.3764
Epoch 8/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.8887 - loss: 0.3608 - val_accuracy: 0.8817 - val_loss: 0.4028
Epoch 9/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 15s 56ms/step - accuracy: 0.8985 - loss: 0.3332 - val_accuracy: 0.8992 - val_loss: 0.3493
Epoch 10/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.9024 - loss: 0.3091 - val_accuracy: 0.9013 - val_loss: 0.3797
Epoch 11/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9057 - loss: 0.2946 - val_accuracy: 0.8975 - val_loss: 0.3621
Epoch 12/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9071 - loss: 0.2902 - val_accuracy: 0.8935 - val_loss: 0.3656
Epoch 13/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9128 - loss: 0.2708 - val_accuracy: 0.9075 - val_loss: 0.3631
Epoch 14/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9195 - loss: 0.2546 - val_accuracy: 0.8892 - val_loss: 0.4428
Epoch 15/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 15s 55ms/step - accuracy: 0.9216 - loss: 0.2505 - val_accuracy: 0.8961 - val_loss: 0.4038
Epoch 16/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9249 - loss: 0.2382 - val_accuracy: 0.9052 - val_loss: 0.3579
Epoch 17/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.9248 - loss: 0.2232 - val_accuracy: 0.9085 - val_loss: 0.4078
Epoch 18/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 15s 55ms/step - accuracy: 0.9309 - loss: 0.2166 - val_accuracy: 0.9085 - val_loss: 0.3548
Epoch 19/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 55ms/step - accuracy: 0.9317 - loss: 0.2088 - val_accuracy: 0.9101 - val_loss: 0.4060
Epoch 20/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9338 - loss: 0.2071 - val_accuracy: 0.9038 - val_loss: 0.3996
Epoch 21/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 52ms/step - accuracy: 0.9343 - loss: 0.1985 - val_accuracy: 0.9080 - val_loss: 0.4019
Epoch 22/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.9407 - loss: 0.1874 - val_accuracy: 0.9033 - val_loss: 0.4107
Epoch 23/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 55ms/step - accuracy: 0.9399 - loss: 0.1816 - val_accuracy: 0.9004 - val_loss: 0.4033
Epoch 24/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 15s 56ms/step - accuracy: 0.9383 - loss: 0.1827 - val_accuracy: 0.9085 - val_loss: 0.4177
Epoch 25/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9449 - loss: 0.1690 - val_accuracy: 0.9082 - val_loss: 0.4039
Epoch 26/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9444 - loss: 0.1641 - val_accuracy: 0.9057 - val_loss: 0.4141
Epoch 27/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9484 - loss: 0.1628 - val_accuracy: 0.9067 - val_loss: 0.4313
Epoch 28/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 53ms/step - accuracy: 0.9501 - loss: 0.1510 - val_accuracy: 0.9014 - val_loss: 0.4497
Epoch 29/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.9461 - loss: 0.1607 - val_accuracy: 0.9024 - val_loss: 0.4530
Epoch 30/30
263/263 ━━━━━━━━━━━━━━━━━━━━ 14s 54ms/step - accuracy: 0.9476 - loss: 0.1577 - val_accuracy: 0.8912 - val_loss: 0.4977

# Save the trained model
cnn_model2_path = save_model(cnn_model_2, "CNN_Model2")

Model saved to models/CNN_Model2_20250522_173136.keras (recommended Keras format)
Model architecture diagram saved to models/CNN_Model2_20250522_173136_architecture.png

# Extract the history of the enhanced model training for plotting
dict_hist = history_cnn_model_2.history
list_ep = [i for i in range(1, 31)]

# Create styled plot for CNN Model 2
plot = create_styled_plot(
    title='CNN Model 2: Training and Validation Accuracy',
    xlabel='Epochs',
    ylabel='Accuracy',
    figsize=(10, 6)
)

# Add data series with enhanced styling
plot.plot(list_ep, dict_hist['accuracy'], linewidth=2.5, label='Training')
plot.plot(list_ep, dict_hist['val_accuracy'], linewidth=2.5, label='Validation')

# Add legend and display
plot.legend(fontsize=12, frameon=True)
plt.tight_layout()
plt.show()

# Predict the classes for the test set using the second trained CNN model
test_pred = cnn_model_2.predict(X_test)

# Convert the predicted probabilities to class labels by taking the index of the maximum probability
test_pred = np.argmax(test_pred, axis = -1)

563/563 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step

# Convert the one-hot encoded test labels back to class labels for comparison
y_test = np.argmax(y_test, axis = -1)

# Make sure y_test is in the correct format
if len(y_test.shape) > 1 and y_test.shape[1] > 1:
    y_test_labels = np.argmax(y_test, axis=-1)
else:
    y_test_labels = y_test

# Display both enhanced visualizations
plot_enhanced_confusion_matrix(
    y_test_labels, 
    test_pred, 
    title="CNN Model 2: Confusion Matrix Analysis",
    figsize=(14, 10)
)

display_enhanced_classification_report(
    y_test_labels, 
    test_pred,
    title="CNN Model 2: Classification Performance Analysis"
)

Key Misclassification Patterns:
-------------------------------
Digit 0: 94.0% correct, commonly confused with: 9 (1.6%), 1 (1.0%)
Digit 1: 85.8% correct, commonly confused with: 4 (4.6%), 0 (3.1%)
Digit 2: 90.5% correct, commonly confused with: 7 (2.5%), 9 (2.3%)
Digit 3: 84.1% correct, commonly confused with: 5 (4.1%), 9 (2.8%)
Digit 4: 93.0% correct, commonly confused with: 9 (1.8%), 0 (1.2%)
Digit 5: 88.4% correct, commonly confused with: 6 (3.8%), 9 (2.2%)
Digit 6: 88.0% correct, commonly confused with: 8 (3.1%), 0 (2.7%)
Digit 7: 89.1% correct, commonly confused with: 2 (3.4%), 1 (2.2%)
Digit 8: 85.3% correct, commonly confused with: 9 (3.5%), 6 (3.4%)
Digit 9: 91.0% correct, commonly confused with: 0 (2.7%), 2 (1.3%)
CNN Model 2: Classification Performance Analysis

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      1814
           1       0.93      0.86      0.89      1828
           2       0.89      0.91      0.90      1803
           3       0.91      0.84      0.87      1719
           4       0.88      0.93      0.91      1812
           5       0.90      0.88      0.89      1768
           6       0.88      0.88      0.88      1832
           7       0.92      0.89      0.90      1808
           8       0.90      0.85      0.88      1812
           9       0.84      0.91      0.88      1804

    accuracy                           0.89     18000
   macro avg       0.89      0.89      0.89     18000
weighted avg       0.89      0.89      0.89     18000

Digit Error Analysis (Ordered by Error Rate):
--------------------------------------------
Digit 9: 15.5% error rate
Digit 0: 14.4% error rate
Digit 6: 12.0% error rate
Digit 4: 11.8% error rate
Digit 2: 10.8% error rate
Digit 5: 10.1% error rate
Digit 8: 10.1% error rate
Digit 3: 8.9% error rate
Digit 7: 8.2% error rate
Digit 1: 7.4% error rate

def display_misclassifications(model, X_data, y_true, num_examples=15, figsize=(15, 8)):
    """
    Display examples of misclassified digits with true and predicted labels.
    
    Parameters:
    -----------
    model : Keras model
        The trained model to evaluate
    X_data : numpy array
        Test image data
    y_true : numpy array
        True labels (integer format)
    num_examples : int
        Number of misclassified examples to show
    figsize : tuple
        Figure size
    """
    # Get model predictions
    if len(X_data.shape) > 2 and model.input_shape[1] == 1024:
        # Reshape for ANN model if needed
        y_pred = model.predict(X_data.reshape(X_data.shape[0], -1))
    else:
        y_pred = model.predict(X_data)
    
    y_pred_labels = np.argmax(y_pred, axis=1)
    
    # Find misclassifications
    incorrect_idx = np.where(y_true != y_pred_labels)[0]
    
    if len(incorrect_idx) == 0:
        print("No misclassifications found.")
        return
        
    # Select a subset of misclassifications
    if len(incorrect_idx) > num_examples:
        sample_idx = np.random.choice(incorrect_idx, num_examples, replace=False)
    else:
        sample_idx = incorrect_idx
        
    # Set up the plot
    rows = (len(sample_idx) + 4) // 5  # Ceiling division for rows needed
    fig, axes = plt.subplots(rows, 5, figsize=figsize)
    axes = axes.flatten() if rows > 1 else [axes]
    
    for i, idx in enumerate(sample_idx):
        if i < len(axes):
            # Get the image and ensure proper shape for display
            img = X_data[idx]
            if len(img.shape) == 1:  # If flattened
                img = img.reshape(32, 32)
            elif len(img.shape) == 3 and img.shape[2] == 1:  # If has channel dim
                img = img.squeeze()
                
            # Display image with prediction info
            axes[i].imshow(img, cmap='gray')
            confidence = y_pred[idx][y_pred_labels[idx]] * 100
            title = f"True: {y_true[idx]}\nPred: {y_pred_labels[idx]}\n{confidence:.1f}%"
            axes[i].set_title(title, color='red' if y_true[idx] != y_pred_labels[idx] else 'black')
            axes[i].axis('off')
    
    # Turn off any remaining empty subplots
    for i in range(len(sample_idx), len(axes)):
        axes[i].axis('off')
    
    plt.suptitle('Misclassification Examples', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
    
    # Summarize common error patterns
    print(f"Total misclassifications: {len(incorrect_idx)} out of {len(y_true)} images ({len(incorrect_idx)/len(y_true)*100:.1f}%)")
    
    # Find the most common misclassifications
    error_pairs = {}
    for idx in incorrect_idx:
        pair = (y_true[idx], y_pred_labels[idx])  # (true, predicted)
        error_pairs[pair] = error_pairs.get(pair, 0) + 1
    
    # Show top error patterns
    top_errors = sorted(error_pairs.items(), key=lambda x: x[1], reverse=True)[:5]
    print("\nTop 5 confusion patterns (True → Predicted):")
    for (true_label, pred_label), count in top_errors:
        print(f"  {true_label} → {pred_label}: {count} times")

# Make sure y_test is in integer format (not one-hot encoded)
if len(y_test.shape) > 1 and y_test.shape[1] > 1:
    y_test_labels = np.argmax(y_test, axis=1)
else:
    y_test_labels = y_test

# Display misclassification examples from CNN Model 2
display_misclassifications(
    model=cnn_model_2,
    X_data=X_test,
    y_true=y_test_labels,
    num_examples=15
)

563/563 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step

Total misclassifications: 1991 out of 18000 images (11.1%)

Top 5 confusion patterns (True → Predicted):
  1 → 4: 84 times
  3 → 5: 71 times
  5 → 6: 67 times
  8 → 9: 64 times
  8 → 6: 62 times

def visualize_cnn_features(model, image_idx=0):
    """
    Visualize the feature representations learned by CNN layers for model interpretability.
    
    This function analyzes how a trained CNN processes input images by:
    - Extracting activations from each convolutional layer
    - Visualizing feature maps to reveal pattern recognition strategies
    - Displaying the hierarchy of feature abstraction across network depth
    
    The visualizations help understand what patterns each layer responds to,
    from low-level features (edges, textures) to high-level representations
    (digit parts, complete forms).
    
    Parameters:
    -----------
    model : Trained Keras model with convolutional layers
        The CNN model whose feature representations will be visualized
    image_idx : int, default=0
        Index of test image to visualize, allowing analysis of different digit examples
        
    Returns:
        None: Results are displayed as visualization plots
    """

    # Retrieve and prepare test image
    img = X_test[image_idx:image_idx+1]  # Keep batch dimension
    label = np.argmax(y_test[image_idx]) if len(y_test.shape) > 1 else y_test[image_idx]
    
    # Display source image for reference
    plt.figure(figsize=(5, 5))
    plt.imshow(np.squeeze(img), cmap='gray')
    plt.title(f"Original Image (Label: {label})")
    plt.axis('off')
    plt.show()
    
    # Identify convolutional layers for feature extraction
    conv_layers = []
    for i, layer in enumerate(model.layers):
        if 'conv' in layer.name.lower():
            conv_layers.append(i)
    
    if not conv_layers:
        print("No convolutional layers found!")
        return
    
    print(f"Found {len(conv_layers)} convolutional layers")
    
    # Process each convolutional layer to extract and visualize features
    for layer_idx in conv_layers:
        layer_name = model.layers[layer_idx].name
        print(f"Visualizing layer: {layer_name}")
        
        # Create extraction model for this specific layer
        temp_model = tf.keras.models.Model(
            inputs=model.inputs,
            outputs=model.layers[layer_idx].output
        )
        
        # Generate feature activations
        feature_maps = temp_model.predict(img)
        print(f"Feature map shape: {feature_maps.shape}")
        
        # Determine visualization grid dimensions
        n_features = min(16, feature_maps.shape[3])
        grid_size = int(np.ceil(np.sqrt(n_features)))
        
        # Create visualization grid
        fig, axes = plt.subplots(grid_size, grid_size, figsize=(12, 12))
        fig.suptitle(f"Feature Maps - Layer {layer_name}", fontsize=16)
        
        # Plot individual feature maps
        for i in range(grid_size*grid_size):
            ax = axes[i//grid_size, i%grid_size]
            if i < n_features:
                ax.imshow(feature_maps[0, :, :, i], cmap='gray')
                ax.set_title(f"Filter {i}")
            ax.axis('off')
        
        plt.tight_layout()
        plt.subplots_adjust(top=0.9)
        plt.show()

# Visualize features for a specific image   
visualize_cnn_features(cnn_model_2, image_idx=42)  # Change index to visualize different images

Found 4 convolutional layers
Visualizing layer: conv2d
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 24ms/step
Feature map shape: (1, 32, 32, 16)

Visualizing layer: conv2d_1
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 25ms/step
Feature map shape: (1, 32, 32, 32)

Visualizing layer: conv2d_2
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 33ms/step
Feature map shape: (1, 16, 16, 32)

Visualizing layer: conv2d_3
WARNING:tensorflow:5 out of the last 568 calls to <function TensorFlowTrainer.make_predict_function.<locals>.one_step_on_data_distributed at 0x151f08ca0> triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for  more details.
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 38ms/step
Feature map shape: (1, 16, 16, 64)

def analyze_normalization_impact(num_runs=5):
    """
    Systematically analyze the impact of normalization on model performance with multiple runs.
    
    This experiment compares training with raw pixel values (0-255) versus normalized (0-1)
    values across multiple independent runs to establish statistically reliable conclusions.
    
    Parameters:
    -----------
    num_runs : int, default=5
        Number of experiment repetitions to average results across
    
    Returns:
        tuple: (avg_improvement, std_improvement) - Average and standard deviation of accuracy improvement
    """
    print(f"Running normalization impact analysis ({num_runs} runs per condition)...")
    
    # Load the data once
    h5f = h5py.File('/Users/mohitpammu/Desktop/MIT-ADSP/Elective Project/Deep Learning/SVHN_single_grey1.h5', 'r')
    X_train_raw = h5f['X_train'][:]
    y_train = h5f['y_train'][:]
    X_test_raw = h5f['X_test'][:]
    y_test = h5f['y_test'][:]
    h5f.close()
    
    # Reshape the data for CNN
    X_train_raw = X_train_raw.reshape(X_train_raw.shape[0], 32, 32, 1)
    X_test_raw = X_test_raw.reshape(X_test_raw.shape[0], 32, 32, 1)
    
    # Create normalized versions
    X_train_norm = X_train_raw.astype('float32')/255.0
    X_test_norm = X_test_raw.astype('float32')/255.0
    
    # One-hot encode labels
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)
    
    # Create consistent train/validation split for all experiments
    from sklearn.model_selection import train_test_split
    train_indices, val_indices = train_test_split(
        np.arange(len(X_train_raw)), test_size=0.2, random_state=42
    )
    
    # Storage for results across runs
    raw_accuracies = []
    norm_accuracies = []
    raw_losses = []
    norm_losses = []
    improvements = []
    
    # Execute multiple experiment runs
    for run in range(num_runs):
        print(f"\nPerforming run {run+1}/{num_runs}")
        
        # Without normalization
        backend.clear_session()
        seed = 24 + run  # Different seed for each run
        np.random.seed(seed)
        tf.random.set_seed(seed)
        random.seed(seed)
        
        model_without_norm = simple_cnn_model()
        history_without_norm = model_without_norm.fit(
            X_train_raw[train_indices], y_train[train_indices], 
            validation_data=(X_train_raw[val_indices], y_train[val_indices]),
            batch_size=128, 
            epochs=10, 
            verbose=0  # Reduce output noise
        )
        
        # With normalization
        backend.clear_session()
        np.random.seed(seed)  # Same seed as the unnormalized run for fair comparison
        tf.random.set_seed(seed)
        random.seed(seed)
        
        model_with_norm = simple_cnn_model()
        history_with_norm = model_with_norm.fit(
            X_train_norm[train_indices], y_train[train_indices], 
            validation_data=(X_train_norm[val_indices], y_train[val_indices]),
            batch_size=128, 
            epochs=10, 
            verbose=0
        )
        
        # Record results from this run
        final_acc_without_norm = history_without_norm.history['val_accuracy'][-1]
        final_acc_with_norm = history_with_norm.history['val_accuracy'][-1]
        improvement = (final_acc_with_norm - final_acc_without_norm) * 100
        
        raw_accuracies.append(history_without_norm.history['val_accuracy'])
        norm_accuracies.append(history_with_norm.history['val_accuracy'])
        raw_losses.append(history_without_norm.history['val_loss'])
        norm_losses.append(history_with_norm.history['val_loss'])
        improvements.append(improvement)
        
        print(f"  Run {run+1} improvement: {improvement:.2f}%")
        
        # Clean up to prevent memory issues
        del model_without_norm, model_with_norm
    
    # Calculate statistics
    avg_raw_acc = np.mean([acc[-1] for acc in raw_accuracies])
    avg_norm_acc = np.mean([acc[-1] for acc in norm_accuracies])
    avg_improvement = np.mean(improvements)
    std_improvement = np.std(improvements)
    
    # Calculate average learning curves
    avg_raw_acc_history = np.mean(raw_accuracies, axis=0)
    avg_norm_acc_history = np.mean(norm_accuracies, axis=0)
    avg_raw_loss_history = np.mean(raw_losses, axis=0)
    avg_norm_loss_history = np.mean(norm_losses, axis=0)
    
    # Visualization
    plt.figure(figsize=(12, 5))
    
    # Plot average accuracy curves
    plt.subplot(1, 2, 1)
    plt.plot(avg_raw_acc_history, label='Without Normalization')
    plt.plot(avg_norm_acc_history, label='With Normalization')
    plt.fill_between(range(len(avg_raw_acc_history)), 
                     np.min(raw_accuracies, axis=0),
                     np.max(raw_accuracies, axis=0),
                     alpha=0.2)
    plt.fill_between(range(len(avg_norm_acc_history)), 
                     np.min(norm_accuracies, axis=0),
                     np.max(norm_accuracies, axis=0),
                     alpha=0.2)
    plt.title('Average Validation Accuracy (5 Runs)')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    
    # Plot average loss curves
    plt.subplot(1, 2, 2)
    plt.plot(avg_raw_loss_history, label='Without Normalization')
    plt.plot(avg_norm_loss_history, label='With Normalization')
    plt.fill_between(range(len(avg_raw_loss_history)), 
                 np.min(raw_losses, axis=0),
                 np.max(raw_losses, axis=0),
                 alpha=0.2)
    plt.fill_between(range(len(avg_norm_loss_history)), 
                 np.min(norm_losses, axis=0),
                 np.max(norm_losses, axis=0),
                 alpha=0.2)
    plt.title('Average Validation Loss (5 Runs)')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    
    # Print summary of results
    print("\nNormalization Impact Analysis Results (Averaged over 5 runs):")
    print("---------------------------------------------------------")
    print(f"Average validation accuracy without normalization: {avg_raw_acc:.4f}")
    print(f"Average validation accuracy with normalization: {avg_norm_acc:.4f}")
    print(f"Average accuracy improvement: {avg_improvement:.2f}% ± {std_improvement:.2f}%")
    print(f"\nNormalization produces a statistically {'significant' if avg_improvement > 2*std_improvement else 'modest'} improvement")
    
    return avg_improvement, std_improvement

def simple_cnn_model():
    """Simple CNN model for normalization experiments (unchanged from original)"""
    model = Sequential()
    model.add(Conv2D(16, (3,3), padding='same', input_shape=(32, 32, 1)))
    model.add(LeakyReLU(negative_slope=0.1))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Flatten())
    model.add(Dense(32))
    model.add(LeakyReLU(negative_slope=0.1))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', 
                 optimizer=Adam(learning_rate=0.001), 
                 metrics=['accuracy'])
    return model

# Run the normalization impact analysis
avg_improvement, std_improvement = analyze_normalization_impact(num_runs=5)

# Print the results summary
print(f"Average accuracy improvement with normalization: {avg_improvement:.2f}% ± {std_improvement:.2f}%")

Running normalization impact analysis (5 runs per condition)...

Performing run 1/5
  Run 1 improvement: 2.17%

Performing run 2/5
  Run 2 improvement: 3.79%

Performing run 3/5
  Run 3 improvement: 3.15%

Performing run 4/5
  Run 4 improvement: 9.00%

Performing run 5/5
  Run 5 improvement: 4.40%

Normalization Impact Analysis Results (Averaged over 5 runs):
---------------------------------------------------------
Average validation accuracy without normalization: 0.7834
Average validation accuracy with normalization: 0.8284
Average accuracy improvement: 4.50% ± 2.37%

Normalization produces a statistically modest improvement
Average accuracy improvement with normalization: 4.50% ± 2.37%

def analyze_model_complexity_vs_performance(num_runs=5):
    """
    Systematically analyze the relationship between CNN model complexity and performance
    with multiple runs for statistical reliability.
    
    Creates and trains models with increasing complexity levels, tracking:
    - Parameter counts
    - Training times
    - Validation accuracy
    
    Parameters:
    -----------
    num_runs : int, default=5
        Number of complete experiment repetitions to average results across
    
    Returns:
        tuple: (complexities, avg_param_counts, avg_accuracies, std_accuracies)
    """
    print(f"Running model complexity analysis ({num_runs} runs per complexity level)...")
    
    # Suppress TensorFlow warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  
    # Track metrics across experiments
    complexities = [1, 2, 3, 4]
    all_param_counts = {c: [] for c in complexities}
    all_training_times = {c: [] for c in complexities}
    all_val_accuracies = {c: [] for c in complexities}
    
    # Create explicit validation split once before model training
    from sklearn.model_selection import train_test_split
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    
    # Perform multiple runs of the entire experiment
    for run in range(num_runs):
        print(f"\nPerforming experiment run {run+1}/{num_runs}")
        
        # Run experiment for each complexity level
        for c in complexities:
            print(f"  Training model with complexity level {c}...")
            
            # Clear previous model from memory and set seed based on run
            backend.clear_session()
            seed = 42 + run  # Different seed for each run
            np.random.seed(seed)
            random.seed(seed)
            tf.random.set_seed(seed)
            
            # Create model with appropriate complexity
            model = Sequential()
            
            # First conv block - scale filters with complexity
            model.add(Conv2D(8 * c, (3,3), padding='same', input_shape=(32, 32, 1)))
            model.add(LeakyReLU(0.1))
            model.add(MaxPooling2D(pool_size=(2,2)))
            
            # Optional second conv block based on complexity
            if c >= 2:
                model.add(Conv2D(16 * c, (3,3), padding='same'))
                model.add(LeakyReLU(0.1))
                model.add(MaxPooling2D(pool_size=(2,2)))
            
            # Flatten and dense layers
            model.add(Flatten())
            model.add(Dense(16 * c))
            model.add(LeakyReLU(0.1))
            
            # Add dropout for higher complexity models
            if c >= 3:
                model.add(Dropout(0.3))
            
            # Output layer
            model.add(Dense(10, activation='softmax'))
            
            # Compile with standard settings
            model.compile(loss='categorical_crossentropy', 
                         optimizer=Adam(0.001), 
                         metrics=['accuracy'])
            
            # Record parameter count
            all_param_counts[c].append(model.count_params())
            
            # Measure training time
            start_time = time.time()
            
            # Suppress warning during fit operation
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                # Train model with explicit validation data
                history = model.fit(
                    X_train_split, y_train_split,
                    validation_data=(X_val_split, y_val_split),
                    batch_size=128,
                    epochs=5,
                    verbose=0  # Reduce output noise
                )
            
            # Record metrics
            training_time = time.time() - start_time
            all_training_times[c].append(training_time)
            all_val_accuracies[c].append(history.history['val_accuracy'][-1])
            
            # Clean up to prevent memory issues
            del model
    
    # Calculate average and std deviation of metrics
    avg_param_counts = {c: np.mean(params) for c, params in all_param_counts.items()}
    avg_training_times = {c: np.mean(times) for c, times in all_training_times.items()}
    avg_val_accuracies = {c: np.mean(accs) for c, accs in all_val_accuracies.items()}
    std_val_accuracies = {c: np.std(accs) for c, accs in all_val_accuracies.items()}
    
    # Visualization of experimental results
    fig, axs = plt.subplots(2, 2, figsize=(15, 12))
    
    # Plot 1: Parameters vs Accuracy with error bars
    axs[0, 0].errorbar(
        [avg_param_counts[c] for c in complexities], 
        [avg_val_accuracies[c] for c in complexities],
        yerr=[std_val_accuracies[c] for c in complexities],
        marker='o', linestyle='-'
    )
    axs[0, 0].set_xlabel('Number of Parameters')
    axs[0, 0].set_ylabel('Validation Accuracy')
    axs[0, 0].set_title(f'Model Size vs Accuracy (Averaged Across {num_runs} Runs)')
    axs[0, 0].grid(alpha=0.3)
    
    # Plot 2: Training time vs Accuracy with error bars
    axs[0, 1].errorbar(
        [avg_training_times[c] for c in complexities], 
        [avg_val_accuracies[c] for c in complexities],
        yerr=[std_val_accuracies[c] for c in complexities],
        marker='o', linestyle='-'
    )
    axs[0, 1].set_xlabel('Training Time (seconds)')
    axs[0, 1].set_ylabel('Validation Accuracy')
    axs[0, 1].set_title(f'Training Time vs Accuracy (Averaged Across {num_runs} Runs)')
    axs[0, 1].grid(alpha=0.3)
    
    # Plot 3: Complexity vs Parameters
    axs[1, 0].bar(complexities, [avg_param_counts[c] for c in complexities], yerr=[np.std(all_param_counts[c]) for c in complexities])
    axs[1, 0].set_xlabel('Model Complexity Level')
    axs[1, 0].set_ylabel('Number of Parameters')
    axs[1, 0].set_title('Complexity vs Model Size')
    axs[1, 0].set_xticks(complexities)
    axs[1, 0].grid(axis='y', alpha=0.3)
    
    # Plot 4: Complexity vs Accuracy with error bars
    axs[1, 1].errorbar(
        complexities,
        [avg_val_accuracies[c] for c in complexities],
        yerr=[std_val_accuracies[c] for c in complexities],
        marker='o', linestyle='-'
    )
    axs[1, 1].set_xlabel('Model Complexity Level')
    axs[1, 1].set_ylabel('Validation Accuracy')
    axs[1, 1].set_title(f'Complexity vs Accuracy (Averaged Across {num_runs} Runs)')
    axs[1, 1].set_xticks(complexities)
    axs[1, 1].grid(alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print summary of key findings with standard deviations
    print(f"\nModel Complexity Analysis Results (Averaged Across {num_runs} Runs):")
    print("-------------------------------------------------")
    for c in complexities:
        print(f"Complexity {c}: {avg_param_counts[c]:,} parameters, "
              f"{avg_training_times[c]:.2f}s ± {np.std(all_training_times[c]):.2f}s training time, "
              f"{avg_val_accuracies[c]:.4f} ± {std_val_accuracies[c]:.4f} accuracy")

    # Identify the optimal complexity level based on accuracy per parameter
    efficiency = {c: avg_val_accuracies[c]/avg_param_counts[c] for c in complexities}
    optimal_c = max(efficiency.items(), key=lambda x: x[1])[0]
    print(f"\nOptimal efficiency at complexity level {optimal_c}")
    print(f"Achieves {avg_val_accuracies[optimal_c]*100:.2f}% ± {std_val_accuracies[optimal_c]*100:.2f}% accuracy with {avg_param_counts[optimal_c]:,} parameters")
    
    return (complexities, 
            [avg_param_counts[c] for c in complexities], 
            [avg_val_accuracies[c] for c in complexities], 
            [std_val_accuracies[c] for c in complexities])

# Run the model complexity analysis experiment with 5 runs per complexity level
complexities, param_counts, accuracies, accuracy_stds = analyze_model_complexity_vs_performance(num_runs=5)

# Print a summary of the findings
print("\nSummary of Model Complexity vs Performance:")
print("-----------------------------------------")
for i, complexity in enumerate(complexities):
    print(f"Complexity {complexity}: {param_counts[i]:,} parameters, {accuracies[i]*100:.2f}% ± {accuracy_stds[i]*100:.2f}% accuracy")

# Identify the best accuracy and most efficient model
best_accuracy_idx = np.argmax(accuracies)
best_efficiency_idx = np.argmax([acc/params for acc, params in zip(accuracies, param_counts)])

print(f"\nBest accuracy: Complexity {complexities[best_accuracy_idx]} with {accuracies[best_accuracy_idx]*100:.2f}%")
print(f"Most parameter-efficient: Complexity {complexities[best_efficiency_idx]} with {accuracies[best_efficiency_idx]*100:.2f}% using {param_counts[best_efficiency_idx]:,} parameters")

Running model complexity analysis (5 runs per complexity level)...

Performing experiment run 1/5
  Training model with complexity level 1...
  Training model with complexity level 2...
  Training model with complexity level 3...
  Training model with complexity level 4...

Performing experiment run 2/5
  Training model with complexity level 1...
  Training model with complexity level 2...
  Training model with complexity level 3...
  Training model with complexity level 4...

Performing experiment run 3/5
  Training model with complexity level 1...
  Training model with complexity level 2...
  Training model with complexity level 3...
  Training model with complexity level 4...

Performing experiment run 4/5
  Training model with complexity level 1...
  Training model with complexity level 2...
  Training model with complexity level 3...
  Training model with complexity level 4...

Performing experiment run 5/5
  Training model with complexity level 1...
  Training model with complexity level 2...
  Training model with complexity level 3...
  Training model with complexity level 4...

Model Complexity Analysis Results (Averaged Across 5 Runs):
-------------------------------------------------
Complexity 1: 33,034.0 parameters, 7.96s ± 0.12s training time, 0.7528 ± 0.0187 accuracy
Complexity 2: 70,698.0 parameters, 17.52s ± 0.32s training time, 0.8478 ± 0.0050 accuracy
Complexity 3: 158,650.0 parameters, 23.98s ± 0.62s training time, 0.8622 ± 0.0049 accuracy
Complexity 4: 281,674.0 parameters, 35.48s ± 0.32s training time, 0.8740 ± 0.0028 accuracy

Optimal efficiency at complexity level 1
Achieves 75.28% ± 1.87% accuracy with 33,034.0 parameters

Summary of Model Complexity vs Performance:
-----------------------------------------
Complexity 1: 33,034.0 parameters, 75.28% ± 1.87% accuracy
Complexity 2: 70,698.0 parameters, 84.78% ± 0.50% accuracy
Complexity 3: 158,650.0 parameters, 86.22% ± 0.49% accuracy
Complexity 4: 281,674.0 parameters, 87.40% ± 0.28% accuracy

Best accuracy: Complexity 4 with 87.40%
Most parameter-efficient: Complexity 1 with 75.28% using 33,034.0 parameters

def learning_rate_analysis(num_runs=5):
    """
    Systematically analyze the impact of learning rate on model convergence and accuracy.
    
    Performs multiple training runs for each learning rate to reduce variance in results,
    then averages the outcomes for more reliable conclusions.
    
    Parameters:
    -----------
    num_runs : int, default=5
        Number of complete experiment repetitions to average results across
        
    Returns:
        tuple: (learning_rates, avg_accuracies, std_accuracies)
    """
    print(f"Running learning rate analysis ({num_runs} runs per learning rate)...")
    
    # Create a fixed train/validation split for consistent comparison
    from sklearn.model_selection import train_test_split
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=24
    )
    
    # Experimental configuration
    learning_rates = [0.0001, 0.001, 0.01, 0.1]
    
    # Store results across multiple runs
    all_runs_histories = {lr: [] for lr in learning_rates}
    all_runs_final_accuracies = {lr: [] for lr in learning_rates}
    
    # Perform multiple complete experiment runs
    for run in range(num_runs):
        print(f"\nPerforming experiment run {run+1}/{num_runs}")
        
        # Training loop for models with varying learning rates
        for lr in learning_rates:
            # Reset environment to ensure fair comparison
            backend.clear_session()
            # Use different seed for each run to assess variance
            seed = 24 + run
            np.random.seed(seed)
            tf.random.set_seed(seed)
            random.seed(seed)
            
            # Create consistent model architecture across all trials
            model = Sequential([
                Conv2D(16, (3,3), padding='same', input_shape=(32, 32, 1)),
                LeakyReLU(0.1),
                MaxPooling2D((2,2)),
                Flatten(),
                Dense(32),
                LeakyReLU(0.1),
                Dense(10, activation='softmax')
            ])
            
            # Configure optimizer with experimental learning rate
            model.compile(
                loss='categorical_crossentropy',
                optimizer=Adam(learning_rate=lr),
                metrics=['accuracy']
            )
            
            # Train model with fixed validation data
            print(f"  Training with learning rate: {lr}")
            history = model.fit(
                X_train_split, y_train_split,
                validation_data=(X_val_split, y_val_split),
                epochs=10,
                batch_size=128,
                verbose=0  # Reduce output noise for multiple runs
            )
            
            # Store results for this run
            all_runs_histories[lr].append(history.history)
            all_runs_final_accuracies[lr].append(history.history['val_accuracy'][-1])
            
            # Clean up
            del model
            
    # Average results across runs
    avg_histories = {}
    for lr in learning_rates:
        # Initialize with the structure of a single history
        avg_history = {metric: np.zeros(len(all_runs_histories[lr][0][metric])) 
                      for metric in all_runs_histories[lr][0].keys()}
        
        # Sum up histories from all runs
        for run_history in all_runs_histories[lr]:
            for metric, values in run_history.items():
                avg_history[metric] += np.array(values)
        
        # Divide by number of runs to get average
        for metric in avg_history:
            avg_history[metric] /= num_runs
            
        avg_histories[lr] = avg_history
    
    # Calculate average final accuracies and their standard deviations
    avg_final_accuracies = {lr: np.mean(accs) for lr, accs in all_runs_final_accuracies.items()}
    std_final_accuracies = {lr: np.std(accs) for lr, accs in all_runs_final_accuracies.items()}
    
    # Visualization of experimental results
    plt.figure(figsize=(12, 10))
    
    # Plot 1: Training accuracy comparison
    plt.subplot(2, 1, 1)
    for i, lr in enumerate(learning_rates):
        plt.plot(avg_histories[lr]['accuracy'], label=f'Train (lr={lr})')
    plt.title(f'Average Training Accuracy for Different Learning Rates (Across {num_runs} Runs)')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot 2: Validation accuracy comparison
    plt.subplot(2, 1, 2)
    for i, lr in enumerate(learning_rates):
        plt.plot(avg_histories[lr]['val_accuracy'], label=f'Val (lr={lr})')
    plt.title(f'Average Validation Accuracy for Different Learning Rates (Across {num_runs} Runs)')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

    # Print summary of key findings with standard deviations
    print(f"\nLearning Rate Analysis Results (Averaged Across {num_runs} Runs):")
    print("------------------------------------------------------")
    for lr in learning_rates:
        print(f"Learning Rate {lr}: Final Validation Accuracy = {avg_final_accuracies[lr]:.4f} ± {std_final_accuracies[lr]:.4f}")

    print(f"\nNote: Results represent the average of {num_runs} complete experimental runs with different random seeds.")
    print("Standard deviations (±) indicate the consistency of results across runs.")
    
    return learning_rates, list(avg_final_accuracies.values()), list(std_final_accuracies.values())

# Call the function to run the learning rate analysis experiment
learning_rates, accuracies, accuracy_stds = learning_rate_analysis(num_runs=3)  # Using 3 runs to save time, increase for more reliability

# Create a formatted table to display results
results = pd.DataFrame({
    'Learning Rate': learning_rates,
    'Validation Accuracy': [f"{acc:.4f} ± {std:.4f}" for acc, std in zip(accuracies, accuracy_stds)]
})

# Display the results in a tabular format
display(results.style.set_caption("Learning Rate Impact on Model Performance"))

# Identify the optimal learning rate
best_lr_idx = np.argmax(accuracies)
print(f"\nOptimal learning rate: {learning_rates[best_lr_idx]} with accuracy: {accuracies[best_lr_idx]:.4f} ± {accuracy_stds[best_lr_idx]:.4f}")

# Calculate the performance drop from best to worst learning rate
worst_lr_idx = np.argmin(accuracies)
performance_drop = accuracies[best_lr_idx] - accuracies[worst_lr_idx]
print(f"Performance difference between best and worst learning rates: {performance_drop*100:.2f} percentage points")

Running learning rate analysis (3 runs per learning rate)...

Performing experiment run 1/3
  Training with learning rate: 0.0001
  Training with learning rate: 0.001
  Training with learning rate: 0.01
  Training with learning rate: 0.1

Performing experiment run 2/3
  Training with learning rate: 0.0001
  Training with learning rate: 0.001
  Training with learning rate: 0.01
  Training with learning rate: 0.1

Performing experiment run 3/3
  Training with learning rate: 0.0001
  Training with learning rate: 0.001
  Training with learning rate: 0.01
  Training with learning rate: 0.1

Learning Rate Analysis Results (Averaged Across 3 Runs):
------------------------------------------------------
Learning Rate 0.0001: Final Validation Accuracy = 0.6206 ± 0.0535
Learning Rate 0.001: Final Validation Accuracy = 0.8283 ± 0.0043
Learning Rate 0.01: Final Validation Accuracy = 0.8117 ± 0.0050
Learning Rate 0.1: Final Validation Accuracy = 0.1182 ± 0.0140

Note: Results represent the average of 3 complete experimental runs with different random seeds.
Standard deviations (±) indicate the consistency of results across runs.

Optimal learning rate: 0.001 with accuracy: 0.8283 ± 0.0043
Performance difference between best and worst learning rates: 71.01 percentage points

def batch_size_analysis(num_runs=5):
    """
    Systematically analyze the impact of batch size on training dynamics and model performance.
    
    Performs multiple training runs for each batch size to reduce variance in results,
    providing statistically reliable conclusions about batch size effects.
    
    Parameters:
    -----------
    num_runs : int, default=5
        Number of complete experiment repetitions to average results across
        
    Returns:
        tuple: (batch_sizes, avg_accuracies, std_accuracies)
    """
    print(f"Running batch size analysis ({num_runs} runs per batch size)...")
    
    # Create a fixed train/validation split for consistent comparison
    from sklearn.model_selection import train_test_split
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=24
    )

    # Experimental configuration
    batch_sizes = [16, 64, 256, 1024]
    
    # Store results across multiple runs
    all_runs_histories = {bs: [] for bs in batch_sizes}
    all_runs_final_accuracies = {bs: [] for bs in batch_sizes}
    all_runs_training_times = {bs: [] for bs in batch_sizes}
    
    # Perform multiple complete experiment runs
    for run in range(num_runs):
        print(f"\nPerforming experiment run {run+1}/{num_runs}")
        
        # Training loop for models with varying batch sizes
        for bs in batch_sizes:
            # Reset environment to ensure fair comparison
            backend.clear_session()
            # Use different seed for each run
            seed = 24 + run
            np.random.seed(seed)
            tf.random.set_seed(seed)
            random.seed(seed)
            
            # Create consistent model architecture across all trials
            model = Sequential([
                Conv2D(16, (3,3), padding='same', input_shape=(32, 32, 1)),
                LeakyReLU(0.1),
                MaxPooling2D((2,2)),
                Flatten(),
                Dense(32),
                LeakyReLU(0.1),
                Dense(10, activation='softmax')
            ])
            
            # Use consistent optimizer settings across trials
            model.compile(
                loss='categorical_crossentropy',
                optimizer=Adam(learning_rate=0.001),
                metrics=['accuracy']
            )
            
            # Train model with experimental batch size and measure time
            print(f"  Training with batch size: {bs}")
            start_time = time.time()
            history = model.fit(
                X_train_split, y_train_split,
                validation_data=(X_val_split, y_val_split),
                epochs=5,
                batch_size=bs,
                verbose=0  # Reduce output noise
            )
            training_time = time.time() - start_time
            
            # Store results for this run
            all_runs_histories[bs].append(history.history)
            all_runs_final_accuracies[bs].append(history.history['val_accuracy'][-1])
            all_runs_training_times[bs].append(training_time)
            
            # Clean up
            del model
    
    # Average results across runs
    avg_histories = {}
    for bs in batch_sizes:
        # Initialize with the structure of a single history
        avg_history = {metric: np.zeros(len(all_runs_histories[bs][0][metric])) 
                      for metric in all_runs_histories[bs][0].keys()}
        
        # Sum up histories from all runs
        for run_history in all_runs_histories[bs]:
            for metric, values in run_history.items():
                avg_history[metric] += np.array(values)
        
        # Divide by number of runs to get average
        for metric in avg_history:
            avg_history[metric] /= num_runs
            
        avg_histories[bs] = avg_history
    
    # Calculate average final accuracies, training times and their standard deviations
    avg_final_accuracies = {bs: np.mean(accs) for bs, accs in all_runs_final_accuracies.items()}
    std_final_accuracies = {bs: np.std(accs) for bs, accs in all_runs_final_accuracies.items()}
    avg_training_times = {bs: np.mean(times) for bs, times in all_runs_training_times.items()}
    std_training_times = {bs: np.std(times) for bs, times in all_runs_training_times.items()}
    
    # Visualization of experimental results
    plt.figure(figsize=(15, 12))
    
    # Plot 1: Training stability analysis
    plt.subplot(2, 2, 1)
    for bs in batch_sizes:
        plt.plot(avg_histories[bs]['loss'], label=f'Batch Size = {bs}')
    plt.title(f'Average Training Loss for Different Batch Sizes (Across {num_runs} Runs)')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot 2: Generalization performance analysis
    plt.subplot(2, 2, 2)
    for bs in batch_sizes:
        plt.plot(avg_histories[bs]['val_accuracy'], label=f'Batch Size = {bs}')
    plt.title(f'Average Validation Accuracy for Different Batch Sizes (Across {num_runs} Runs)')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend()
    
    # Plot 3: Final accuracy vs batch size with error bars
    plt.subplot(2, 2, 3)
    accuracies = [avg_final_accuracies[bs] for bs in batch_sizes]
    errors = [std_final_accuracies[bs] for bs in batch_sizes]
    plt.errorbar(batch_sizes, accuracies, yerr=errors, marker='o', linestyle='-')
    plt.xscale('log', base=2)
    plt.title('Final Validation Accuracy vs. Batch Size')
    plt.xlabel('Batch Size (log scale)')
    plt.ylabel('Validation Accuracy')
    plt.grid(alpha=0.3)
    
    # Plot 4: Training time vs batch size with error bars
    plt.subplot(2, 2, 4)
    times = [avg_training_times[bs] for bs in batch_sizes]
    time_errors = [std_training_times[bs] for bs in batch_sizes]
    plt.errorbar(batch_sizes, times, yerr=time_errors, marker='o', linestyle='-')
    plt.xscale('log', base=2)
    plt.title('Training Time vs. Batch Size')
    plt.xlabel('Batch Size (log scale)')
    plt.ylabel('Training Time (s)')
    plt.grid(alpha=0.3)
    
    plt.tight_layout()
    plt.show()

    # Print summary of key findings with standard deviations
    print(f"\nBatch Size Analysis Results (Averaged Across {num_runs} Runs):")
    print("------------------------------------------------------------")
    print("| Batch Size | Validation Accuracy |  Training Time  |")
    print("|------------|---------------------|-----------------|")
    for bs in batch_sizes:
        print(f"| {bs:10d} | {avg_final_accuracies[bs]:.4f} ± {std_final_accuracies[bs]:.4f} | {avg_training_times[bs]:.2f}s ± {std_training_times[bs]:.2f}s |")
        
    # Find optimal batch size (best accuracy)
    best_bs = max(avg_final_accuracies.items(), key=lambda x: x[1])[0]
    print(f"\nBest performing batch size: {best_bs} with {avg_final_accuracies[best_bs]:.4f} ± {std_final_accuracies[best_bs]:.4f} accuracy")
    
    return batch_sizes, list(avg_final_accuracies.values()), list(std_final_accuracies.values())

# Call the function to run the batch size analysis experiment
batch_sizes, accuracies, accuracy_stds = batch_size_analysis(num_runs=3)  # Using 3 runs to save time, increase for more reliability

# Create a formatted table to display results
results = pd.DataFrame({
    'Batch Size': batch_sizes,  
    'Validation Accuracy': [f"{acc:.4f} ± {std:.4f}" for acc, std in zip(accuracies, accuracy_stds)]
})

# Display the results in a tabular format
display(results.style.set_caption("Batch Size Impact on Model Performance"))

# Identify the optimal batch size
best_bs_idx = np.argmax(accuracies)
print(f"\nOptimal batch size: {batch_sizes[best_bs_idx]} with accuracy: {accuracies[best_bs_idx]:.4f} ± {accuracy_stds[best_bs_idx]:.4f}")

# Calculate the performance difference between best and worst batch sizes
worst_bs_idx = np.argmin(accuracies)
performance_diff = accuracies[best_bs_idx] - accuracies[worst_bs_idx]
print(f"Performance difference between best and worst batch sizes: {performance_diff*100:.2f} percentage points")

Running batch size analysis (3 runs per batch size)...

Performing experiment run 1/3
  Training with batch size: 16
  Training with batch size: 64
  Training with batch size: 256
  Training with batch size: 1024

Performing experiment run 2/3
  Training with batch size: 16
  Training with batch size: 64
  Training with batch size: 256
  Training with batch size: 1024

Performing experiment run 3/3
  Training with batch size: 16
  Training with batch size: 64
  Training with batch size: 256
  Training with batch size: 1024

Batch Size Analysis Results (Averaged Across 3 Runs):
------------------------------------------------------------
| Batch Size | Validation Accuracy |  Training Time  |
|------------|---------------------|-----------------|
|         16 | 0.8165 ± 0.0101 | 26.84s ± 0.73s |
|         64 | 0.8036 ± 0.0057 | 11.53s ± 0.28s |
|        256 | 0.7778 ± 0.0084 | 8.43s ± 0.06s |
|       1024 | 0.5802 ± 0.1045 | 7.26s ± 0.17s |

Best performing batch size: 16 with 0.8165 ± 0.0101 accuracy

Optimal batch size: 16 with accuracy: 0.8165 ± 0.0101
Performance difference between best and worst batch sizes: 23.63 percentage points

def measure_inference_performance(num_runs=5):
    """
    Comprehensively benchmark inference speed of different model architectures
    across varying batch sizes, with multiple runs for statistical reliability.
    
    Parameters:
    -----------
    num_runs : int, default=5
        Number of complete experiment repetitions to average results across
        
    Returns:
        dict: Results containing average and std dev of inference times by model and batch size
    """
    print(f"Running inference performance analysis ({num_runs} runs per configuration)...")
    
    # Model collection for comparison
    models = {
        'ANN Model 2': model_2,
        'CNN Model 1': cnn_model_1,
        'CNN Model 2': cnn_model_2
    }
    
    # Experimental configuration
    batch_sizes = [1, 10, 50, 100]
    
    # Results storage structure
    results = {model_name: {bs: {'times': [], 'per_image_times': []} 
                           for bs in batch_sizes} 
              for model_name in models}
    
    # Benchmarking loop for each model and batch size combination
    for model_name, model in models.items():
        print(f"Testing {model_name}...")
        
        for batch_size in batch_sizes:
            print(f"  Batch size {batch_size}: ", end='')
            
            # Execute multiple experiment runs for statistical reliability
            for run in range(num_runs):
                # Prepare appropriate test data format (new samples each run)
                indices = np.random.choice(len(X_test), batch_size, replace=False)
                if model_name.startswith('ANN'):
                    # Flatten the images for ANN model
                    test_batch = X_test[indices].reshape(batch_size, 1024)
                else:
                    test_batch = X_test[indices]
                
                # Perform warmup run to eliminate initialization overhead
                _ = model.predict(test_batch, verbose=0)
                
                # Execute multiple timed runs for precise measurement
                times = []
                for _ in range(10):  # 10 predictions per run for stable measurement
                    start_time = time.time()
                    _ = model.predict(test_batch, verbose=0)
                    times.append(time.time() - start_time)
                
                # Calculate and record timing metrics for this run
                avg_time = np.mean(times)
                results[model_name][batch_size]['times'].append(avg_time)
                results[model_name][batch_size]['per_image_times'].append(avg_time/batch_size)
                
                print(f"{run+1}..", end='')
            print(" Done!")
    
    # Calculate final statistics across all runs
    final_results = {model_name: [] for model_name in models}
    
    for model_name in models:
        for batch_size in batch_sizes:
            times = results[model_name][batch_size]['times']
            per_image_times = results[model_name][batch_size]['per_image_times']
            
            avg_time = np.mean(times)
            std_time = np.std(times)
            avg_per_image = np.mean(per_image_times)
            std_per_image = np.std(per_image_times)
            
            final_results[model_name].append({
                'batch_size': batch_size,
                'avg_time': avg_time,
                'std_time': std_time,
                'avg_per_image': avg_per_image,
                'std_per_image': std_per_image
            })
    
    # Visualization of benchmark results
    plt.figure(figsize=(15, 10))
    
    # Plot 1: Total inference time analysis
    plt.subplot(2, 2, 1)
    for model_name, model_results in final_results.items():
        batch_sizes = [r['batch_size'] for r in model_results]
        total_times = [r['avg_time'] for r in model_results]
        error_bars = [r['std_time'] for r in model_results]
        plt.errorbar(batch_sizes, total_times, yerr=error_bars, marker='o', label=model_name)
    
    plt.title('Total Inference Time by Batch Size')
    plt.xlabel('Batch Size')
    plt.ylabel('Time (seconds)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Plot 2: Per-image efficiency analysis
    plt.subplot(2, 2, 2)
    for model_name, model_results in final_results.items():
        batch_sizes = [r['batch_size'] for r in model_results]
        per_image_times = [r['avg_per_image'] for r in model_results]
        error_bars = [r['std_per_image'] for r in model_results]
        plt.errorbar(batch_sizes, per_image_times, yerr=error_bars, marker='o', label=model_name)
    
    plt.title('Inference Time per Image')
    plt.xlabel('Batch Size')
    plt.ylabel('Time per Image (seconds)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Plot 3: Relative speedup with batch processing
    plt.subplot(2, 2, 3)
    for model_name, model_results in final_results.items():
        batch_sizes = [r['batch_size'] for r in model_results]
        # Calculate speedup relative to single image processing
        speedups = [model_results[0]['avg_per_image'] / r['avg_per_image'] for r in model_results]
        plt.plot(batch_sizes, speedups, marker='o', label=model_name)
    
    plt.title('Batch Processing Efficiency (Speedup Factor)')
    plt.xlabel('Batch Size')
    plt.ylabel('Speedup vs. Single Image')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Plot 4: Model comparison at different batch sizes
    plt.subplot(2, 2, 4)
    x = np.arange(len(models))
    width = 0.2
    
    for i, bs in enumerate([1, 10, 100]):  # Select representative batch sizes
        bs_idx = batch_sizes.index(bs)
        heights = [final_results[model][bs_idx]['avg_per_image'] for model in models]
        errors = [final_results[model][bs_idx]['std_per_image'] for model in models]
        plt.bar(x + (i-1)*width, heights, width, yerr=errors, 
                label=f'Batch Size {bs}')
    
    plt.title('Model Comparison at Different Batch Sizes')
    plt.ylabel('Time per Image (seconds)')
    plt.xlabel('Model Architecture')
    plt.xticks(x, models.keys())
    plt.legend()
    plt.grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Print summary results table
    print(f"\nInference Performance Analysis Results (Averaged Across {num_runs} Runs):")
    print("-" * 80)
    print(f"{'Model':<15} {'Batch Size':<10} {'Inference Time':<20} {'Time per Image':<20}")
    print("-" * 80)
    
    for model_name, model_results in final_results.items():
        for result in model_results:
            bs = result['batch_size']
            t = result['avg_time']
            t_std = result['std_time']
            pi = result['avg_per_image']
            pi_std = result['std_per_image']
            
            print(f"{model_name:<15} {bs:<10d} {t:.4f}s ± {t_std:.4f}s {pi:.4f}s ± {pi_std:.4f}s")
            
    # Find the most efficient configuration for each model
    print("\nOptimal Batch Size for Each Model:")
    for model_name, model_results in final_results.items():
        best_idx = np.argmin([r['avg_per_image'] for r in model_results])
        best_bs = model_results[best_idx]['batch_size']
        best_time = model_results[best_idx]['avg_per_image']
        print(f"{model_name}: Batch size {best_bs} ({best_time:.4f}s per image)")
    
    return final_results

# Run the enhanced inference analysis
inference_results = measure_inference_performance(num_runs=5)

Running inference performance analysis (5 runs per configuration)...
Testing ANN Model 2...
  Batch size 1: 1..2..3..4..5.. Done!
  Batch size 10: 1..2..3..4..5.. Done!
  Batch size 50: 1..2..3..4..5.. Done!
  Batch size 100: 1..2..3..4..5.. Done!
Testing CNN Model 1...
  Batch size 1: 1..2..3..4..5.. Done!
  Batch size 10: 1..2..3..4..5.. Done!
  Batch size 50: 1..2..3..4..5.. Done!
  Batch size 100: 1..2..3..4..5.. Done!
Testing CNN Model 2...
  Batch size 1: 1..2..3..4..5.. Done!
  Batch size 10: 1..2..3..4..5.. Done!
  Batch size 50: 1..2..3..4..5.. Done!
  Batch size 100: 1..2..3..4..5.. Done!

Inference Performance Analysis Results (Averaged Across 5 Runs):
--------------------------------------------------------------------------------
Model           Batch Size Inference Time       Time per Image      
--------------------------------------------------------------------------------
ANN Model 2     1          0.0256s ± 0.0001s 0.0256s ± 0.0001s
ANN Model 2     10         0.0262s ± 0.0013s 0.0026s ± 0.0001s
ANN Model 2     50         0.0272s ± 0.0002s 0.0005s ± 0.0000s
ANN Model 2     100        0.0283s ± 0.0003s 0.0003s ± 0.0000s
CNN Model 1     1          0.0260s ± 0.0001s 0.0260s ± 0.0001s
CNN Model 1     10         0.0273s ± 0.0004s 0.0027s ± 0.0000s
CNN Model 1     50         0.0312s ± 0.0001s 0.0006s ± 0.0000s
CNN Model 1     100        0.0352s ± 0.0003s 0.0004s ± 0.0000s
CNN Model 2     1          0.0263s ± 0.0001s 0.0263s ± 0.0001s
CNN Model 2     10         0.0285s ± 0.0004s 0.0029s ± 0.0000s
CNN Model 2     50         0.0341s ± 0.0004s 0.0007s ± 0.0000s
CNN Model 2     100        0.0419s ± 0.0007s 0.0004s ± 0.0000s

Optimal Batch Size for Each Model:
ANN Model 2: Batch size 100 (0.0003s per image)
CNN Model 1: Batch size 100 (0.0004s per image)
CNN Model 2: Batch size 100 (0.0004s per image)

import keras_tuner as kt
from tensorflow import keras

def build_tunable_model(hp):
    """Define a model with tunable hyperparameters"""
    # Clear previous model from memory
    backend.clear_session()
    
    # Initialize model
    model = Sequential()
    
    # First convolutional layer with tunable parameters
    filters = hp.Int('filters_1', min_value=16, max_value=64, step=16)
    model.add(Conv2D(filters, (3, 3), padding='same', input_shape=(32, 32, 1)))
    
    # Choose activation function
    if hp.Boolean('use_leaky_relu'):
        model.add(LeakyReLU(alpha=hp.Float('leaky_alpha', 0.01, 0.3, step=0.05)))
    else:
        model.add(Activation('relu'))
    
    # First pooling layer
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Optional batch normalization
    if hp.Boolean('use_batch_norm'):
        model.add(BatchNormalization())
    
    # Second convolutional layer
    filters_2 = hp.Int('filters_2', min_value=32, max_value=128, step=32)
    model.add(Conv2D(filters_2, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Flatten and dense layers
    model.add(Flatten())
    model.add(Dense(hp.Int('dense_units', 32, 128, 32)))
    model.add(Activation('relu'))
    
    # Dropout for regularization
    dropout_rate = hp.Float('dropout', 0, 0.5, step=0.1)
    if dropout_rate > 0:
        model.add(Dropout(dropout_rate))
    
    # Output layer
    model.add(Dense(10, activation='softmax'))
    
    # Compile with tunable learning rate
    learning_rate = hp.Choice('learning_rate', [1e-4, 5e-4, 1e-3, 5e-3])
    
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Create a smaller validation dataset for faster tuning
X_train_sample, X_val_tuning, y_train_sample, y_val_tuning = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Further reduce training data size for faster iterations
X_train_sample, _, y_train_sample, _ = train_test_split(
    X_train_sample, y_train_sample, test_size=0.8, random_state=42
)

print(f"Tuning with {X_train_sample.shape[0]} training samples and {X_val_tuning.shape[0]} validation samples")

# Create hyperparameter tuner - using RandomSearch for simplicity
tuner = kt.RandomSearch(
    build_tunable_model,
    objective='val_accuracy',
    max_trials=10,  # Try 10 different combinations
    executions_per_trial=1,
    directory='hyperparameter_tuning',
    project_name='svhn_digits'
)

print("Search space summary:")
tuner.search_space_summary()

# Define early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True
)

# Run the search
tuner.search(
    X_train_sample, y_train_sample,
    validation_data=(X_val_tuning, y_val_tuning),
    epochs=10,
    callbacks=[early_stopping]
)

# Get best hyperparameters and build best model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = build_tunable_model(best_hps)

# Train best model on full dataset
history = best_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=15,
    callbacks=[early_stopping]
)

# Convert test data to one-hot format to match training
from tensorflow.keras.utils import to_categorical
y_test_onehot = to_categorical(y_test, 10)

# Evaluate best model
test_loss, test_acc = best_model.evaluate(X_test, y_test_onehot)
print(f"Best model test accuracy: {test_acc:.4f}")

# Display best hyperparameters
print("\nBest Hyperparameter Configuration:")
for param, value in best_hps.values.items():
    print(f"{param}: {value}")

# Save the optimized model
automl_model_path = save_model(best_model, "AutoML_CNN_Model")

# Visualize training results
plt.figure(figsize=(12, 5))

# Plot accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.title('Accuracy (AutoML Model)')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training')
plt.plot(history.history['val_loss'], label='Validation')
plt.title('Loss (AutoML Model)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Compare with manual models
def compare_with_manual_models():
    """Compare automated optimization results with previous manual tuning"""
    # Get predictions from the AutoML model
    automl_preds = best_model.predict(X_test)
    automl_pred_labels = np.argmax(automl_preds, axis=1)
    
    # Make sure y_test is in the correct format
    if len(y_test.shape) > 1 and y_test.shape[1] > 1:
        y_test_labels = np.argmax(y_test, axis=1)
    else:
        y_test_labels = y_test
    
    # Create confusion matrix
    plot_enhanced_confusion_matrix(
        y_test_labels,  # Use the correct format
        automl_pred_labels, 
        title="AutoML Model: Confusion Matrix"
    )
    
    # Model comparison data
    models = ["ANN Model 1", "ANN Model 2", "CNN Model 1", "CNN Model 2", "AutoML Model"]
    accuracies = [0.60, 0.71, 0.85, 0.89, test_acc]
    
    # Create visualization
    plt.figure(figsize=(12, 7))
    bars = plt.bar(models, accuracies, width=0.7, color=['#5470c6', '#91cc75', '#fac858', '#ee6666', '#73c0de'])
    
    # Add value labels
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2.,
            height + 0.01,
            f'{height:.2f}',
            ha='center', 
            va='bottom',
            fontsize=12
        )
    
    plt.title("Model Accuracy Comparison (Including AutoML)", fontsize=16, fontweight='bold')
    plt.ylabel("Test Accuracy", fontsize=14)
    plt.ylim(0, 1.0)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    print("\nKey Findings from Automated Hyperparameter Optimization:")
    print("-" * 60)
    if test_acc > 0.89:
        print(f"✓ The AutoML model achieved {test_acc:.2f} accuracy, improving upon our best manual model.")
    else:
        print(f"✓ The AutoML model achieved {test_acc:.2f} accuracy, which is {'comparable to' if test_acc > 0.87 else 'lower than'} our best manual model.")
    
    # Access the number of trials using a different attribute
    num_trials = len(tuner.oracle.trials)
    print(f"✓ The search explored {num_trials} different configurations in the time it would take to manually test just a few.")
    print(f"✓ Key hyperparameters identified: filters={best_hps.get('filters_1')}, dropout={best_hps.get('dropout')}, learning_rate={best_hps.get('learning_rate')}")

# Run comparison
compare_with_manual_models()

Tuning with 6720 training samples and 8400 validation samples
Reloading Tuner from hyperparameter_tuning/svhn_digits/tuner0.json
Search space summary:
Search space summary
Default search space size: 8
filters_1 (Int)
{'default': None, 'conditions': [], 'min_value': 16, 'max_value': 64, 'step': 16, 'sampling': 'linear'}
use_leaky_relu (Boolean)
{'default': False, 'conditions': []}
use_batch_norm (Boolean)
{'default': False, 'conditions': []}
filters_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
dense_units (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 128, 'step': 32, 'sampling': 'linear'}
dropout (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.5, 'step': 0.1, 'sampling': 'linear'}
learning_rate (Choice)
{'default': 0.0001, 'conditions': [], 'values': [0.0001, 0.0005, 0.001, 0.005], 'ordered': True}
leaky_alpha (Float)
{'default': 0.01, 'conditions': [], 'min_value': 0.01, 'max_value': 0.3, 'step': 0.05, 'sampling': 'linear'}
Epoch 1/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 10s 9ms/step - accuracy: 0.4013 - loss: 1.7983 - val_accuracy: 0.8144 - val_loss: 0.6889
Epoch 2/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8263 - loss: 0.6254 - val_accuracy: 0.8538 - val_loss: 0.5549
Epoch 3/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8584 - loss: 0.5052 - val_accuracy: 0.8643 - val_loss: 0.5145
Epoch 4/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8746 - loss: 0.4460 - val_accuracy: 0.8669 - val_loss: 0.4945
Epoch 5/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8843 - loss: 0.4047 - val_accuracy: 0.8695 - val_loss: 0.4775
Epoch 6/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.8927 - loss: 0.3709 - val_accuracy: 0.8735 - val_loss: 0.4635
Epoch 7/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9010 - loss: 0.3418 - val_accuracy: 0.8755 - val_loss: 0.4546
Epoch 8/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9099 - loss: 0.3152 - val_accuracy: 0.8792 - val_loss: 0.4475
Epoch 9/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9171 - loss: 0.2917 - val_accuracy: 0.8811 - val_loss: 0.4446
Epoch 10/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9242 - loss: 0.2708 - val_accuracy: 0.8818 - val_loss: 0.4436
Epoch 11/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9298 - loss: 0.2516 - val_accuracy: 0.8837 - val_loss: 0.4442
Epoch 12/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9345 - loss: 0.2332 - val_accuracy: 0.8833 - val_loss: 0.4450
Epoch 13/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 10s 9ms/step - accuracy: 0.9412 - loss: 0.2165 - val_accuracy: 0.8850 - val_loss: 0.4480
Epoch 14/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 11s 11ms/step - accuracy: 0.9458 - loss: 0.2010 - val_accuracy: 0.8860 - val_loss: 0.4515
Epoch 15/15
1050/1050 ━━━━━━━━━━━━━━━━━━━━ 9s 9ms/step - accuracy: 0.9502 - loss: 0.1859 - val_accuracy: 0.8857 - val_loss: 0.4608
563/563 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - accuracy: 0.8853 - loss: 0.4455
Best model test accuracy: 0.8819

Best Hyperparameter Configuration:
filters_1: 16
use_leaky_relu: False
use_batch_norm: True
filters_2: 128
dense_units: 96
dropout: 0.0
learning_rate: 0.0001
leaky_alpha: 0.01
Model saved to models/AutoML_CNN_Model_20250522_185731.keras (recommended Keras format)
Model architecture diagram saved to models/AutoML_CNN_Model_20250522_185731_architecture.png

563/563 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step

Key Misclassification Patterns:
-------------------------------
Digit 0: 94.0% correct, commonly confused with: 2 (1.0%), 4 (0.9%)
Digit 1: 86.0% correct, commonly confused with: 4 (3.7%), 0 (2.2%)
Digit 2: 92.5% correct, commonly confused with: 9 (2.0%), 3 (1.7%)
Digit 3: 87.0% correct, commonly confused with: 5 (3.1%), 2 (2.4%)
Digit 4: 92.4% correct, commonly confused with: 9 (1.4%), 2 (1.2%)
Digit 5: 86.8% correct, commonly confused with: 3 (5.1%), 6 (2.5%)
Digit 6: 84.3% correct, commonly confused with: 8 (4.9%), 5 (3.0%)
Digit 7: 86.6% correct, commonly confused with: 2 (3.9%), 1 (2.5%)
Digit 8: 86.2% correct, commonly confused with: 9 (2.8%), 3 (2.5%)
Digit 9: 86.1% correct, commonly confused with: 2 (2.8%), 0 (2.7%)

Key Findings from Automated Hyperparameter Optimization:
------------------------------------------------------------
✓ The AutoML model achieved 0.88 accuracy, which is comparable to our best manual model.
✓ The search explored 10 different configurations in the time it would take to manually test just a few.
✓ Key hyperparameters identified: filters=16, dropout=0.0, learning_rate=0.0001

def demonstrate_model_serialization():
    """
    Demonstrate saving, loading and using a serialized model for inference.
    
    This function:
    1. Takes the best model (CNN Model 2)
    2. Saves it to disk in multiple formats
    3. Reloads the model
    4. Performs inference with the reloaded model
    5. Compares results with the original model
    
    Returns:
        None: Results are printed to screen
    """
    
    print("Model Serialization and Deployment Demonstration")
    print("=" * 50)
    
    # Save model in both formats (using keras format as default/preferred)
    keras_path = save_model(cnn_model_2, "CNN_Model2_Production")  # Default is keras format
    h5_path = save_model(cnn_model_2, "CNN_Model2_Production_Legacy", save_format='h5')
    
    # Select sample images for testing
    test_indices = np.random.choice(len(X_test), 5, replace=False)
    test_samples = X_test[test_indices]
    true_labels = np.argmax(y_test[test_indices], axis=1) if len(y_test.shape) > 1 else y_test[test_indices]
    
    # Get predictions from original model
    original_preds = cnn_model_2.predict(test_samples)
    original_pred_labels = np.argmax(original_preds, axis=1)
    
    # Load Keras model and test
    print("\nTesting Keras model:")
    keras_model = load_model(keras_path)
    
    # Perform inference with loaded model
    start_time = time.time()
    keras_preds = keras_model.predict(test_samples)
    keras_time = time.time() - start_time
    keras_pred_labels = np.argmax(keras_preds, axis=1)
    
    # Load H5 model and test
    print("\nTesting H5 model:")
    h5_model = load_model(h5_path)
    
    # Perform inference with loaded model
    start_time = time.time()
    h5_preds = h5_model.predict(test_samples)
    h5_time = time.time() - start_time
    h5_pred_labels = np.argmax(h5_preds, axis=1)
    
    # Verify predictions match between models
    keras_match = np.array_equal(original_pred_labels, keras_pred_labels)
    h5_match = np.array_equal(original_pred_labels, h5_pred_labels)
    
    print("\nResults Comparison:")
    print(f"Original model predictions: {original_pred_labels}")
    print(f"Keras model predictions:   {keras_pred_labels} - Match: {keras_match}")
    print(f"H5 model predictions:      {h5_pred_labels} - Match: {h5_match}")
    
    # Display size comparison
    h5_size = os.path.getsize(h5_path) / (1024 * 1024)  # MB
    keras_size = os.path.getsize(keras_path) / (1024 * 1024) if os.path.isfile(keras_path) else get_dir_size(keras_path) / (1024 * 1024)  # MB
    
    print(f"\nModel Size Comparison:")
    print(f"Keras Model: {keras_size:.2f} MB (recommended format)")
    print(f"H5 Model: {h5_size:.2f} MB (legacy format)")
    
    # Visualize sample predictions
    plt.figure(figsize=(15, 3))
    for i, idx in enumerate(range(len(test_samples))):
        plt.subplot(1, 5, i+1)
        plt.imshow(np.squeeze(test_samples[idx]), cmap='gray')
        plt.title(f"True: {true_labels[idx]}\nPred: {keras_pred_labels[idx]}")
        plt.axis('off')
    plt.tight_layout()
    plt.suptitle("Predictions from Loaded Keras Model", y=1.05)
    plt.show()
    
    print("\nModel Serialization Complete!")

# Call the serialization demonstration function
demonstrate_model_serialization()

Model Serialization and Deployment Demonstration
==================================================
Model saved to models/CNN_Model2_Production_20250522_185851.keras (recommended Keras format)

WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`.

Model architecture diagram saved to models/CNN_Model2_Production_20250522_185851_architecture.png
Model saved to models/CNN_Model2_Production_Legacy_20250522_185852.h5 (legacy HDF5 format)
Model architecture diagram saved to models/CNN_Model2_Production_Legacy_20250522_185852_architecture.png
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 32ms/step

Testing Keras model:
Model successfully loaded from models/CNN_Model2_Production_20250522_185851.keras
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 51ms/step

Testing H5 model:

WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.

Model successfully loaded from models/CNN_Model2_Production_Legacy_20250522_185852.h5
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 50ms/step

Results Comparison:
Original model predictions: [4 7 1 3 4]
Keras model predictions:   [4 7 1 3 4] - Match: True
H5 model predictions:      [4 7 1 3 4] - Match: True

Model Size Comparison:
Keras Model: 1.95 MB (recommended format)
H5 Model: 1.95 MB (legacy format)

Model Serialization Complete!

# Load all saved models before comprehensive test evaluation
print("Loading all trained models for comprehensive evaluation...")

# Load ANN models (need to reshape data for ANN input)
try:
    # Find the most recent ANN model files
    import glob
    ann1_files = glob.glob('models/ANN_Model1_*.keras')
    ann2_files = glob.glob('models/ANN_Model2_*.keras')
    
    if ann1_files:
        model_1 = load_model(sorted(ann1_files)[-1])  # Get most recent
        print(f"Loaded ANN Model 1")
    else:
        print("ANN Model 1 not found - will skip in evaluation")
        
    if ann2_files:
        model_2 = load_model(sorted(ann2_files)[-1])  # Get most recent
        print(f"Loaded ANN Model 2")
    else:
        print("ANN Model 2 not found - will skip in evaluation")
        
except Exception as e:
    print(f"Error loading ANN models: {e}")

# Load CNN models
try:
    cnn1_files = glob.glob('models/CNN_Model1_*.keras')
    cnn2_files = glob.glob('models/CNN_Model2_*.keras')
    
    if cnn1_files:
        cnn_model_1 = load_model(sorted(cnn1_files)[-1])
        print(f"Loaded CNN Model 1")
    else:
        print("CNN Model 1 not found - will skip in evaluation")
        
    if cnn2_files:
        cnn_model_2 = load_model(sorted(cnn2_files)[-1])
        print(f"Loaded CNN Model 2")
    else:
        print("CNN Model 2 not found - will skip in evaluation")
        
except Exception as e:
    print(f"Error loading CNN models: {e}")

# Load AutoML model
try:
    automl_files = glob.glob('models/AutoML_CNN_Model_*.keras')
    if automl_files:
        best_model = load_model(sorted(automl_files)[-1])
        print(f"✓ Loaded AutoML Model")
    else:
        print("⚠ AutoML Model not found - will skip in evaluation")
except Exception as e:
    print(f"Error loading AutoML model: {e}")

# Verify data format for test evaluation
print(f"\nData verification:")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

# Ensure we have the correct data format
if len(X_test.shape) == 2:  # If flattened, reshape for CNN models
    X_test = X_test.reshape(-1, 32, 32, 1)
    print(f"Reshaped X_test to: {X_test.shape}")

# Ensure y_test is in one-hot format
if len(y_test.shape) == 1 or (len(y_test.shape) > 1 and y_test.shape[1] == 1):
    y_test = to_categorical(y_test, 10)
    print(f"Converted y_test to one-hot: {y_test.shape}")

print("All models loaded and data prepared for comprehensive evaluation!")

Loading all trained models for comprehensive evaluation...
Model successfully loaded from models/ANN_Model1_20250522_172102.keras
✓ Loaded ANN Model 1
Model successfully loaded from models/ANN_Model2_20250522_172127.keras
✓ Loaded ANN Model 2
Model successfully loaded from models/CNN_Model1_20250522_172430.keras
✓ Loaded CNN Model 1
Model successfully loaded from models/CNN_Model2_Production_20250522_185851.keras
✓ Loaded CNN Model 2
Model successfully loaded from models/AutoML_CNN_Model_20250522_185731.keras
✓ Loaded AutoML Model

Data verification:
X_test shape: (18000, 32, 32, 1)
y_test shape: (18000,)
Converted y_test to one-hot: (18000, 10)
All models loaded and data prepared for comprehensive evaluation!

def comprehensive_test_set_evaluation():
    """
    Systematically evaluate all trained models on the test set for final performance comparison.
    This provides definitive test accuracies to complete the performance summary table.
    """
    print("="*80)
    print("COMPREHENSIVE TEST SET EVALUATION - FINAL MODEL COMPARISON")
    print("="*80)
    
    # Ensure test data is in proper format for evaluation
    # Convert y_test back to one-hot if needed
    if len(y_test.shape) == 1 or (len(y_test.shape) > 1 and y_test.shape[1] == 1):
        y_test_onehot = to_categorical(y_test, 10)
        y_test_labels = y_test if len(y_test.shape) == 1 else y_test.flatten()
    else:
        y_test_onehot = y_test
        y_test_labels = np.argmax(y_test, axis=1)
    
    # Models to evaluate with their required input formats
    models_to_evaluate = [
        ("ANN Model 1", model_1, X_test.reshape(X_test.shape[0], -1)),
        ("ANN Model 2", model_2, X_test.reshape(X_test.shape[0], -1)), 
        ("CNN Model 1", cnn_model_1, X_test),
        ("CNN Model 2", cnn_model_2, X_test),
        ("AutoML Model", best_model, X_test)
    ]
    
    results = []
    
    print(f"{'Model':<15} {'Test Accuracy':<12} {'Test Loss':<10} {'Parameters':<12} {'Efficiency Score':<15}")
    print("-" * 80)
    
    for model_name, model, test_data in models_to_evaluate:
        try:
            # Get test loss and accuracy
            test_loss, test_accuracy = model.evaluate(test_data, y_test_onehot, verbose=0)
            
            # Calculate efficiency metric (accuracy per 100K parameters)
            param_count = model.count_params()
            efficiency = (test_accuracy * 100000) / param_count
            
            # Print results
            print(f"{model_name:<15} {test_accuracy:<12.4f} {test_loss:<10.4f} {param_count:<12,} {efficiency:<15.2f}")
            
            results.append({
                'Model': model_name,
                'Test Accuracy': test_accuracy,
                'Test Loss': test_loss,
                'Parameters': param_count,
                'Efficiency': efficiency
            })
            
        except Exception as e:
            print(f"{model_name:<15} {'ERROR':<12} {'ERROR':<10} {'N/A':<12} {'N/A':<15}")
            print(f"Error evaluating {model_name}: {e}")
    
    print("\n" + "="*80)
    print("FINAL RANKINGS AND INSIGHTS")
    print("="*80)
    
    if results:
        # Sort by test accuracy
        sorted_by_accuracy = sorted(results, key=lambda x: x['Test Accuracy'], reverse=True)
        sorted_by_efficiency = sorted(results, key=lambda x: x['Efficiency'], reverse=True)
        
        print("RANKING BY TEST ACCURACY:")
        for i, result in enumerate(sorted_by_accuracy, 1):
            print(f"  {i}. {result['Model']}: {result['Test Accuracy']*100:.2f}%")
        
        print("\nRANKING BY EFFICIENCY (Accuracy per 100K Parameters):")
        for i, result in enumerate(sorted_by_efficiency, 1):
            print(f"  {i}. {result['Model']}: {result['Efficiency']:.2f}")
        
        # Key insights
        best_model_result = sorted_by_accuracy[0]
        most_efficient = sorted_by_efficiency[0]
        
        print(f"\nKEY INSIGHTS:")
        print(f"   • Best Test Accuracy: {best_model_result['Model']} ({best_model_result['Test Accuracy']*100:.2f}%)")
        print(f"   • Most Efficient: {most_efficient['Model']} (Score: {most_efficient['Efficiency']:.2f})")
        
        # Calculate performance gaps
        if len(sorted_by_accuracy) >= 2:
            gap = (sorted_by_accuracy[0]['Test Accuracy'] - sorted_by_accuracy[1]['Test Accuracy']) * 100
            print(f"   • Performance Gap: {gap:.2f} percentage points between 1st and 2nd place")
        
        # CNN vs ANN comparison
        cnn_results = [r for r in results if 'CNN' in r['Model']]
        ann_results = [r for r in results if 'ANN' in r['Model']]
        
        if cnn_results and ann_results:
            best_cnn = max(cnn_results, key=lambda x: x['Test Accuracy'])
            best_ann = max(ann_results, key=lambda x: x['Test Accuracy'])
            cnn_advantage = (best_cnn['Test Accuracy'] - best_ann['Test Accuracy']) * 100
            print(f"   • CNN Advantage: {cnn_advantage:.2f} percentage points over best ANN")
        
        # Validation vs Test accuracy comparison
        print(f"\nVALIDATION vs TEST ACCURACY COMPARISON:")
        validation_accuracies = {
            'ANN Model 1': 0.60,
            'ANN Model 2': 0.71, 
            'CNN Model 1': 0.85,
            'CNN Model 2': 0.89,
            'AutoML Model': 0.88
        }
        
        for result in results:
            model_name = result['Model']
            if model_name in validation_accuracies:
                val_acc = validation_accuracies[model_name]
                test_acc = result['Test Accuracy']
                diff = (test_acc - val_acc) * 100
                status = "✓ Good generalization" if abs(diff) < 2 else ("⚠ Possible overfitting" if diff < -2 else "⚠ Unusual pattern")
                print(f"   • {model_name}: Val={val_acc:.2f}, Test={test_acc:.2f} ({diff:+.1f}pp) - {status}")
    
    print("\n" + "="*80)
    return results

# Execute the comprehensive test evaluation
print("Conducting final test set evaluation for all models...")
final_test_results = comprehensive_test_set_evaluation()

# Create updated performance summary table
def create_updated_performance_table(test_results):
    """Create an updated performance summary table with test accuracies"""
    
    # Extract test accuracies from results
    test_acc_dict = {result['Model']: result['Test Accuracy'] for result in test_results}
    
    # Create comprehensive comparison table
    performance_data = {
        'Model': ['ANN Model 1', 'ANN Model 2', 'CNN Model 1', 'CNN Model 2', 'AutoML Model'],
        'Architecture': [
            '2 hidden layers (64→32)',
            '5 hidden layers (256→128→64→64→32)', 
            '2 conv layers (16→32) + 1 dense',
            '4 conv layers (16→32→32→64) + 1 dense',
            '2 conv layers (16→128) + 1 dense'
        ],
        'Parameters': [68010, 310186, 267306, 164170, 265000],
        'Validation Accuracy': [0.60, 0.71, 0.85, 0.89, 0.88],
        'Test Accuracy': [
            test_acc_dict.get('ANN Model 1', 0.0),
            test_acc_dict.get('ANN Model 2', 0.0),
            test_acc_dict.get('CNN Model 1', 0.0), 
            test_acc_dict.get('CNN Model 2', 0.0),
            test_acc_dict.get('AutoML Model', 0.0)
        ]
    }
    
    df = pd.DataFrame(performance_data)
    
    # Style the table for better presentation
    styled_df = df.style.set_caption("Complete Model Performance Summary with Test Results") \
                       .format({
                           'Parameters': '{:,}',
                           'Validation Accuracy': '{:.2%}',
                           'Test Accuracy': '{:.2%}'
                       }) \
                       .highlight_max(subset=['Validation Accuracy', 'Test Accuracy'], color='lightgreen') \
                       .set_table_styles([
                           {'selector': 'th', 'props': [('text-align', 'center')]},
                           {'selector': 'td', 'props': [('text-align', 'center')]}
                       ])
    
    display(styled_df)
    
    return df

# Display the updated performance table
print("\nUPDATED PERFORMANCE SUMMARY TABLE:")
print("="*50)
updated_table = create_updated_performance_table(final_test_results)

Conducting final test set evaluation for all models...
================================================================================
COMPREHENSIVE TEST SET EVALUATION - FINAL MODEL COMPARISON
================================================================================
Model           Test Accuracy Test Loss  Parameters   Efficiency Score
--------------------------------------------------------------------------------
ANN Model 1     0.5944       1.2534     68,010       0.87           
ANN Model 2     0.7059       0.9359     310,250      0.23           
CNN Model 1     0.8609       0.8947     267,306      0.32           
CNN Model 2     0.8894       0.5293     164,362      0.54           
AutoML Model    0.8819       0.4688     806,282      0.11           

================================================================================
FINAL RANKINGS AND INSIGHTS
================================================================================
RANKING BY TEST ACCURACY:
  1. CNN Model 2: 88.94%
  2. AutoML Model: 88.19%
  3. CNN Model 1: 86.09%
  4. ANN Model 2: 70.59%
  5. ANN Model 1: 59.44%

RANKING BY EFFICIENCY (Accuracy per 100K Parameters):
  1. ANN Model 1: 0.87
  2. CNN Model 2: 0.54
  3. CNN Model 1: 0.32
  4. ANN Model 2: 0.23
  5. AutoML Model: 0.11

KEY INSIGHTS:
   • Best Test Accuracy: CNN Model 2 (88.94%)
   • Most Efficient: ANN Model 1 (Score: 0.87)
   • Performance Gap: 0.74 percentage points between 1st and 2nd place
   • CNN Advantage: 18.35 percentage points over best ANN

VALIDATION vs TEST ACCURACY COMPARISON:
   • ANN Model 1: Val=0.60, Test=0.59 (-0.6pp) - ✓ Good generalization
   • ANN Model 2: Val=0.71, Test=0.71 (-0.4pp) - ✓ Good generalization
   • CNN Model 1: Val=0.85, Test=0.86 (+1.1pp) - ✓ Good generalization
   • CNN Model 2: Val=0.89, Test=0.89 (-0.1pp) - ✓ Good generalization
   • AutoML Model: Val=0.88, Test=0.88 (+0.2pp) - ✓ Good generalization

================================================================================

UPDATED PERFORMANCE SUMMARY TABLE:
==================================================

	precision	recall	f1-score	support
0	0.728	0.722	0.725	1814
1	0.628	0.736	0.677	1828
2	0.735	0.752	0.743	1803
3	0.732	0.589	0.653	1719
4	0.770	0.758	0.764	1812
5	0.643	0.708	0.674	1768
6	0.714	0.713	0.713	1832
7	0.768	0.751	0.759	1808
8	0.665	0.643	0.654	1812
9	0.701	0.682	0.692	1804

Learning Rate	Validation Accuracy	Training Behavior	Stability
0.0001	0.6206 ± 0.0535	Slow convergence	High
0.001	0.8283 ± 0.0043	Optimal	High
0.01	0.8117 ± 0.0050	Fast but erratic	Medium
0.1	0.1182 ± 0.0140	Failed training	Very Low

Metric	ANN Model 1	ANN Model 2	CNN Model 1	CNN Model 2	AutoML Model
Architecture	2 hidden layers (64→32)	5 hidden layers (256→128→64→64→32)	2 conv layers (16→32) + 1 dense	4 conv layers (16→32→32→64) + 1 dense	2 conv layers (16→128) + 1 dense
Validation Accuracy	60%	71%	85%	89%	88%
Test Accuracy	59.44%	70.59%	86.09%	88.94%	88.19%
Parameters	68,010	310,186	267,306	164,170	~265,000
Training Time	~45s	~1m 36s	~3m 5s	~7m 3s	~6m
Inference Time (1 img)	0.0267s	0.0256s	0.0260s	0.0263s	N/A
Inference Time (100 img)	0.0003s	0.0003s	0.0004s	0.0004s	N/A
Generalization	Excellent (-0.56% gap)	Excellent (-0.41% gap)	Excellent (+1.09% improvement)	Excellent (-0.06% gap)	Excellent (+0.19% improvement)
Key Features	Basic feedforward No regularization	Dropout (0.2) Batch normalization	LeakyReLU(alpha=0.1) Single pooling	Batch normalization Dropout (0.5) Dual pooling	Batch normalization Optimized filters ReLU activation
Efficiency Score	87.4	22.8	32.2	54.2	33.3
Recommended Use Case	Quick prototyping Resource-constrained deployment	When ANNs are required but accuracy matters	Balanced production applications requiring good accuracy-speed ratio	High-accuracy scenarios where precision is paramount	Production deployment with minimal manual tuning

	precision	recall	f1-score	support
0	0.856	0.940	0.896	1814
1	0.926	0.858	0.891	1828
2	0.892	0.905	0.899	1803
3	0.911	0.841	0.875	1719
4	0.882	0.930	0.905	1812
5	0.899	0.884	0.891	1768
6	0.880	0.880	0.880	1832
7	0.918	0.891	0.904	1808
8	0.899	0.853	0.875	1812
9	0.845	0.910	0.876	1804

	Learning Rate	Validation Accuracy
0	0.000100	0.6206 ± 0.0535
1	0.001000	0.8283 ± 0.0043
2	0.010000	0.8117 ± 0.0050
3	0.100000	0.1182 ± 0.0140

	Batch Size	Validation Accuracy
0	16	0.8165 ± 0.0101
1	64	0.8036 ± 0.0057
2	256	0.7778 ± 0.0084
3	1024	0.5802 ± 0.1045

Batch Size	Validation Accuracy	Training Stability	Memory Usage	Processing Speed
16	0.8165 ± 0.0101	Low	Minimal	Slowest
64	0.8036 ± 0.0057	Medium	Low	Medium
256	0.7778 ± 0.0084	High	Medium	Fast
1024	0.5802 ± 0.1045	Very Low	High	Fastest

Model	Accuracy	Inference Time (1 img)	Inference Time (100 imgs)	Relative Speed
ANN Model 2	71%	0.0256s ± 0.0001s	0.0003s ± 0.0000s	~85×
CNN Model 1	85%	0.0260s ± 0.0001s	0.0004s ± 0.0000s	~65×
CNN Model 2	89%	0.0263s ± 0.0001s	0.0004s ± 0.0000s	~66×

	Model	Architecture	Parameters	Validation Accuracy	Test Accuracy
0	ANN Model 1	2 hidden layers (64→32)	68,010	60.00%	59.44%
1	ANN Model 2	5 hidden layers (256→128→64→64→32)	310,186	71.00%	70.59%
2	CNN Model 1	2 conv layers (16→32) + 1 dense	267,306	85.00%	86.09%
3	CNN Model 2	4 conv layers (16→32→32→64) + 1 dense	164,170	89.00%	88.94%
4	AutoML Model	2 conv layers (16→128) + 1 dense	265,000	88.00%	88.19%

Deep Learning: Street View House Number Digit Recognition¶

By: Mohit Pammu, MBA¶

Table of Contents¶

Introduction¶

Executive Summary¶

AI Assistance Attribution Statement¶

Problem Statement & Context¶

Research Context: Situating Our Work¶

Historical Perspective¶

Current Research Landscape¶

Unique Contributions¶

Objective¶

Dataset Overview¶

Data Exploration & Preprocessing¶

Importing Libraries¶

Utility Functions¶

Visualization Functions¶

Dataset Loading & Preparation¶

Dataset Dimensions¶

Visual Data Exploration¶

Dataset Characteristics & Patterns¶

Data Preprocessing Pipeline¶

Pixel Value Normalization¶

Tensor Dimension Verification¶

Label Encoding Transformation¶

Artificial Neural Network Models¶

ANN Architecture Evolution¶

Baseline ANN Design¶

Training Performance Visualization¶

ANN Model 1: Basic Architecture¶

Enhanced ANN Architecture Implementation¶

ANN Model 2 Architecture¶

ANN Model 2 Training Performance¶

ANN Model 2: Enhanced Architecture¶

ANN Model 2 Evaluation¶

Classification Performance Analysis¶

Convolutional Neural Network Models¶

From ANNs to CNNs: Leveraging Spatial Information¶

CNN-Specific Data Preprocessing¶

Dataset Reinitialization for CNN Models¶

Dataset Verification¶

Tensor Reshaping for CNN Input¶

Input Normalization¶

Tensor Shape Verification¶

Target Variable Encoding¶

CNN Architecture Development¶

Foundation CNN Architecture¶

CNN Model 1 Training Performance Visualization¶

CNN Model 1: Basic Architecture¶

Advanced CNN Architecture¶

CNN Model 2 Training Performance Visualization¶

CNN Model 2: Enhanced Architecture¶

CNN Model 2 Performance Evaluation¶

Test Set Prediction & Analysis¶

Classification Performance Analysis¶

Visualization of Convolutional Filters and Feature Maps¶

Observations and Insights:¶

Experiments & Analysis¶

Impact of Normalization on Model Performance¶

Observations and Insights¶

Model Complexity vs. Performance Analysis¶

Observations and Insights:¶

Learning Rate Sensitivity Analysis¶

Observations and Insights:¶

Batch Size Impact Analysis¶

Observations and Insights:¶

Inference Performance Analysis¶

Observations and Insights:¶

Automated Hyperparameter Optimization¶

Observations and Insights:¶

Model Serialization and Deployment¶

Serialization Strategy¶

Model Versioning¶

Deployment Considerations¶

Real-World Applications¶

Urban Navigation and Mapping¶

Document Processing¶

Accessibility Applications¶

Urban Planning and Analytics¶

Learning Insights: From Theory to Practice¶