Fully connected TensorFlow model - Learning curve#

Aims#

  • Ascertain the relationship between training set size and model accuracy

Basic methodology#

  • Models are fitted to each of the five previously split k-fold training and test data sets.

  • MinMax scaling is used (all features are scaled 0-1 based on each feature's min/max in the training set).

  • The model has two hidden layers, each with a number of neurons equal to 2x the number of features. Previous exploration showed that network performance is similar for models of this complexity or greater. A dropout value of 0.5 is used, also based on previous exploration.

  • A batch size of 32 is used (“Friends don’t let friends use mini-batches larger than 32”: Yann LeCun, commenting on the paper arxiv.org/abs/1804.07612).

  • 30 training epochs are used, as previously established.

  • The size of the training set is varied in order to construct the learning curve.

Model structure:

  • Input layer

  • Dense layer (# neurons = 2x features, ReLU activation)

  • Batch normalisation

  • Dropout layer

  • Dense layer (# neurons = 2x features, ReLU activation)

  • Batch normalisation

  • Dropout layer

  • Output layer (single sigmoid activation)

Import libraries#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

# sklearn for pre-processing
from sklearn.preprocessing import MinMaxScaler

# TensorFlow (Keras API) model
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy

Function to scale data (MinMax scaling)#

def scale_data(X_train, X_test):
    """Scale data 0-1 based on min and max in training set"""
    
    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_sc = sc.transform(X_train)
    test_sc = sc.transform(X_test)
    
    return train_sc, test_sc  
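
A quick usage check can confirm the intended behaviour: the scaler is fitted on the training data alone and then applied to both sets. This is a minimal sketch with made-up values (the arrays below are illustrative only, not from the stroke data):

# Illustrative check of scale_data (toy values, not from the stroke data)
demo_train = np.array([[0.0, 10.0],
                       [5.0, 20.0],
                       [10.0, 30.0]])
demo_test = np.array([[2.5, 40.0]])

demo_train_sc, demo_test_sc = scale_data(demo_train, demo_test)
print(demo_train_sc)   # each column scaled 0-1 using training min/max
print(demo_test_sc)    # test values outside the training range may fall outside 0-1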

Define neural net#

def make_net(number_features, expansion=2, learning_rate=0.003, dropout=0.5):
    
    # Clear TensorFlow session
    K.clear_session()
    
    # Input layer
    inputs = layers.Input(shape=(number_features,))
    
    # Dense layer 1
    dense_1 = layers.Dense(
        number_features * expansion, activation='relu')(inputs)
    norm_1 = layers.BatchNormalization()(dense_1)
    dropout_1 = layers.Dropout(dropout)(norm_1)
    
    # Dense layer 2
    dense_2 = layers.Dense(
        number_features * expansion, activation='relu')(dropout_1)
    norm_2 = layers.BatchNormalization()(dense_2)
    dropout_2 = layers.Dropout(dropout)(norm_2)    
 
    # Output (single sigmoid)
    outputs = layers.Dense(1, activation='sigmoid')(dropout_2)
    
    # Build net
    net = Model(inputs, outputs)
    
    # Compile model
    opt = Adam(learning_rate=learning_rate)
    net.compile(loss='binary_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])
    return net
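
As a quick structural check, the Keras summary can be printed to confirm the layer stack matches the structure listed above. This is a sketch only; the feature count of 10 below is an arbitrary example, not the real number of features:

# Illustrative check of the network structure (10 features is an arbitrary example)
demo_model = make_net(number_features=10)
demo_model.summary()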

Run k-fold validation with varying training set sizes#

# Set up list to collect results
results_training_size = []
results_accuracy = []
results_all_accuracy = []

# Get maximum training size (number of training records)
train_data = pd.read_csv('../data/kfold_5fold/train_0.csv')
max_training_size = train_data.shape[0]

# Construct training sizes (values closer at lower end)
train_sizes = [50, 100, 250, 500, 1000, 2500]
for i in range(5000, max_training_size, 5000):
    train_sizes.append(i)
    
# Loop through training sizes
for train_size in train_sizes:
    
    # Record accuracy across k-fold replicates
    replicate_accuracy = []

    for k in range(5):
        
        # Load data
        train = pd.read_csv(f'../data/kfold_5fold/train_{k}.csv')
        test = pd.read_csv(f'../data/kfold_5fold/test_{k}.csv')
        
        # OneHot encode stroke team
        coded = pd.get_dummies(train['StrokeTeam'])
        train = pd.concat([train, coded], axis=1)
        train.drop('StrokeTeam', inplace=True, axis=1)
        coded = pd.get_dummies(test['StrokeTeam'])
        test = pd.concat([test, coded], axis=1)
        test.drop('StrokeTeam', inplace=True, axis=1)
        
        # Sample from training data
        train = train.sample(n=train_size)

        # Split into X, y
        X_train_df = train.drop('S2Thrombolysis',axis=1) 
        y_train_df = train['S2Thrombolysis']
        X_test_df = test.drop('S2Thrombolysis',axis=1) 
        y_test_df = test['S2Thrombolysis'] 

        # Convert to NumPy
        X_train = X_train_df.values
        X_test = X_test_df.values
        y_train = y_train_df.values
        y_test = y_test_df.values
        
        # Scale data
        X_train_sc, X_test_sc = scale_data(X_train, X_test)

        # Define network
        number_features = X_train_sc.shape[1]
        model = make_net(number_features)
       
        # Train model
        history = model.fit(X_train_sc,
                            y_train,
                            epochs=30,
                            batch_size=32,
                            validation_data=(X_test_sc, y_test),
                            verbose=0)

        # Predict test set
        probability = model.predict(X_test_sc)
        y_pred_test = probability >= 0.5
        y_pred_test = y_pred_test.flatten()
        accuracy_test = np.mean(y_pred_test == y_test)
        replicate_accuracy.append(accuracy_test)
        results_all_accuracy.append(accuracy_test)
    
    # Store mean accuracy across the k-fold splits
    mean_accuracy = np.mean(replicate_accuracy)
    results_accuracy.append(mean_accuracy)
    results_training_size.append(train_size)
    
    # Print output
    print (f'Training set size {train_size}, accuracy: {mean_accuracy:0.3f}')

k_fold_accuracy = np.array(results_all_accuracy).reshape(len(train_sizes), 5)    
Training set size 50, accuracy: 0.669
Training set size 100, accuracy: 0.713
Training set size 250, accuracy: 0.761
Training set size 500, accuracy: 0.759
Training set size 1000, accuracy: 0.765
Training set size 2500, accuracy: 0.794
Training set size 5000, accuracy: 0.808
Training set size 10000, accuracy: 0.823
Training set size 15000, accuracy: 0.828
Training set size 20000, accuracy: 0.834
Training set size 25000, accuracy: 0.837
Training set size 30000, accuracy: 0.837
Training set size 35000, accuracy: 0.840
Training set size 40000, accuracy: 0.834
Training set size 45000, accuracy: 0.839
Training set size 50000, accuracy: 0.839
Training set size 55000, accuracy: 0.840
Training set size 60000, accuracy: 0.839
Training set size 65000, accuracy: 0.843
Training set size 70000, accuracy: 0.844
fig = plt.figure(figsize=(10,5))

ax1 = fig.add_subplot(121)

for i in range(5):
    ax1.plot(results_training_size, k_fold_accuracy[:, i])

ax1.set_xlabel('Training set size')
ax1.set_ylabel('Accuracy')

# Focus on first 5000
ax2 = fig.add_subplot(122)
for i in range(5):
    ax2.plot(results_training_size, k_fold_accuracy[:, i])

ax2.set_xlabel('Training set size')
ax2.set_ylabel('Accuracy')
ax2.set_xlim(0, 5000)

plt.tight_layout()
plt.savefig('./output/nn_fc_learning_curve.jpg', dpi=300)
plt.show()
[Figure: test-set accuracy against training set size for each of the five k-fold splits; the left panel shows the full range of training set sizes, the right panel focuses on the first 5,000 training samples.]

Observations#

  • Test-set accuracy rises rapidly with training set size, and then increases only slightly beyond 25,000 training samples (quantified in the sketch below).
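
To put a number on this plateau, the incremental change in mean accuracy for each step up in training set size can be printed from the results collected above. This is a minimal sketch using the `results_training_size` and `results_accuracy` lists from the loop above; exact values will vary between runs:

# Print the change in mean accuracy between successive training set sizes
for i in range(1, len(results_training_size)):
    gain = results_accuracy[i] - results_accuracy[i - 1]
    print(f'{results_training_size[i-1]} -> {results_training_size[i]}: '
          f'change in accuracy {gain:+.3f}')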