Ensemble combine with a neural network#

Aim: To combine output from logistic regression, random forests, and neural network models in a neural network model, with or without original features.

Import libraries#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

# TensorFlow api model
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy

Function to normalise data#

def scale_data(X_train, X_test):
    """Normalise features to the 0-1 range using training-set statistics.

    The scaler is fitted on the training data only, so the test set is
    transformed with the training min/max and no information leaks from
    the test set into the fit.

    Parameters
    ----------
    X_train : array-like of training features
    X_test : array-like of test features

    Returns
    -------
    tuple
        (scaled training array, scaled test array)
    """

    # Fit on the training set and transform it in one step
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(X_train)

    # Apply the training-set scaling to the test set
    test_scaled = scaler.transform(X_test)

    return train_scaled, test_scaled

Define neural net#

def make_net(number_features, expansion=2, learning_rate=0.003, dropout=0.5):
    """Build and compile a two-hidden-layer binary classification network.

    Parameters
    ----------
    number_features : int
        Number of input features.
    expansion : int, default 2
        Hidden layer width as a multiple of ``number_features``.
    learning_rate : float, default 0.003
        Learning rate for the Adam optimizer.
    dropout : float, default 0.5
        Dropout rate applied after each hidden layer.

    Returns
    -------
    keras.Model
        Compiled model with a single sigmoid output unit.
    """

    # Clear any previous TensorFlow graph/session state
    K.clear_session()

    # Input layer (shape is a tuple of per-sample dimensions)
    inputs = layers.Input(shape=(number_features,))

    # Dense layer 1: linear -> ReLU -> batch norm -> dropout
    dense_1 = layers.Dense(
        number_features * expansion, activation='relu')(inputs)
    norm_1 = layers.BatchNormalization()(dense_1)
    dropout_1 = layers.Dropout(dropout)(norm_1)

    # Dense layer 2: same structure as layer 1
    dense_2 = layers.Dense(
        number_features * expansion, activation='relu')(dropout_1)
    norm_2 = layers.BatchNormalization()(dense_2)
    dropout_2 = layers.Dropout(dropout)(norm_2)

    # Output: single sigmoid unit for binary classification
    outputs = layers.Dense(1, activation='sigmoid')(dropout_2)

    # Build net
    net = Model(inputs, outputs)

    # Compile model.
    # Note: the 'lr' keyword is deprecated (and removed in recent Keras);
    # 'learning_rate' is the supported argument name.
    opt = Adam(learning_rate=learning_rate)
    net.compile(loss='binary_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])
    return net

Fit a model using original training data only#

# Load the 10k-patient training and test cohorts
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')

# One-hot encode stroke team (encoded separately for train and test;
# assumes both sets contain the same teams so columns align -- TODO confirm)
coded = pd.get_dummies(train['StrokeTeam'])
train = pd.concat([train, coded], axis=1)
train.drop('StrokeTeam', inplace=True, axis=1)
coded = pd.get_dummies(test['StrokeTeam'])
test = pd.concat([test, coded], axis=1)
test.drop('StrokeTeam', inplace=True, axis=1)

# Split into X (features) and y (thrombolysis label)
X_train_df = train.drop('S2Thrombolysis',axis=1) 
y_train_df = train['S2Thrombolysis']
X_test_df = test.drop('S2Thrombolysis',axis=1) 
y_test_df = test['S2Thrombolysis'] 

# Convert to NumPy arrays
X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values
y_test = y_test_df.values

# Scale features 0-1 based on training-set min/max
X_train_sc, X_test_sc = scale_data(X_train, X_test)

# Define network sized to the number of input features
number_features = X_train_sc.shape[1]
model = make_net(number_features)

# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.h5', save_best_only=True)

# Define early stopping callback: stop after 50 epochs with no improvement
# in validation loss (the default monitored metric) and restore best weights
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)

# Train model (no class weights passed; early stopping limits the
# effective number of epochs)
history = model.fit(X_train_sc,
                    y_train,
                    epochs=5000,
                    batch_size=32,
                    validation_data=(X_test_sc, y_test),
                    verbose=0,
                    callbacks=[checkpoint_cb, early_stopping_cb])
    
# Get predicted probabilities (flatten (n, 1) output to 1-D)
y_train_probs = model.predict(X_train_sc)
y_train_probs = y_train_probs.flatten()
y_test_probs = model.predict(X_test_sc)
y_test_probs = y_test_probs.flatten()
                                    
# Classify at a 0.5 probability threshold and report accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.862
Test accuracy: 0.834

Fit a model using model probabilities only#

# Load the per-model predicted probabilities (logistic regression, random
# forests, neural network) for the same train/test cohorts
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv')
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')

# Set up train and test data: X is the model probabilities only
X_train = model_probs_train
X_test = model_probs_test
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# Convert to NumPy.
# BUG FIX: the original converted X_train_df / X_test_df here (the full
# original feature sets left over from the previous section), silently
# discarding the model probabilities this section is meant to fit on.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

# Scale features 0-1 based on training-set min/max
X_train_sc, X_test_sc = scale_data(X_train, X_test)

# Define network sized to the number of input features
number_features = X_train_sc.shape[1]
model = make_net(number_features)

# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.h5', save_best_only=True)

# Define early stopping callback: stop after 50 epochs with no improvement
# in validation loss (the default monitored metric) and restore best weights
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)

# Train model (no class weights passed)
history = model.fit(X_train_sc,
                    y_train,
                    epochs=5000,
                    batch_size=32,
                    validation_data=(X_test_sc, y_test),
                    verbose=0,
                    callbacks=[checkpoint_cb, early_stopping_cb])
    
# Get predicted probabilities (flatten (n, 1) output to 1-D)
y_train_probs = model.predict(X_train_sc)
y_train_probs = y_train_probs.flatten()
y_test_probs = model.predict(X_test_sc)
y_test_probs = y_test_probs.flatten()
                                    
# Classify at a 0.5 probability threshold and report accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.869
Test accuracy: 0.835

Fit a model using original features and model probabilities#

# Reload raw cohorts plus the per-model predicted probabilities
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv')
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')

# One-hot encode stroke team (encoded separately for train and test;
# assumes both sets contain the same teams so columns align -- TODO confirm)
coded = pd.get_dummies(train['StrokeTeam'])
train = pd.concat([train, coded], axis=1)
train.drop('StrokeTeam', inplace=True, axis=1)
coded = pd.get_dummies(test['StrokeTeam'])
test = pd.concat([test, coded], axis=1)
test.drop('StrokeTeam', inplace=True, axis=1)

# Split into X (features) and y (thrombolysis label)
X_train_df = train.drop('S2Thrombolysis',axis=1) 
y_train_df = train['S2Thrombolysis']
X_test_df = test.drop('S2Thrombolysis',axis=1) 
y_test_df = test['S2Thrombolysis']

# Append model probabilities as extra feature columns (row-aligned concat)
X_train_df = pd.concat([X_train_df, model_probs_train], axis=1)
X_test_df = pd.concat([X_test_df, model_probs_test], axis=1)

# Convert to NumPy arrays
X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values
y_test = y_test_df.values

# Scale features 0-1 based on training-set min/max
X_train_sc, X_test_sc = scale_data(X_train, X_test)

# Define network sized to the number of input features
number_features = X_train_sc.shape[1]
model = make_net(number_features)

# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.h5', save_best_only=True)

# Define early stopping callback: stop after 50 epochs with no improvement
# in validation loss (the default monitored metric) and restore best weights
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)

# Train model (no class weights passed)
history = model.fit(X_train_sc,
                    y_train,
                    epochs=5000,
                    batch_size=32,
                    validation_data=(X_test_sc, y_test),
                    verbose=0,
                    callbacks=[checkpoint_cb, early_stopping_cb])
    
# Get predicted probabilities (flatten (n, 1) output to 1-D)
y_train_probs = model.predict(X_train_sc)
y_train_probs = y_train_probs.flatten()
y_test_probs = model.predict(X_test_sc)
y_test_probs = y_test_probs.flatten()
                                    
# Classify at a 0.5 probability threshold and report accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.830

Observations#

  • Including the probability outputs from the previous logistic regression, random forests, and neural network models as inputs did not improve accuracy compared to fitting a neural network on the original data alone.