Ensemble combine with a neural network#
Aim: To combine the outputs of logistic regression, random forest, and neural network models in a single neural network model ('stacking'), with or without the original features.
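As a sketch of the idea (illustrative arrays, not the notebook's data), the combined model simply treats the base models' predicted probabilities as input features, optionally alongside the original features:

import numpy as np
demo_X_original = np.random.rand(5, 3)   # 5 patients, 3 original features
demo_model_probs = np.random.rand(5, 3)  # probabilities from 3 base models
X_probs_only = demo_model_probs                              # model outputs only
X_combined = np.hstack([demo_X_original, demo_model_probs])  # outputs + features
print(X_combined.shape)  # (5, 6)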
Import libraries#
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# TensorFlow (Keras functional API)
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy
Function to normalise data#
def scale_data(X_train, X_test):
    """Scale data 0-1 based on min and max in training set"""
    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()
    # Fit the scaler on the training set only
    sc.fit(X_train)
    # Apply the scaler to the training and test sets
    train_sc = sc.transform(X_train)
    test_sc = sc.transform(X_test)
    return train_sc, test_sc
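As a quick illustration of the scaler's behaviour (toy arrays, not the notebook's data), values are mapped to 0-1 using the training set's min and max, so test values outside that range fall outside 0-1:

demo_train = np.array([[0.0], [5.0], [10.0]])
demo_test = np.array([[2.5], [12.0]])
demo_train_sc, demo_test_sc = scale_data(demo_train, demo_test)
print(demo_train_sc.flatten())  # [0.  0.5 1. ]
print(demo_test_sc.flatten())   # [0.25 1.2 ]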
Define neural net#
def make_net(number_features, expansion=2, learning_rate=0.003, dropout=0.5):
    # Clear any previous TensorFlow session
    K.clear_session()
    # Input layer
    inputs = layers.Input(shape=(number_features,))
    # Dense layer 1
    dense_1 = layers.Dense(
        number_features * expansion, activation='relu')(inputs)
    norm_1 = layers.BatchNormalization()(dense_1)
    dropout_1 = layers.Dropout(dropout)(norm_1)
    # Dense layer 2
    dense_2 = layers.Dense(
        number_features * expansion, activation='relu')(dropout_1)
    norm_2 = layers.BatchNormalization()(dense_2)
    dropout_2 = layers.Dropout(dropout)(norm_2)
    # Output (single sigmoid for binary classification)
    outputs = layers.Dense(1, activation='sigmoid')(dropout_2)
    # Build net
    net = Model(inputs, outputs)
    # Compile model
    opt = Adam(learning_rate=learning_rate)
    net.compile(loss='binary_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])
    return net
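As a quick sanity check (not part of the original notebook), a net built for ten input features should show two hidden blocks of 20 neurons (Dense, BatchNormalization, Dropout) followed by a single sigmoid output:

demo_net = make_net(number_features=10)
demo_net.summary()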
Fit a model using original training data only#
# Load data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
# OneHot encode stroke team
coded = pd.get_dummies(train['StrokeTeam'])
train = pd.concat([train, coded], axis=1)
train.drop('StrokeTeam', inplace=True, axis=1)
coded = pd.get_dummies(test['StrokeTeam'])
test = pd.concat([test, coded], axis=1)
test.drop('StrokeTeam', inplace=True, axis=1)
# Split into X, y
X_train_df = train.drop('S2Thrombolysis', axis=1)
y_train_df = train['S2Thrombolysis']
X_test_df = test.drop('S2Thrombolysis', axis=1)
y_test_df = test['S2Thrombolysis']
# Convert to NumPy
X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values
y_test = y_test_df.values
# Scale data
X_train_sc, X_test_sc = scale_data(X_train, X_test)
# Define network
number_features = X_train_sc.shape[1]
model = make_net(number_features)
# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.h5', save_best_only=True)
# Define early stopping callback: stop when no validation improvement,
# and restore weights from the best validation epoch
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)
# Train model
history = model.fit(X_train_sc,
                    y_train,
                    epochs=5000,
                    batch_size=32,
                    validation_data=(X_test_sc, y_test),
                    verbose=0,
                    callbacks=[checkpoint_cb, early_stopping_cb])
# Get predicted probabilities
y_train_probs = model.predict(X_train_sc)
y_train_probs = y_train_probs.flatten()
y_test_probs = model.predict(X_test_sc)
y_test_probs = y_test_probs.flatten()
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.862
Test accuracy: 0.834
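The history object returned by model.fit() is not otherwise used, but plotting it is a useful check that early stopping behaved sensibly (a small addition; matplotlib is already imported above):

plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Binary cross-entropy loss')
plt.legend()
plt.show()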
Fit a model using model probabilities only#
# Load data
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv')
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')
# Set up train and test data (model probabilities as features)
X_train = model_probs_train
X_test = model_probs_test
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']
# Convert to NumPy
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values
# Scale data
X_train_sc, X_test_sc = scale_data(X_train, X_test)
# Define network
number_features = X_train_sc.shape[1]
model = make_net(number_features)
# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.h5', save_best_only=True)
# Define early stopping callback: stop when no validation improvement,
# and restore weights from the best validation epoch
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)
# Train model
history = model.fit(X_train_sc,
                    y_train,
                    epochs=5000,
                    batch_size=32,
                    validation_data=(X_test_sc, y_test),
                    verbose=0,
                    callbacks=[checkpoint_cb, early_stopping_cb])
# Get predicted probabilities
y_train_probs = model.predict(X_train_sc)
y_train_probs = y_train_probs.flatten()
y_test_probs = model.predict(X_test_sc)
y_test_probs = y_test_probs.flatten()
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.869
Test accuracy: 0.835
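One way to see why stacking the probabilities adds little: if the base models' outputs are highly correlated they carry largely redundant information. A quick check (assuming probabilities_train.csv holds one column of probabilities per base model):

print(model_probs_train.corr())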
Fit a model using original features and model probabilities#
# Load data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv')
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')
# OneHot encode stroke team
coded = pd.get_dummies(train['StrokeTeam'])
train = pd.concat([train, coded], axis=1)
train.drop('StrokeTeam', inplace=True, axis=1)
coded = pd.get_dummies(test['StrokeTeam'])
test = pd.concat([test, coded], axis=1)
test.drop('StrokeTeam', inplace=True, axis=1)
# Split into X, y
X_train_df = train.drop('S2Thrombolysis', axis=1)
y_train_df = train['S2Thrombolysis']
X_test_df = test.drop('S2Thrombolysis', axis=1)
y_test_df = test['S2Thrombolysis']
# Add in model probabilities
X_train_df = pd.concat([X_train_df, model_probs_train], axis=1)
X_test_df = pd.concat([X_test_df, model_probs_test], axis=1)
# Convert to NumPy
X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values
y_test = y_test_df.values
# Scale data
X_train_sc, X_test_sc = scale_data(X_train, X_test)
# Define network
number_features = X_train_sc.shape[1]
model = make_net(number_features)
# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint.h5', save_best_only=True)
# Define early stopping callback: stop when no validation improvement,
# and restore weights from the best validation epoch
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)
# Train model
history = model.fit(X_train_sc,
                    y_train,
                    epochs=5000,
                    batch_size=32,
                    validation_data=(X_test_sc, y_test),
                    verbose=0,
                    callbacks=[checkpoint_cb, early_stopping_cb])
# Get predicted probabilities
y_train_probs = model.predict(X_train_sc)
y_train_probs = y_train_probs.flatten()
y_test_probs = model.predict(X_test_sc)
y_test_probs = y_test_probs.flatten()
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.830
Observations#
Including the probability outputs of the previously fitted logistic regression, random forest, and neural network models did not improve accuracy compared with fitting a neural network on the original features alone (test accuracy 0.835 vs 0.834). Adding the model probabilities alongside the original features gave perfect training accuracy (1.000) but slightly worse test accuracy (0.830), suggesting overfitting.
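For comparison, a minimal sketch (not run in this notebook) of the simplest combination method: average the base models' probabilities across columns and classify at 0.5.

# Average base model probabilities (one column per model)
y_test_mean_prob = model_probs_test.values.mean(axis=1)
test_class_mean = y_test_mean_prob >= 0.5
accuracy_mean = np.mean(y_test == test_class_mean)
print(f'Test accuracy (mean of probabilities): {accuracy_mean:0.3f}')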