Ensemble model fitting - generate probabilities#
Aims:
To generate probability outputs for training and test data using logistic regression, random forests, and a neural network.
Ensemble modelling combines multiple models in an attempt to increase accuracy. In this notebook we use the training/test sets with a 10k test cohort. Individual models are built using:
Random Forests (with one-hot encoding of hospital)
Logistic Regression (with one-hot encoding of hospital)
Neural Net (with an embedding layer to encode the hospital)
This notebook runs fits for each type of model and returns the predicted probability of receiving thrombolysis for each model.
In subsequent notebooks, these individual model outputs will be used as inputs to ensemble models.
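As a preview of where these outputs go, here is a minimal sketch of one possible combination: simple averaging of the per-model probabilities. The column names match those created later in this notebook, but the blending method itself is illustrative, not the ensemble used in subsequent notebooks.
import pandas as pd

def mean_ensemble(probabilities):
    """Blend individual model outputs by averaging their probabilities."""
    model_cols = ['logistic_regression', 'random_forest', 'neural_net']
    return probabilities[model_cols].mean(axis=1)

# Toy example with made-up probabilities for two patients
demo = pd.DataFrame({'logistic_regression': [0.2, 0.9],
                     'random_forest': [0.4, 0.8],
                     'neural_net': [0.3, 0.7]})
print(mean_ensemble(demo))  # 0.3 and 0.8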
Import libraries#
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy
Set up dataframes for probability outputs#
probability_train = pd.DataFrame()
probability_test = pd.DataFrame()
Functions#
Standardise data#
def standardise_data(X_train, X_test):
    """
    Converts all data to a similar scale.
    Standardisation subtracts the mean and divides by the standard deviation
    for each feature.
    Standardised data will have a mean of 0 and a standard deviation of 1.
    The training data mean and standard deviation are used to standardise both
    training and test set data.
    """
    # Initialise a new scaling object for standardising input data
    sc = StandardScaler()
    # Fit the scaler on the training set only
    sc.fit(X_train)
    # Apply the scaler to the training and test sets
    train_std = sc.transform(X_train)
    test_std = sc.transform(X_test)
    return train_std, test_std
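A quick usage sketch with toy data (values are illustrative):
import numpy as np

toy_train = np.array([[1.0, 10.0],
                      [3.0, 30.0]])
toy_test = np.array([[2.0, 20.0]])
toy_train_std, toy_test_std = standardise_data(toy_train, toy_test)
print(toy_train_std)  # each column now has mean 0 and standard deviation 1
print(toy_test_std)   # scaled with the training mean/sd: [[0. 0.]]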
MinMax scaling#
def scale_data(X_train, X_test):
"""Scale data 0-1 based on min and max in training set"""
# Initialise a new scaling object for normalising input data
sc = MinMaxScaler()
# Set up the scaler just on the training set
sc.fit(X_train)
# Apply the scaler to the training and test sets
train_sc = sc.transform(X_train)
test_sc = sc.transform(X_test)
return train_sc, test_sc
Logistic regression model#
# Read in data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']
# One-hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix='team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix='team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)
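Note that pd.get_dummies is applied to the training and test sets separately, so if a stroke team appeared in only one of the two sets the column sets would differ and the scaling step below would fail. A defensive sketch (not in the original code) is to align the test columns to the training columns; the same consideration applies to the random forest section below.
# Align test columns to training columns: teams missing from the test set
# get an all-zero column; test-only teams are dropped
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)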
# Standardise X data
X_train_std, X_test_std = standardise_data(X_train, X_test)
# Define and fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)
# Get predicted probabilities
y_train_probs = model.predict_proba(X_train_std)[:,1]
y_test_probs = model.predict_proba(X_test_std)[:,1]
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.834
Test accuracy: 0.832
Add the predicted probabilities to the results dataframes:
probability_train['logistic_regression'] = y_train_probs
probability_test['logistic_regression'] = y_test_probs
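Accuracy at a fixed 0.5 threshold is only one view of these probabilities. As an optional sanity check (not part of the original pipeline), ROC AUC measures how well the probabilities rank thrombolysed above non-thrombolysed patients:
from sklearn.metrics import roc_auc_score

# Optional check: how well do the predicted probabilities discriminate?
print(f'Test ROC AUC: {roc_auc_score(y_test, y_test_probs):0.3f}')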
Random forests#
# Read in data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']
# One-hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix='team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix='team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)
# Define model
model = RandomForestClassifier(
n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=42)
# Fit model
model.fit(X_train, y_train)
# Get predicted probabilities
y_train_probs = model.predict_proba(X_train)[:,1]
y_test_probs = model.predict_proba(X_test)[:,1]
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.842
probability_train['random_forest'] = y_train_probs
probability_test['random_forest'] = y_test_probs
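The perfect training accuracy above is typical of random forests, whose trees can memorise the training data, so the training-set probabilities are optimistic. As an aside, one way to obtain less optimistic training probabilities is out-of-bag estimation, where each row is scored only by trees that did not train on it; a sketch:
# Aside: out-of-bag training probabilities (rows that were never left out
# of bag, which is rare with 100 trees, can come back as NaN)
model_oob = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, class_weight='balanced',
    oob_score=True, random_state=42)
model_oob.fit(X_train, y_train)
y_train_probs_oob = model_oob.oob_decision_function_[:, 1]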
Neural nets (with embedding)#
Define net#
def make_net(number_features_patient,
number_features_pathway,
number_features_hospital,
patient_encoding_neurones=1,
pathway_encoding_neurones=1,
hospital_encoding_neurones=1,
expansion=2,
learning_rate=0.003,
dropout=0.5):
# Clear Tensorflow
K.clear_session()
# Patient (clinical data) encoding layers
    input_patient = layers.Input(shape=(number_features_patient,))
# Patient dense layer 1
patient_dense_1 = layers.Dense(
number_features_patient * expansion, activation='relu')(input_patient)
patient_norm_1 = layers.BatchNormalization()(patient_dense_1)
patient_dropout_1 = layers.Dropout(dropout)(patient_norm_1)
# Patient encoding layer
patient_encoding = layers.Dense(
patient_encoding_neurones, activation='sigmoid',
name='patient_encode')(patient_dropout_1)
# Pathway encoding layers
    input_pathway = layers.Input(shape=(number_features_pathway,))
# pathway dense layer 1
pathway_dense_1 = layers.Dense(
number_features_pathway * expansion, activation='relu')(input_pathway)
pathway_norm_1 = layers.BatchNormalization()(pathway_dense_1)
pathway_dropout_1 = layers.Dropout(dropout)(pathway_norm_1)
# pathway encoding layer
pathway_encoding = layers.Dense(
pathway_encoding_neurones, activation='sigmoid',
name='pathway_encode')(pathway_dropout_1)
# hospital encoding layers
    input_hospital = layers.Input(shape=(number_features_hospital,))
# hospital encoding layer
hospital_encoding = layers.Dense(
hospital_encoding_neurones, activation='sigmoid',
name='hospital_encode')(input_hospital)
# Concatenation layer
concat = layers.Concatenate()(
[patient_encoding, pathway_encoding, hospital_encoding])
    # Output (single sigmoid)
outputs = layers.Dense(1, activation='sigmoid')(concat)
# Build net
net = Model(inputs=[
input_patient, input_pathway, input_hospital], outputs=outputs)
    # Compile model
    opt = Adam(learning_rate=learning_rate)
net.compile(loss='binary_crossentropy',
optimizer=opt,
metrics=['accuracy'])
return net
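The hospital "embedding" here is a single-neuron dense sigmoid layer applied to the one-hot hospital vector; a dense layer on a one-hot input acts as a learned lookup, compressing each hospital to one value. A usage sketch, with illustrative feature counts, to inspect the resulting architecture:
# Sketch: build a small net and print its structure (feature counts are
# illustrative, not the real data dimensions)
demo_net = make_net(number_features_patient=10,
                    number_features_pathway=5,
                    number_features_hospital=3)
demo_net.summary()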
Fit model#
# Get data subgroups
subgroups = pd.read_csv('../data/subnet.csv', index_col='Item')
# Get list of clinical items
clinical_subgroup = subgroups.loc[subgroups['Subnet']=='clinical']
clinical_subgroup = list(clinical_subgroup.index)
# Get list of pathway items
pathway_subgroup = subgroups.loc[subgroups['Subnet']=='pathway']
pathway_subgroup = list(pathway_subgroup.index)
# Get list of hospital items
hospital_subgroup = subgroups.loc[subgroups['Subnet']=='hospital']
hospital_subgroup = list(hospital_subgroup.index)
# Read in data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
# One-hot encode stroke team
coded = pd.get_dummies(train['StrokeTeam'])
train = pd.concat([train, coded], axis=1)
train.drop('StrokeTeam', inplace=True, axis=1)
coded = pd.get_dummies(test['StrokeTeam'])
test = pd.concat([test, coded], axis=1)
test.drop('StrokeTeam', inplace=True, axis=1)
# Split into X and y
X_train_df = train.drop('S2Thrombolysis', axis=1)
y_train_df = train['S2Thrombolysis']
X_test_df = test.drop('S2Thrombolysis', axis=1)
y_test_df = test['S2Thrombolysis']
# Split train and test data by subgroups
X_train_patients = X_train_df[clinical_subgroup]
X_test_patients = X_test_df[clinical_subgroup]
X_train_pathway = X_train_df[pathway_subgroup]
X_test_pathway = X_test_df[pathway_subgroup]
X_train_hospitals = X_train_df[hospital_subgroup]
X_test_hospitals = X_test_df[hospital_subgroup]
# Convert to NumPy
X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values
y_test = y_test_df.values
# Scale data
X_train_patients_sc, X_test_patients_sc = \
scale_data(X_train_patients, X_test_patients)
X_train_pathway_sc, X_test_pathway_sc = \
scale_data(X_train_pathway, X_test_pathway)
X_train_hospitals_sc, X_test_hospitals_sc = \
scale_data(X_train_hospitals, X_test_hospitals)
# Define network
number_features_patient = X_train_patients_sc.shape[1]
number_features_pathway = X_train_pathway_sc.shape[1]
number_features_hospital = X_train_hospitals_sc.shape[1]
model = make_net(
number_features_patient,
number_features_pathway,
number_features_hospital)
# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
'model_checkpoint_1d.h5', save_best_only=True)
# Define early stopping callback: stop training when validation loss has
# not improved for 50 epochs, and restore weights from the best epoch
early_stopping_cb = keras.callbacks.EarlyStopping(
patience=50, restore_best_weights=True)
# Train model
history = model.fit(
[X_train_patients_sc, X_train_pathway_sc, X_train_hospitals_sc],
y_train,
epochs=5000,
batch_size=32,
validation_data=(
[X_test_patients_sc, X_test_pathway_sc, X_test_hospitals_sc],
y_test),
verbose=0,
callbacks=[checkpoint_cb, early_stopping_cb])
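The recorded history can be plotted to see how training progressed (a sketch; the key names 'accuracy' and 'val_accuracy' assume TensorFlow 2.x defaults for metrics=['accuracy']):
# Optional: plot training and validation accuracy over epochs
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='test')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()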
# Get predicted probabilities
y_train_probs = model.predict(
[X_train_patients_sc, X_train_pathway_sc, X_train_hospitals_sc])
y_test_probs = model.predict(
[X_test_patients_sc, X_test_pathway_sc, X_test_hospitals_sc])
# Flatten arrays
y_train_probs = y_train_probs.flatten()
y_test_probs = y_test_probs.flatten()
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.859
Test accuracy: 0.850
probability_train['neural_net'] = y_train_probs
probability_test['neural_net'] = y_test_probs
Add actual use of thrombolysis#
probability_train['actual'] = train['S2Thrombolysis']
probability_test['actual'] = test['S2Thrombolysis']
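Before building ensembles it can be useful to check how correlated the individual model outputs are, since highly correlated models add less ensemble value. A quick optional check:
# Optional: pairwise correlation between the three models' probabilities
model_cols = ['logistic_regression', 'random_forest', 'neural_net']
print(probability_test[model_cols].corr().round(3))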
Save results#
probability_train.to_csv(
    './individual_model_output/probabilities_train.csv', index=False)
probability_test.to_csv(
'./individual_model_output/probabilities_test.csv', index=False)