Ensemble model fitting - generate probabilities#

Aims:

  • To generate probability outputs for training and test data using logistic regression, random forests, and a neural network.

Ensemble modelling combines the outputs of multiple models in an attempt to increase accuracy. In this notebook we use the 10k-cohort training and test sets. Individual models are built using:

  • Random Forests (with one-hot encoding of hospital)

  • Logistic Regression (with one-hot encoding of hospital)

  • Neural Net (with an embedding layer to encode the hospital)

This notebook runs fits for each type of model and returns the predicted probability of receiving thrombolysis for each model.

In subsequent notebooks, these individual model outputs will be used as inputs for ensemble models; a minimal sketch of one such combination is shown below.
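
For orientation only, here is one way a later notebook could combine the saved probability columns: simple averaging followed by thresholding. This is an illustrative sketch, assuming the column names created at the end of this notebook; it is not the actual ensemble method used later.

# Illustrative only: average the three per-model probabilities
# and classify at the 0.5 threshold
import pandas as pd

def mean_ensemble(probabilities):
    """Combine per-model probabilities by simple averaging."""
    model_cols = ['logistic_regression', 'random_forest', 'neural_net']
    return probabilities[model_cols].mean(axis=1) >= 0.5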

Import libraries#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy

Set up dataframes for probability outputs#

probability_train = pd.DataFrame()
probability_test = pd.DataFrame()

Functions#

Standardise data#

def standardise_data(X_train, X_test):
    """
    Converts all data to a similar scale.
    Standardisation subtracts the mean and divides by the standard deviation
    for each feature.
    Standardised data will have a mean of 0 and a standard deviation of 1.
    The training data mean and standard deviation are used to standardise
    both the training and test set data.
    """

    # Initialise a new scaling object for standardising input data
    sc = StandardScaler()

    # Fit the scaler on the training set only
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_std = sc.transform(X_train)
    test_std = sc.transform(X_test)

    return train_std, test_std
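
A quick check of the intended behaviour on toy data (hypothetical values, for illustration only). Because the scaler is fitted on the training set alone, the transformed test data need not have a mean of 0 and a standard deviation of 1:

# Toy data: the test set is transformed with the *training* mean/std
toy_train = np.array([[1.0], [2.0], [3.0]])
toy_test = np.array([[4.0], [5.0]])
train_std, test_std = standardise_data(toy_train, toy_test)
print(train_std.mean(), train_std.std())  # ~0.0 and 1.0
print(test_std.mean())                    # > 0: test data lie above the training mean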

MinMax scaling#

def scale_data(X_train, X_test):
    """Scale data 0-1 based on min and max in training set"""
    
    # Initialise a new scaling object for normalising input data
    sc = MinMaxScaler()

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_sc = sc.transform(X_train)
    test_sc = sc.transform(X_test)
    
    return train_sc, test_sc    
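
The same caveat applies here (again with toy data for illustration): because the scaler is fitted on the training set only, scaled test values can fall outside the 0-1 range:

# Toy data: a test value above the training maximum scales to > 1
toy_train = np.array([[0.0], [10.0]])
toy_test = np.array([[15.0]])
train_sc, test_sc = scale_data(toy_train, toy_test)
print(train_sc.min(), train_sc.max())  # 0.0 1.0
print(test_sc)                         # [[1.5]]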

Logistic regression model#

# Read in data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')

# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix='team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix='team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Standardise X data
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and Fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train_std)[:,1]
y_test_probs = model.predict_proba(X_test_std)[:,1]

# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.834
Test accuracy: 0.832
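
Note that pd.get_dummies derives its columns from the values present in each DataFrame, so the code above relies on the train and test cohorts containing the same set of hospitals. If that could not be guaranteed, a defensive step (hypothetical, not required for this data) would align the test columns to the training columns:

# Hypothetical guard: align test dummy columns to the training columns,
# filling any hospital missing from the test set with zeros
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)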

Add to probability results#

probability_train['logistic_regression'] = y_train_probs
probability_test['logistic_regression'] = y_test_probs

Random forests#

# Read in data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')

# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix='team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix='team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Define model
model = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=42)

# Fit model
model.fit(X_train, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train)[:,1]
y_test_probs = model.predict_proba(X_test)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.842
probability_train['random_forest'] = y_train_probs
probability_test['random_forest'] = y_test_probs
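
The training accuracy of 1.000 is expected rather than alarming: individual trees in a random forest can memorise the training data, so training accuracy tells us little here. For an internal performance estimate that does not touch the test set, the forest's out-of-bag score is one option (a sketch, not part of the original pipeline):

# Sketch: out-of-bag accuracy scores each sample using only the trees
# that did not see it during bootstrap sampling
oob_model = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, class_weight='balanced',
    oob_score=True, random_state=42)
oob_model.fit(X_train, y_train)
print(f'OOB accuracy: {oob_model.oob_score_:0.3f}')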

Neural nets (with embedding)#

Define net#

def make_net(number_features_patient,
             number_features_pathway,
             number_features_hospital, 
             patient_encoding_neurones=1,
             pathway_encoding_neurones=1,
             hospital_encoding_neurones=1,
             expansion=2, 
             learning_rate=0.003, 
             dropout=0.5):
    
    # Clear TensorFlow session
    K.clear_session()
    
    # Patient (clinical data) encoding layers
    input_patient = layers.Input(shape=(number_features_patient,))
    
    # Patient dense layer 1
    patient_dense_1 = layers.Dense(
        number_features_patient * expansion, activation='relu')(input_patient)
    patient_norm_1 = layers.BatchNormalization()(patient_dense_1)
    patient_dropout_1 = layers.Dropout(dropout)(patient_norm_1)
    
    # Patient encoding layer
    patient_encoding = layers.Dense(
        patient_encoding_neurones, activation='sigmoid', 
        name='patient_encode')(patient_dropout_1)
    
    
    # Pathway encoding layers
    input_pathway = layers.Input(shape=(number_features_pathway,))
    
    # pathway dense layer 1
    pathway_dense_1 = layers.Dense(
        number_features_pathway * expansion, activation='relu')(input_pathway)
    pathway_norm_1 = layers.BatchNormalization()(pathway_dense_1)
    pathway_dropout_1 = layers.Dropout(dropout)(pathway_norm_1)
    
    # pathway encoding layer
    pathway_encoding = layers.Dense(
        pathway_encoding_neurones, activation='sigmoid', 
        name='pathway_encode')(pathway_dropout_1)
    
    
    # hospital encoding layers
    input_hospital = layers.Input(shape=(number_features_hospital,))
    
    # hospital encoding layer
    hospital_encoding = layers.Dense(
        hospital_encoding_neurones, activation='sigmoid', 
        name='hospital_encode')(input_hospital)    
    
    # Concatenation layer
    concat = layers.Concatenate()(
        [patient_encoding, pathway_encoding, hospital_encoding])
    
    # Output (single sigmoid)
    outputs = layers.Dense(1, activation='sigmoid')(concat)
    
    # Build net
    net = Model(inputs=[
        input_patient, input_pathway, input_hospital], outputs=outputs)
    
    # Compile model
    opt = Adam(learning_rate=learning_rate)
    net.compile(loss='binary_crossentropy',
                optimizer=opt,
                metrics=['accuracy'])

    return net
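
A quick sanity check of the architecture (illustrative feature counts only, chosen here just to inspect the layer graph):

# Instantiate the net with made-up feature counts and print a summary
demo_net = make_net(number_features_patient=10,
                    number_features_pathway=5,
                    number_features_hospital=100)
demo_net.summary()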

Fit model#

# Get data subgroups
subgroups = pd.read_csv('../data/subnet.csv', index_col='Item')
# Get list of clinical items
clinical_subgroup = subgroups.loc[subgroups['Subnet']=='clinical']
clinical_subgroup = list(clinical_subgroup.index)
# Get list of pathway items
pathway_subgroup = subgroups.loc[subgroups['Subnet']=='pathway']
pathway_subgroup = list(pathway_subgroup.index)
# Get list of hospital items
hospital_subgroup = subgroups.loc[subgroups['Subnet']=='hospital']
hospital_subgroup = list(hospital_subgroup.index)
# Read in data
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')

# OneHot encode stroke team
coded = pd.get_dummies(train['StrokeTeam'])
train = pd.concat([train, coded], axis=1)
train.drop('StrokeTeam', inplace=True, axis=1)
coded = pd.get_dummies(test['StrokeTeam'])
test = pd.concat([test, coded], axis=1)
test.drop('StrokeTeam', inplace=True, axis=1)

# Split into X, y
X_train_df = train.drop('S2Thrombolysis', axis=1)
y_train_df = train['S2Thrombolysis']
X_test_df = test.drop('S2Thrombolysis', axis=1)
y_test_df = test['S2Thrombolysis']

# Split train and test data by subgroups
X_train_patients = X_train_df[clinical_subgroup]
X_test_patients = X_test_df[clinical_subgroup]
X_train_pathway = X_train_df[pathway_subgroup]
X_test_pathway = X_test_df[pathway_subgroup]
X_train_hospitals = X_train_df[hospital_subgroup]
X_test_hospitals = X_test_df[hospital_subgroup]

# Convert to NumPy
X_train = X_train_df.values
X_test = X_test_df.values
y_train = y_train_df.values
y_test = y_test_df.values

# Scale data
X_train_patients_sc, X_test_patients_sc = \
    scale_data(X_train_patients, X_test_patients)

X_train_pathway_sc, X_test_pathway_sc = \
    scale_data(X_train_pathway, X_test_pathway)

X_train_hospitals_sc, X_test_hospitals_sc = \
    scale_data(X_train_hospitals, X_test_hospitals)

# Define network
number_features_patient = X_train_patients_sc.shape[1]
number_features_pathway = X_train_pathway_sc.shape[1]
number_features_hospital = X_train_hospitals_sc.shape[1]

model = make_net(
    number_features_patient, 
    number_features_pathway, 
    number_features_hospital)

# Define save checkpoint callback (only save if new best validation results)
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'model_checkpoint_1d.h5', save_best_only=True)

# Define early stopping callback: stop when validation loss stops improving,
# and restore weights from the epoch with the best validation loss
early_stopping_cb = keras.callbacks.EarlyStopping(
    patience=50, restore_best_weights=True)

# Train model
history = model.fit(
    [X_train_patients_sc, X_train_pathway_sc, X_train_hospitals_sc],
    y_train,
    epochs=5000,
    batch_size=32,
    validation_data=(
        [X_test_patients_sc, X_test_pathway_sc, X_test_hospitals_sc], 
        y_test),
    verbose=0,
    callbacks=[checkpoint_cb, early_stopping_cb])

# Get predicted probabilities
y_train_probs = model.predict(
        [X_train_patients_sc, X_train_pathway_sc, X_train_hospitals_sc])
y_test_probs = model.predict(
        [X_test_patients_sc, X_test_pathway_sc, X_test_hospitals_sc])

# Flatten arrays
y_train_probs = y_train_probs.flatten()
y_test_probs = y_test_probs.flatten()

# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.859
Test accuracy: 0.850
probability_train['neural_net'] = y_train_probs
probability_test['neural_net'] = y_test_probs
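
With training complete, the matplotlib import at the top of the notebook can be put to use to inspect the fit (a minimal sketch):

# Plot training vs validation loss from the Keras history object
plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Binary cross-entropy loss')
plt.legend()
plt.show()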

Add actual use of thrombolysis#

probability_train['actual'] = train['S2Thrombolysis']
probability_test['actual'] = test['S2Thrombolysis']
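
A quick look at the assembled outputs confirms the four columns are in place:

# Inspect the first few rows of the combined probability outputs
probability_train.head()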

Save results#
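
to_csv does not create missing folders, so the os import above can first be used to make sure the output folder exists:

# Create the output folder if it does not already exist
os.makedirs('./individual_model_output', exist_ok=True)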

probability_train.to_csv(
    './individual_model_output/probabilities_train.csv', index=False)
probability_test.to_csv(
    './individual_model_output/probabilities_test.csv', index=False)