Ensemble combine with logistic regression#

Aim: To combine output from logistic regression, random forests, and neural network models in a single logistic regression model, with or without original features.

Import libraries#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Load data#

train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv')
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')

Function to standardise data#

def standardise_data(X_train, X_test):
    """
    Converts all data to a similar scale.
    Standardisation subtracts mean and divides by standard deviation
    for each feature.
    Standardised data will have a mena of 0 and standard deviation of 1.
    The training data mean and standard deviation is used to standardise both
    training and test set data.
    """
    
    # Initialise a new scaling object for normalising input data
    sc = StandardScaler() 

    # Set up the scaler just on the training set
    sc.fit(X_train)

    # Apply the scaler to the training and test sets
    train_std=sc.transform(X_train)
    test_std=sc.transform(X_test)
    
    return train_std, test_std

Fit a model using original training data only#

# Set up train and test data
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Standardise X data
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and Fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train_std)[:,1]
y_test_probs = model.predict_proba(X_test_std)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.834
Test accuracy: 0.832

Fit a model using model probabilities only#

# Set up train and test data

X_train = model_probs_train
X_test = model_probs_test
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# Standardise X data
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and Fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train_std)[:,1]
y_test_probs = model.predict_proba(X_test_std)[:,1]

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train)[:,1]
y_test_probs = model.predict_proba(X_test)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.836

Fit a model using original features and model probabilities#

# Set up train and test data
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# Get X and y
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Add in model probabilties
X_train = pd.concat([X_train, model_probs_train], axis=1)
X_test = pd.concat([X_test, model_probs_test], axis=1)

# Standardise X data
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and Fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train_std)[:,1]
y_test_probs = model.predict_proba(X_test_std)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.839

Observations#

  • Including in model probability outputs from previous logistic regression, random forests, and neural networks, did not improve accuracy of the model compared to fitting a logistic regression model just on the original data.