Ensemble combine with logistic regression
Contents
Ensemble combine with logistic regression#
Aim: To combine output from logistic regression, random forests, and neural network models in a single logistic regression model, with or without original features.
Import libraries#
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
Load data#
# Patient-level training and test cohorts
train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')

# Probabilities saved from the individual models (train and test files)
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv'
)
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv'
)
Function to standardise data#
def standardise_data(X_train, X_test):
    """
    Put training and test features on a common scale.

    Each feature has the training-set mean subtracted and is then divided by
    the training-set standard deviation, so the standardised training data
    has a mean of 0 and a standard deviation of 1 for each feature.
    The test set is transformed with the same training-set statistics, so no
    information from the test set is used when scaling.

    Returns the standardised training data followed by the standardised
    test data.
    """
    # Learn the scaling parameters from the training set only
    scaler = StandardScaler()
    scaler.fit(X_train)

    # Apply the fitted scaler to both the training and test sets
    return scaler.transform(X_train), scaler.transform(X_test)
Fit a model using original training data only#
# Split the cohorts into features (X) and the 'S2Thrombolysis' label (y)
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix='team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix='team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Train and test are encoded separately, so a hospital that appears in only
# one of the two sets would give the frames different columns. Align the test
# columns to the training columns: teams missing from the test set become
# all-zero columns, and teams never seen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Standardise X data (scaler is fitted on the training set only)
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities of the positive class (column 1)
y_train_probs = model.predict_proba(X_train_std)[:, 1]
y_test_probs = model.predict_proba(X_test_std)[:, 1]

# Show accuracy, classifying as positive when probability >= 0.5
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 0.834
Test accuracy: 0.832
Fit a model using model probabilities only#
# Set up train and test data: the only features are the probabilities
# produced by the individual models
X_train = model_probs_train
X_test = model_probs_test
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# Standardise X data (scaler is fitted on the training set only)
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities of the positive class (column 1).
# The model was fitted on standardised data, so predictions must also use the
# standardised data; predicting from the raw X_train/X_test would feed the
# model inputs on a different scale from the one it was trained on.
y_train_probs = model.predict_proba(X_train_std)[:, 1]
y_test_probs = model.predict_proba(X_test_std)[:, 1]

# Show accuracy, classifying as positive when probability >= 0.5
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.836
Fit a model using original features and model probabilities#
# Split the cohorts into features (X) and the 'S2Thrombolysis' label (y)
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix='team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix='team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Train and test are encoded separately, so a hospital that appears in only
# one of the two sets would give the frames different columns. Align the test
# columns to the training columns: teams missing from the test set become
# all-zero columns, and teams never seen in training are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Add in model probabilities.
# pd.concat with axis=1 joins on the row index; this assumes the probability
# files list patients in the same row order as the cohort files - TODO confirm
X_train = pd.concat([X_train, model_probs_train], axis=1)
X_test = pd.concat([X_test, model_probs_test], axis=1)

# Standardise X data (scaler is fitted on the training set only)
X_train_std, X_test_std = standardise_data(X_train, X_test)

# Define and fit model
model = LogisticRegression(solver='lbfgs', random_state=42)
model.fit(X_train_std, y_train)

# Get predicted probabilities of the positive class (column 1)
y_train_probs = model.predict_proba(X_train_std)[:, 1]
y_test_probs = model.predict_proba(X_test_std)[:, 1]

# Show accuracy, classifying as positive when probability >= 0.5
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print(f'Training accuracy: {accuracy_train:0.3f}')
print(f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.839
Observations#
Including the probability outputs from the previous logistic regression, random forest, and neural network models did not noticeably improve the test accuracy of the model compared with fitting a logistic regression model on the original data alone.