Ensemble combine with random forests#

Aim: To combine output from logistic regression, random forests, and neural network models in a single random forests model, with or without original features.

Import libraries#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

Load data#

train = pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv')
test = pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')
model_probs_train = pd.read_csv(
    './individual_model_output/probabilities_train.csv')
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')

Fit a model using original training data only#

# Set up train and test data
X_train = train.drop('S2Thrombolysis', axis=1)
X_test = test.drop('S2Thrombolysis', axis=1)
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Define model
model = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=42)

# Fit model
model.fit(X_train, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train)[:,1]
y_test_probs = model.predict_proba(X_test)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.842

Fit a model using model probabilities only#

# Set up train and test data
X_train = model_probs_train
X_test = model_probs_test
y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# Define model
model = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=42)

# Fit model
model.fit(X_train, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train)[:,1]
y_test_probs = model.predict_proba(X_test)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.838

Fit a model using original features and model probabilities#

X_train = train.drop('S2Thrombolysis', axis=1)
X_train = pd.concat([X_train, model_probs_train], axis=1)

X_test = test.drop('S2Thrombolysis', axis=1)
X_test = pd.concat([X_test, model_probs_test], axis=1)

y_train = train['S2Thrombolysis']
y_test = test['S2Thrombolysis']

# One hot encode hospitals
X_train_hosp = pd.get_dummies(X_train['StrokeTeam'], prefix = 'team')
X_train = pd.concat([X_train, X_train_hosp], axis=1)
X_train.drop('StrokeTeam', axis=1, inplace=True)
X_test_hosp = pd.get_dummies(X_test['StrokeTeam'], prefix = 'team')
X_test = pd.concat([X_test, X_test_hosp], axis=1)
X_test.drop('StrokeTeam', axis=1, inplace=True)

# Define model
model = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, class_weight='balanced', random_state=42)

# Fit model
model.fit(X_train, y_train)

# Get predicted probabilities
y_train_probs = model.predict_proba(X_train)[:,1]
y_test_probs = model.predict_proba(X_test)[:,1]
                                    
# Show accuracy
train_class = y_train_probs >= 0.5
test_class = y_test_probs >= 0.5
accuracy_train = np.mean(y_train == train_class)
accuracy_test = np.mean(y_test == test_class)
print (f'Training accuracy: {accuracy_train:0.3f}')
print (f'Test accuracy: {accuracy_test:0.3f}')
Training accuracy: 1.000
Test accuracy: 0.842

Observations#

  • Including in model probability outputs from previous logistic regression, random forests, and neural networks, did not improve accuracy of the model compared to fitting a random forests model just on the original data.