Comparison of model outputs#
Aims:
- Compare probability predictions between logistic regression, random forests, and neural networks 
- Compare classifications between logistic regression, random forests, and neural networks 
- Compare classifications between logistic regression, random forests, neural networks, and the actual classification (whether thrombolysis was given).
Import libraries#
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
Load data#
model_probs_test = pd.read_csv(
    './individual_model_output/probabilities_test.csv')
Define the cutoff for classification#
cut_off = 0.5
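As a quick illustration (using made-up probabilities, not the model outputs), any probability at or above the cutoff is classified as True:
pd.Series([0.2, 0.5, 0.8]) >= cut_off   # -> False, True, True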
Define a function to set the colour for each point on the plot#
def set_point_colour(s1, s2, cut_off):
    """
    s1 and s2 are series containing the output of two models for a number of
    instances; cut_off is a float used to classify each instance.
    Returns a series with a colour per instance, showing whether the two
    models agree in their classification.
    """
    # classify each instance for each model
    mask1 = s1 >= cut_off
    mask2 = s2 >= cut_off
    # True where the two classifications match
    mask3 = mask1 == mask2
    
    # initialise series with size the number of instances
    c = s1.copy(deep=True)
    # initialise with value 'r' (models disagree)
    c[:] = 'r'
    # value 'g' for those that match classification (models agree)
    c[mask3] = 'g'
    
    return c
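A quick check of the function on made-up values (hypothetical probabilities, not taken from the data):
# both below the cutoff (agree); only s1 above (disagree); both above (agree)
set_point_colour(pd.Series([0.2, 0.6, 0.9]),
                 pd.Series([0.1, 0.4, 0.7]),
                 0.5)
# -> 'g', 'r', 'g'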
Compare probabilities#
Colour the points depending on whether the two models agree in their prediction.
fig = plt.figure(figsize=(15,5))
# Random Forests vs. Logistic Regression
ax1 = fig.add_subplot(131)
# r-squared: square of the Pearson correlation between the two model outputs
r_square = np.corrcoef(model_probs_test['logistic_regression'],
                       model_probs_test['random_forest'])[0][1] ** 2
c = set_point_colour(model_probs_test['logistic_regression'],
                     model_probs_test['random_forest'],
                     cut_off)
ax1.scatter(model_probs_test['logistic_regression'],
            model_probs_test['random_forest'],
            alpha=0.5, s=2, color = c)
ax1.set_xlabel('logistic regression')
ax1.set_ylabel('random forests')
ax1.set_title('Random Forests vs. Logistic Regression')
txt = f'R-squared: {r_square:0.3f}'
ax1.text(0.02, 0.95, txt)
# Neural network vs. Logistic Regression
ax2 = fig.add_subplot(132)
r_square = np.corrcoef(model_probs_test['logistic_regression'],
                       model_probs_test['neural_net'])[0][1] ** 2
c = set_point_colour(model_probs_test['logistic_regression'],
                     model_probs_test['neural_net'],
                     cut_off)
ax2.scatter(model_probs_test['logistic_regression'],
            model_probs_test['neural_net'],
            alpha=0.5, s=2, color = c)
ax2.set_xlabel('logistic regression')
ax2.set_ylabel('neural network')
ax2.set_title('Neural Network vs. Logistic Regression')
txt = f'R-squared: {r_square:0.3f}'
ax2.text(0.02, 0.95, txt)
# Neural Network vs. Random Forests
ax3 = fig.add_subplot(133)
r_square = np.corrcoef(model_probs_test['random_forest'],
                       model_probs_test['neural_net'])[0][1] ** 2
c = set_point_colour(model_probs_test['random_forest'],
                     model_probs_test['neural_net'],
                     cut_off)
ax3.scatter(model_probs_test['random_forest'],
            model_probs_test['neural_net'],
            alpha=0.5, s=2, color=c)
ax3.set_xlabel('random forests')
ax3.set_ylabel('neural network')
ax3.set_title('Neural Network vs. Random Forests')
txt = f'R-squared: {r_square:0.3f}'
ax3.text(0.02, 0.95, txt)
plt.tight_layout(pad=2)
plt.savefig('./ensemble_output/model_fits_scatter.png', dpi=300)
plt.show()
Compare classifications#
# apply the cutoff to every column (0/1 in 'actual' maps to False/True)
classification = model_probs_test >= cut_off
Check agreement between logistic regression and random forests
agree = classification['logistic_regression'] == classification['random_forest']
# the mean of a boolean series is the proportion of True values
np.mean(agree)
0.8925
Check agreement between logistic regression and neural network
agree = classification['logistic_regression'] == classification['neural_net']
np.mean(agree)
0.9136
Check agreement between random forests and neural network
agree = classification['random_forest'] == classification['neural_net']
np.mean(agree)
0.9269
Check agreement between all three model types
agree = (classification['logistic_regression'] == classification['random_forest']) & \
    (classification['logistic_regression'] == classification['neural_net'])
np.mean(agree)
0.8665
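The pairwise comparisons above can also be generated in a loop. A minimal sketch (assuming the classification dataframe defined above), using itertools.combinations:
from itertools import combinations

model_cols = ['logistic_regression', 'random_forest', 'neural_net']
for col1, col2 in combinations(model_cols, 2):
    agreement = np.mean(classification[col1] == classification[col2])
    print(f'{col1} vs {col2}: {agreement:0.4f}')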
Plot confusion matrix#
Create a function that returns the classification for each column, based on the cutoff used for classification
def create_df_of_predictions(model_probs_test, cols, cut_off):
    """
    Given a dataframe with columns cols, return a dataframe that classifies
    each instance depending on whether the value exceeds the cutoff.
    """
    df = pd.DataFrame()
    for c in cols:
        df[c] = model_probs_test[c] >= cut_off
    return df
def confusion_matrix(df: pd.DataFrame, col1: str, col2: str):
    """
    Given a dataframe with at least two categorical columns, create a
    confusion matrix of the cross-counts of the two columns.
    
    Use like:
    
    >>> confusion_matrix(test_df, 'actual_label', 'predicted_label')
    
    Adapted from:
    https://gist.github.com/Mlawrence95/f697aa939592fa3ef465c05821e1deed
    """
    return (
            df
            .groupby([col1, col2])
            .size()
            .unstack(fill_value=0)
            )
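As an example, on a small made-up dataframe (hypothetical labels, not the study data), the function cross-counts the two columns:
toy = pd.DataFrame({'a': [True, True, False, False],
                    'b': [True, False, False, False]})
confusion_matrix(toy, 'a', 'b')
# b      False  True
# a
# False      2     0
# True       1     1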
def get_min_max_cm(df_prediction_all, model_pairings):
    """
    For each model pairing, create the confusion matrix, and return the
    minimum and maximum values across all of the confusion matrices.
    """
    # no cell count can exceed the number of instances, or fall below 0
    v_min = df_prediction_all.shape[0]
    v_max = 0
    for pairing in model_pairings:
        cm = confusion_matrix(df_prediction_all, pairing[0], pairing[1])
        v_min = min(v_min, cm.min().min())
        v_max = max(v_max, cm.max().max())
    
    return v_min, v_max
For each model, create the prediction based on the cut_off for classification
df_prediction = create_df_of_predictions(
                        model_probs_test,
                        ["random_forest","neural_net", "logistic_regression"],
                        cut_off)
For each model pairing, calculate the confusion matrix, and store the min and max values across all of them. This ensures that all of the subplots use the same colour range.
model_pairings = [["logistic_regression","random_forest"],
                  ["logistic_regression","neural_net"],
                  ["random_forest","neural_net"]]
v_min, v_max = get_min_max_cm(df_prediction, model_pairings)
Create a figure with 3 heatmaps using subplots (a column for each subplot). Invert the y axis so that it has the same order as the scatter plots above (i.e. True on top).
plt.figure(figsize=(15,5))
# set label size
sn.set(font_scale=1.3)
c_bar = False
for i in range(len(model_pairings)):
    plt.subplot(1,3,i+1)
    cm = confusion_matrix(df_prediction, 
                          model_pairings[i][0], 
                          model_pairings[i][1])
    # optionally show the colour bar on the last plot (disabled here)
    # if i == (len(model_pairings) - 1):
    #     c_bar = True
    
    # plot the heatmap
    sn.heatmap(cm, annot=True, cbar=c_bar, cmap = "turbo",
                    linecolor="w", linewidths=2, fmt='g',
                    vmin=v_min, vmax=v_max)
    
    # invert the y axis
    plt.gca().invert_yaxis()
    
plt.tight_layout(pad=2)
plt.savefig('./ensemble_output/confusion_matrix.png', dpi=300)
plt.show()
Confusion matrix for models vs actual#
Add the column of data “actual” to the df_prediction dataframe
df_prediction["actual"] = model_probs_test["actual"]
df_prediction["actual"].replace({1: True, 0: False}, inplace=True)
df_prediction.head()
|   | random_forest | neural_net | logistic_regression | actual |
|---|---|---|---|---|
| 0 | False | False | False | False | 
| 1 | True | False | False | False | 
| 2 | False | False | False | False | 
| 3 | True | True | False | False | 
| 4 | False | False | False | False | 
For each model pairing, calculate the confusion matrix, and store the min and max values across all of them. This ensures that all of the subplots use the same colour range.
model_pairings = [["logistic_regression","actual"],
                  ["random_forest","actual"],
                  ["neural_net", "actual"]]
v_min, v_max = get_min_max_cm(df_prediction, model_pairings)
Create a figure with 3 heatmaps using subplots (a column for each subplot). Invert the y axis so that it has the same order as the scatter plots above (i.e. True on top).
plt.figure(figsize=(15,5))
# set label size
sn.set(font_scale=1.3)
c_bar = False
for i in range(len(model_pairings)):
    plt.subplot(1,3,i+1)
    cm = confusion_matrix(df_prediction, 
                          model_pairings[i][0], 
                          model_pairings[i][1])
    # optionally show the colour bar on the last plot (disabled here)
    # if i == (len(model_pairings) - 1):
    #     c_bar = True
    
    # plot the heatmap
    sn.heatmap(cm, annot=True, cbar=c_bar, cmap = "turbo",
                    linecolor="w", linewidths=2, fmt='g',
                    vmin=v_min, vmax=v_max)
    
    # invert the y axis
    plt.gca().invert_yaxis()
    
plt.tight_layout(pad=2)
plt.savefig('./ensemble_output/confusion_matrix_actual.png', dpi=300)
plt.show()
Observations#
- There is high agreement in prediction between the different model types, with random forests and neural networks having the highest agreement.
- There is greater agreement between the different model types than between any of the models and the actual classification (whether thrombolysis was given).
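The second observation can be quantified by repeating the agreement calculation against the actual treatment decision; a minimal sketch, assuming the df_prediction dataframe built above:
for model in ['logistic_regression', 'random_forest', 'neural_net']:
    accuracy = np.mean(df_prediction[model] == df_prediction['actual'])
    print(f'{model} vs actual: {accuracy:0.4f}')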