# Comparison of model outputs

## Contents

# Comparison of model outputs#

Aims:

Compare probability predictions between logistic regression, random forests, and neural networks

Compare classifications between logistic regression, random forests, and neural networks

Compare classifications between logistic regression, random forests neural networks, and actual classification (whether thrombolysis was given).

## Import libraries#

```
# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
```

## Load data#

```
model_probs_test = pd.read_csv(
'./individual_model_output/probabilities_test.csv')
```

## Define the cutoff for classification#

```
cut_off = 0.5
```

## Define function to set the colour for each point on the plot#

```
def set_point_colour(s1, s2, cut_off):
"""
s1 and s2 are series containing each model output for a number of instances
cut_off is a float to classify each instance
Returns series with a colour per instance to define whether there is a
match in the classification between the two models
"""
# the classifications
mask1 = s1 >= cut_off
mask2 = s2 >= cut_off
mask3 = mask1 == mask2
# initialise series with size the number of instances
c = s1.copy(deep=True)
# initialise with value 'r'
c[:] = 'r'
# value 'g' for those that match classification
c[mask3] = 'g'
return(c)
```

## Compare probabilties#

Colour the points depending on whether the two models agree in their prediction.

```
fig = plt.figure(figsize=(15,5))
# Random Forests vs. Logistic Regression
ax1 = fig.add_subplot(131)
r_square = np.corrcoef(model_probs_test['logistic_regression'],
model_probs_test['random_forest'])[0][1] ** 2
c = set_point_colour(model_probs_test['logistic_regression'],
model_probs_test['random_forest'],
cut_off)
ax1.scatter(model_probs_test['logistic_regression'],
model_probs_test['random_forest'],
alpha=0.5, s=2, color = c)
ax1.set_xlabel('logistic regression')
ax1.set_ylabel('random forests')
ax1.set_title('Random Forests vs. Logistic Regression')
txt = f'R-squared: {r_square:0.3f}'
ax1.text(0.02, 0.97, txt)
# Neural network vs. Logistic Regression
ax2 = fig.add_subplot(132)
r_square = np.corrcoef(model_probs_test['logistic_regression'],
model_probs_test['neural_net'])[0][1] ** 2
c = set_point_colour(model_probs_test['logistic_regression'],
model_probs_test['neural_net'],
cut_off)
ax2.scatter(model_probs_test['logistic_regression'],
model_probs_test['neural_net'],
alpha=0.5, s=2, color = c)
ax2.set_xlabel('logistic regression')
ax2.set_ylabel('neural network')
ax2.set_title('Neural Network vs. Logistic Regression')
txt = f'R-squared: {r_square:0.3f}'
ax2.text(0.02, 0.95, txt)
# Neural Network vs. Random Forests
ax3 = fig.add_subplot(133)
r_square = np.corrcoef(model_probs_test['random_forest'],
model_probs_test['neural_net'])[0][1] ** 2
c = set_point_colour(model_probs_test['neural_net'],
model_probs_test['random_forest'],
cut_off)
ax3.scatter(model_probs_test['random_forest'],
model_probs_test['neural_net'],
alpha=0.5, s=2, color = c)
ax3.set_xlabel('random forests')
ax3.set_ylabel('neural network')
ax3.set_title('Neural Network vs. Random Forests')
txt = f'R-squared: {r_square:0.3f}'
ax3.text(0.02, 0.95, txt)
plt.tight_layout(pad=2)
plt.savefig('./ensemble_output/model_fits_scatter.png', dpi=300)
plt.show()
```

## Compare classification#

```
classification = model_probs_test >= cut_off
```

Check agreement between logistic regression and random forests

```
agree = classification['logistic_regression'] == classification['random_forest']
np.mean(agree)
```

```
0.8925
```

Check agreement between logistic regression and neural network

```
agree = classification['logistic_regression'] == classification['neural_net']
np.mean(agree)
```

```
0.9136
```

Check agreement between random forests and neural network

```
agree = classification['random_forest'] == classification['neural_net']
np.mean(agree)
```

```
0.9269
```

Check agreement between all three model types

```
agree = (classification['logistic_regression'] == classification['random_forest']) & \
(classification['logistic_regression'] == classification['neural_net'])
np.mean(agree)
```

```
0.8665
```

## Plot confusion matrix#

Create function that returns the classification of each column based on the cutoff used for classification

```
def create_df_of_predictions(model_probs_test, cols, cut_off):
"""
Given a dataframe with columns cols, return a dataframe that calssifies each
instance depending on whether the value exceeds the cutoff
"""
df = pd.DataFrame()
for c in cols:
df[c] = model_probs_test[c] >= cut_off
return(df)
```

```
def confusion_matrix(df: pd.DataFrame, col1: str, col2: str):
"""
Given a dataframe with at least
two categorical columns, create a
confusion matrix of the count of the columns
cross-counts
use like:
>>> confusion_matrix(test_df, 'actual_label', 'predicted_label')
https://gist.github.com/Mlawrence95/f697aa939592fa3ef465c05821e1deed
"""
return (
df
.groupby([col1, col2])
.size()
.unstack(fill_value=0)
)
```

```
def get_min_max_cm(df_prediction_all, model_pairings):
"""
For each model pairings, create the confusion matrix and return the min and
max value across all confusion matrices.
"""
v_min = df_prediction_all.shape[0]
v_max = 0
for l in model_pairings:
cm = confusion_matrix(df_prediction_all, l[0], l[1])
v_min = min(v_min, cm.min().min())
v_max = max(v_max, cm.max().max())
return(v_min, v_max)
```

For each model, create the prediction based on the cut_off for classification

```
df_prediction = create_df_of_predictions(
model_probs_test,
["random_forest","neural_net", "logistic_regression"],
cut_off)
```

For each model pairing, calculate the confusion matrix and store the min and max values across all. Need this to ensure all of the subplots use the same range.

```
model_pairings = [["logistic_regression","random_forest"],
["logistic_regression","neural_net"],
["random_forest","neural_net"]]
v_min, v_max = get_min_max_cm(df_prediction, model_pairings)
```

Create a figure with 3 heatmaps using subplots (a column for each subplot) Invert the y axis so that it has the same order as the scatter plots above (i.e. True on top)

```
plt.figure(figsize=(15,5))
# set label size
sn.set(font_scale=1.3)
c_bar = False
for i in range(len(model_pairings)):
plt.subplot(1,3,i+1)
cm = confusion_matrix(df_prediction,
model_pairings[i][0],
model_pairings[i][1])
# for the last plot, show the color bar
#if i == (len(model_pairings)-1):
# c_bar = True
# plot the heatmap
sn.heatmap(cm, annot=True, cbar=c_bar, cmap = "turbo",
linecolor="w", linewidths=2, fmt='g',
vmin=v_min, vmax=v_max)
# invert the y axis
plt.gca().invert_yaxis()
plt.tight_layout(pad=2)
plt.savefig('./ensemble_output/confusion_matrix.png', dpi=300)
plt.show()
```

## Confusion matrix for models vs actual#

Add the column of data “actual” to the df_prediction dataframe

```
df_prediction["actual"] = model_probs_test["actual"]
df_prediction["actual"].replace({1: True, 0: False}, inplace=True)
df_prediction.head()
```

random_forest | neural_net | logistic_regression | actual | |
---|---|---|---|---|

0 | False | False | False | False |

1 | True | False | False | False |

2 | False | False | False | False |

3 | True | True | False | False |

4 | False | False | False | False |

For each model pairing, calculate the confusion matrix and store the min and max values across all. Need this to ensure all of the subplots use the same range.

```
model_pairings = [["logistic_regression","actual"],
["random_forest","actual"],
["neural_net", "actual"]]
v_min, v_max = get_min_max_cm(df_prediction, model_pairings)
```

Create a figure with 3 heatmaps using subplots (a column for each subplot). Invert the y axis so that it has the same order as the scatter plots above (i.e. True on top)

```
plt.figure(figsize=(15,5))
# set label size
sn.set(font_scale=1.3)
c_bar = False
for i in range(len(model_pairings)):
plt.subplot(1,3,i+1)
cm = confusion_matrix(df_prediction,
model_pairings[i][0],
model_pairings[i][1])
# for the last plot, show the color bar
#if i == (len(model_pairings)-1):
# c_bar = True
# plot the heatmap
sn.heatmap(cm, annot=True, cbar=c_bar, cmap = "turbo",
linecolor="w", linewidths=2, fmt='g',
vmin=v_min, vmax=v_max)
# invert the y axis
plt.gca().invert_yaxis()
plt.tight_layout(pad=2)
plt.savefig('./ensemble_output/confusion_matrix_actual.png', dpi=300)
plt.show()
```

## Observations#

There is high agreement in prediction between different model types, with random forests and neural networks having the highest agreement.

There is greater agreement between the different model types than between the models and reality.