How would thrombolysis use change if clinical decisions were made by hospitals with the highest current thrombolysis rate?#

Aims#

  • Investigate changes in thrombolysis use (for patients arriving within 4 hours of known onset) if the decision to thrombolyse is made by the 30 hospitals with the highest current thrombolysis rate.

  • Compare this to changes in thrombolysis use when the decision is made by a benchmark set of hospitals (those 30 hospitals which have the highest predicted thrombolysis use in a standard cohort of patients; see separate notebook)

Import libraries#

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle as pkl

import matplotlib.cm as cm
import matplotlib.colors as colors
from matplotlib.lines import Line2D

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

Load pre-trained hospital models into dictionary hospital2model#

keys = hospitals

values = trained_classifier, threshold, patients, outcomes

with open ('./models/trained_hospital_models.pkl', 'rb') as f:
    hospital2model = pkl.load(f)

Load data#

Create combined data set by combining cohort train/test

data = pd.concat([
    pd.read_csv('./../data/10k_training_test/cohort_10000_train.csv'),
    pd.read_csv('./../data/10k_training_test/cohort_10000_test.csv')],
    axis=0)

data = data.sample(frac=1.0, random_state=42)

hospitals = list(set(data['StrokeTeam'].values))

Find 30 hospitals with highest thrombolysis rate#

These are the hospitals with highest actual thrombolysis use.

hospitals = list(hospital2model.keys())
rates = []

for h in hospitals:
    
    patients = data.loc[data['StrokeTeam']==h]
    
    rate = sum(patients['S2Thrombolysis'].values)/len(patients)
    
    rates.append([h,rate])
rates = sorted(rates, key=lambda x: x[1], reverse=True)
sorted_hospitals, true_rates = zip(*rates)
top_hospitals = sorted_hospitals[:30]

Put patients from all other hospitals through top hospitals#

columns = np.concatenate((['Hospital','True', 'Majority'], top_hospitals)) 

results_top = pd.DataFrame(columns = columns)


for hospital in hospitals:
    
    patients = data.loc[data['StrokeTeam']==hospital]
    
    y = patients['S2Thrombolysis'].values
    X = patients.drop(['StrokeTeam', 'S2Thrombolysis'], axis=1)
    
    hospital_results = pd.DataFrame(columns = columns)           
        
    for top_hospital in top_hospitals:

        model, threshold, _, _ = hospital2model[top_hospital]
        
        y_prob = model.predict_proba(X)[:,1]
        
        y_pred = [1 if p >= threshold else 0 for p in y_prob]

        hospital_results[top_hospital] = y_pred

        
    hospital_results['Hospital'] = [hospital for person in y]
    hospital_results['True'] = y
    

    results_top = results_top.append(hospital_results, ignore_index=True)

Add majority outcome#

majority_threshold = 15/30

for index,row in results_top.iterrows():
    

    no = sum([1 for val in row[3:].values if val ==0])
    yes = sum(row[3:].values)
    
    
    if yes/(no+yes)>=majority_threshold:
        
        results_top.loc[index, 'Majority'] = 1
        
    else:
        
        results_top.loc[index, 'Majority'] = 0

Find new thrombolysis rates#

Find thrombolysis use rates for each hospital if the decision to thrombolyse for their patients was made by a majority vote of 30 hospitals with the current highest thrombolysis use.

# Get current and 'top 30 majority' vote thrombolysis use rates
top_30_results = (results_top.groupby('Hospital')['True', 'Majority'].sum() /
                  results_top.groupby('Hospital')['True', 'Majority'].count())

# Add in admissions
top_30_results['admissions'] = results_top.groupby('Hospital').count()['True']

# Add in whether hospital is in top 30
top_30_results['top_30'] = [x in top_hospitals for x in top_30_results.index ]

# Show DataFrame
top_30_results
True Majority admissions top_30
Hospital
AGNOF1041H 0.352064 0.433486 872 True
AKCGO9726K 0.369748 0.313725 1428 True
AOBTM3098N 0.218803 0.305983 585 False
APXEE8191H 0.226481 0.238676 574 False
ATDID5461S 0.239617 0.255591 313 False
... ... ... ... ...
YPKYH1768F 0.246057 0.312303 317 False
YQMZV4284N 0.236170 0.236170 470 False
ZBVSO0975W 0.250000 0.363426 432 False
ZHCLE1578P 0.223639 0.164966 1176 False
ZRRCV7012C 0.157447 0.259574 705 False

132 rows × 4 columns

Plot#

fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot()

# Plot non-benchmark hospitals in blue
mask = top_30_results['top_30'] == False
non_bench = top_30_results[mask]

for i, val in non_bench.iterrows():
    start = [non_bench['True'] * 100,
             non_bench['True'] * 100]
    end = [non_bench['True'] * 100,
             non_bench['Majority'] * 100]
    ax.plot(start, end, c='b', lw=1, zorder=1)
    ax.scatter(start[0], start[1], marker='o', facecolors='b', edgecolors='b', 
               s=20, zorder=2, alpha=0.6)
    ax.scatter(end[0], end[1], marker='o', facecolors='w', edgecolors='b',
               s=20, zorder=2, alpha=0.6)

# Plot benchmark hospitals in red
mask = top_30_results['top_30'] == True
bench = top_30_results[mask]

for i, val in bench.iterrows():
    start = [bench['True'] * 100,
             bench['True'] * 100]
    end = [bench['True'] * 100,
             bench['Majority'] * 100]
    ax.plot(start, end, c='r', lw=1, zorder=1)
    ax.scatter(start[0], start[1], marker='o', facecolors='r', edgecolors='r', 
               s=20, zorder=2, alpha=0.6)
    ax.scatter(end[0], end[1], marker='o', facecolors='w', edgecolors='r',
               s=20, zorder=2, alpha=0.6)

# Add mods 
ax.set_xlabel('Actual thrombolysis rate (%)')
ax.set_ylabel('Predicted benchmark thrombolysis rate (%)')
ax.set_xlim(0, 57)
ax.set_ylim(0, 57)
ax.grid()

custom_lines = [Line2D([0], [0], color='r', alpha=0.6, lw=2),
                Line2D([0], [0], color='b', alpha = 0.6,lw=2)]

plt.legend(custom_lines, ['Current top 30', 'Outside current top 30'],
          loc='lower right')

plt.tight_layout()
plt.savefig('output/top_30_actual_thrombolysis.jpg', dpi=300)

plt.show()
../_images/random_forest_compare_highest_thrombolysing_with_benchmark_22_0.png

Calculated weighted average#

Weight thrombolysis by admission numbers

base_rates = (
    (top_30_results['True'] * top_30_results['admissions']).sum() / 
    top_30_results['admissions'].sum())

top_30_rates = (
    (top_30_results['Majority'] * top_30_results['admissions']).sum() /
    top_30_results['admissions'].sum())
print (f'Baseline thrombolysis: {base_rates*100:0.1f}')
print (f'Top 30 decision thrombolysis: {top_30_rates*100:0.1f}')
ratio = top_30_rates / base_rates
print (f'Ratio of top 30 deiciions to base rate: {ratio:0.3f}')
Baseline thrombolysis: 29.5
Top 30 decision thrombolysis: 32.7
Ratio of top 30 deiciions to base rate: 1.108

Observations#

  • When the decision to thrombolyse is made by a majority vote of the top thrombolysing hospitals, the overall thrombolysis rate increases from 29.5% to 32.7%.

  • The change in thrombolysis use is significantly smaller than if the comparator hospitals are those 30 hospitals which have the highest predicted thrombolysis use in a standard cohort of patients (where thrombolysis use is predicted to rise to 36.9%).

  • The thrombolysis rate does not increase in all hospitals.