Plotting thrombolysis rate by feature value for low and high thrombolysing hopsitals#

Plain English summary#

This experiment plots the relationships between feature values and thrombolysis use for low and high thrombolysing hospitals. The high and low thrombolysing hopsitals are taken as the top and bottom 30 hospitals as ranked by the predicted thrombolysis use in the same 10k cohort of patients. We also call the top 30 thrombolysing hospitals ‘benchmark’ hospitals.

We find that thrombolysis use in low thrombolysing hospitals follows the same general relationship with feature values as the high thrombolysing hospitals, but thrombolysis is consistently lower.

Aims#

  • Plots the relationships between feature values and thrombolysis use for low and high thrombolysing hospitals

Observations#

  • Thrombolysis use in low thrombolysing hospitals follows the same general relationship with feature values as the high thrombolysing hospitals, but thrombolysis is consistently lower.

# Turn warnings off to keep notebook tidy
import warnings
warnings.filterwarnings("ignore")

import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Read in JSON file#

Contains a dictionary for plain English feature names for the 8 features selected in the model. Use these as the column titles in the DataFrame.

with open("./output/feature_name_dict.json") as json_file:
    feature_name_dict = json.load(json_file)

Load data on predicted 10k cohort thrombolysis use at each hospital#

Use the hospitals thrombolysis rate on the same set of 10k patients to select the 30 hospitals with the highest thrombolysis rates.

thrombolysis_by_hosp = pd.read_csv(
    './output/10k_thrombolysis_rate_by_hosp_key_features.csv', index_col='stroke_team')
thrombolysis_by_hosp.sort_values(
    'Thrombolysis rate', ascending=False, inplace=True)
thrombolysis_by_hosp.head()
Thrombolysis rate
stroke_team
VKKDD9172T 0.4610
GKONI0110I 0.4356
CNBGF2713O 0.4207
HPWIF9956L 0.4191
MHMYL4920B 0.3981

Load 10K test data#

data_loc = '../data/10k_training_test/'
test = pd.read_csv(data_loc + 'cohort_10000_test.csv')
test.head()
StrokeTeam S1AgeOnArrival S1OnsetToArrival_min S2RankinBeforeStroke Loc LocQuestions LocCommands BestGaze Visual FacialPalsy ... AFAnticoagulentHeparin_missing S2NewAFDiagnosis_No S2NewAFDiagnosis_Yes S2NewAFDiagnosis_missing S2StrokeType_Infarction S2TIAInLastMonth_No S2TIAInLastMonth_No but S2TIAInLastMonth_Yes S2TIAInLastMonth_missing S2Thrombolysis
0 VUHVS8909F 72.5 80.0 0 3 2.0 2.0 2.0 3.0 3.0 ... 0 1 0 0 0 0 0 0 1 0
1 HZNVT9936G 72.5 87.0 2 0 0.0 0.0 0.0 0.0 1.0 ... 0 0 0 1 1 0 0 0 1 0
2 FAJKD7118X 67.5 140.0 3 0 2.0 0.0 1.0 2.0 2.0 ... 0 1 0 0 1 0 0 0 1 1
3 TPXYE0168D 77.5 108.0 0 0 0.0 0.0 0.0 1.0 2.0 ... 1 0 0 1 1 0 0 0 1 1
4 DNOYM6465G 87.5 51.0 4 1 1.0 1.0 0.0 1.0 1.0 ... 0 0 0 1 1 0 0 0 1 0

5 rows × 87 columns

Get data for benchmark hospitals and low thrombolysing hopsitals#

# Get list of key features to plot
number_of_features_to_use = 8
key_features = pd.read_csv('./output/feature_selection.csv')
key_features = list(key_features['feature'])[:number_of_features_to_use]
key_features.append('S2Thrombolysis')

# Get benchmark data
benchmark_hospitals = list(thrombolysis_by_hosp.index)[0:30]
mask = test.apply(lambda x: x['StrokeTeam'] in benchmark_hospitals, axis=1)
benchmark_data = test[mask]
benchmark_data = benchmark_data[key_features]
benchmark_data.rename(columns=feature_name_dict, inplace=True)
benchmark_data.sort_values('Stroke team', inplace=True, axis=0)

# Get low thrombolysis hospital data
low_thrombolysis_hospitals = list(thrombolysis_by_hosp.index)[-30:]
mask = test.apply(lambda x: x['StrokeTeam'] in low_thrombolysis_hospitals, axis=1)
low_thrombolysis_data = test[mask]
low_thrombolysis_data = low_thrombolysis_data[key_features]
low_thrombolysis_data.rename(columns=feature_name_dict, inplace=True)
low_thrombolysis_data.sort_values('Stroke team', inplace=True, axis=0)

Plot thrombolysis by feature value#

feat_to_show = [feature_name_dict[feat] for feat in key_features][0:7]
feat_to_show.remove('Stroke team')
fig = plt.figure(figsize=(12,9))

for n, feat in enumerate(feat_to_show):
    
    ax = fig.add_subplot(2,3,n+1)
    
    ### Plot benchmark units ###
    
    feature_data = benchmark_data[feat]
    
    # if feature has more that 50 unique values, then assume it needs to be 
    # binned (other assume they are unique categories)
    if np.unique(feature_data).shape[0] > 50:
        # bin the data
       
        # settings for the plot
        rotation = 45
        step = 30
        n_bins = 8
        
        # create list of bin values
        bin_list = [(i*step) for i in range(n_bins)]
        bin_list.append(feature_data.max())

        # create list of bins (the unique categories)
        category_list =  [f'{i*step}-{(i+1)*step}' for i in range(n_bins-1)]
        category_list.append(f'{(n_bins)*step}+')

        # bin the feature data
        feature_data = pd.cut(feature_data, bin_list, labels=category_list)

    else:
        # create a violin per unique value
        
        # settings for the plot
        rotation = 45
        
        # create list of unique categories in the feature data
        category_list = np.unique(feature_data)
        
        # if numerical set as int, otherwise set as a range
        try:
            category_list = [int(i) for i in category_list]
        except: 
            category_list = range(len(category_list))
            
    feature_data = pd.DataFrame(feature_data)
    feature_data['Thrombolysis'] = benchmark_data['Thrombolysis']
    
    df = feature_data.groupby(feat).mean()

    ax.plot(df, color='b', marker='o', linestyle='solid', label='Top 30')
    
    ### Plot low thrombolysing units ###
    
    feature_data = low_thrombolysis_data[feat]
    
    # if feature has more that 50 unique values, then assume it needs to be 
    # binned (other assume they are unique categories)
    if np.unique(feature_data).shape[0] > 50:
        # bin the data
       
        # settings for the plot
        rotation = 45
        step = 30
        n_bins = 8
        
        # create list of bin values
        bin_list = [(i*step) for i in range(n_bins)]
        bin_list.append(feature_data.max())

        # create list of bins (the unique categories)
        category_list =  [f'{i*step}-{(i+1)*step}' for i in range(n_bins-1)]
        category_list.append(f'{(n_bins)*step}+')

        # bin the feature data
        feature_data = pd.cut(feature_data, bin_list, labels=category_list)

    else:
        # create a violin per unique value
        
        # settings for the plot
        rotation = 45
        
        # create list of unique categories in the feature data
        category_list = np.unique(feature_data)
        
        # if numerical set as int, otherwise set as a range
        try:
            category_list = [int(i) for i in category_list]
        except: 
            category_list = range(len(category_list))
            
    feature_data = pd.DataFrame(feature_data)
    feature_data['Thrombolysis'] = low_thrombolysis_data['Thrombolysis']
    
    df = feature_data.groupby(feat).mean()

    ax.plot(df, color='r', marker='s', linestyle='dashed', 
            label='Bottom 30')
    ax.set_title(feat)
    ax.set_ylim(0, 0.62)
    ax.set_xlabel(feat) 
    ax.set_ylabel('Proportion receiving thrombolysis')
    ax.legend()
    
    # Customise axis for particular plots
    # Force just two points on axis when a binary feature
    if len(df.index)  == 2:
        ax.set_xticks(df.index)
        ax.set_xlim(-0.5, 1.5)
    # Censor stroke severity at 35 due to lack of data
    if feat == 'Stroke severity':
        ax.set_xlim(0,35)
       
    # Rotate ticks for arrival-to-scan
    if feat == 'Arrival-to-scan time':
        for tick in ax.get_xticklabels():
            tick.set_rotation(45)

plt.tight_layout(pad=2)
plt.savefig('output/compare_thrombolysis_by_feature.jpg', dpi=300,
     bbox_inches='tight', pad_inches=0.2)
../../_images/09_plot_thrombolysis_at_high_vs_low_thrombolysing_units_13_0.png

Observations#

Thrombolysis use in low thrombolysing hospitals follows the same general relationship with feature values as the high thrombolysing hospitals, but thrombolysis is consistently lower.