Create region-averaged data#

This is a copy of notebook 3 (“Create maps”) with the mapping elements removed.

Average the data for all LSOAs in a given region to find new values, e.g. the population-weighted mean IMD score per IVT catchment, or the mean transfer time by ambulance trust.

Most of this notebook involves:

  • Load the data for all LSOAs

  • Pick out only the LSOAs in a given region

  • Average or sum the data for those LSOAs
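
The load → pick → aggregate pattern can be sketched with toy data (the column names here are illustrative only, not the real dataset's):

```python
import pandas as pd

# Toy LSOA-level data: each row is one LSOA with its region and a value.
df = pd.DataFrame({
    'lsoa': ['A', 'B', 'C', 'D'],
    'region': ['North', 'North', 'South', 'South'],
    'population': [100, 300, 200, 400],
})

# Pick out only the LSOAs in a given region...
df_north = df[df['region'] == 'North']

# ...then average or sum the data for those LSOAs.
total_north = df_north['population'].sum()
```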

Notes:

  • Change from Notebook 3: ethnicity now looks at “white British” vs. “other than white British” groups rather than “white” vs “other than white”.

1 Set up#

1.1 Import libraries and define file paths#

from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
from pandas.api.types import is_numeric_dtype
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Singleton object for storing paths to data and database.'''

    data = './data'
    collated = 'collated_data_amb.csv'

    hospitals = 'hospitals'
    stroke_hospitals = 'stroke_hospitals_2022.csv'

    shapefiles = 'shapefiles'
    lsoa_shp = ('Lower_layer_super_output_areas_(E+W)_2011_Boundaries_' +
                '(Generalised_Clipped)_V2.zip')


paths = Paths()

1.2 Load data#

LSOA information - load collated_data_amb.csv produced by 01_combine_demographic_data.ipynb, which has information on each LSOA.

df_lsoa = pd.read_csv(os.path.join(paths.data, paths.collated))

# Load in extra region information:
df_lsoa_regions = pd.read_csv('data/lsoa_2021/lsoa_regions.csv', index_col=0)
# Remove any repeated columns:
cols_to_keep = [c for c in df_lsoa_regions if c not in df_lsoa.columns]
# Merge with region information:
df_lsoa = pd.merge(df_lsoa, df_lsoa_regions[cols_to_keep],
                   left_on='LSOA', right_on='LSOA11NM', how='left')

df_lsoa.head(2)
LSOA admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate ... lsoa_code region region_code region_type short_code icb icb_code isdn lhb icb_lhb
0 Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 ... E01033311 NHS Hertfordshire and West Essex ICB - 06K E38000049 SICBL HE1 NHS Hertfordshire and West Essex Integrated Ca... E54000025 East of England (South) NaN NHS Hertfordshire and West Essex Integrated Ca...
1 Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 ... E01023920 NHS Hertfordshire and West Essex ICB - 06K E38000049 SICBL HE1 NHS Hertfordshire and West Essex Integrated Ca... E54000025 East of England (South) NaN NHS Hertfordshire and West Essex Integrated Ca...

2 rows × 128 columns

Check the contents of this dataframe:

for c in df_lsoa.columns:
    print(c)
LSOA
admissions
closest_ivt_unit
closest_ivt_unit_time
closest_mt_unit
closest_mt_unit_time
closest_mt_transfer
closest_mt_transfer_time
total_mt_time
ivt_rate
imd_2019_score
la_district_name_2019
income_domain_score
income_domain_rank
idaci_score
idaci_rank
idaopi_score
idaopi_rank
ethnic_group_all_categories_ethnic_group
ethnic_group_white_total
ethnic_group_white_english_welsh_scottish_northern_irish_british
ethnic_group_white_irish
ethnic_group_white_gypsy_or_irish_traveller
ethnic_group_white_other_white
ethnic_group_mixed_multiple_ethnic_group_total
ethnic_group_mixed_multiple_ethnic_group_white_and_black_caribbean
ethnic_group_mixed_multiple_ethnic_group_white_and_black_african
ethnic_group_mixed_multiple_ethnic_group_white_and_asian
ethnic_group_mixed_multiple_ethnic_group_other_mixed
ethnic_group_asian_asian_british_total
ethnic_group_asian_asian_british_indian
ethnic_group_asian_asian_british_pakistani
ethnic_group_asian_asian_british_bangladeshi
ethnic_group_asian_asian_british_chinese
ethnic_group_asian_asian_british_other_asian
ethnic_group_black_african_caribbean_black_british_total
ethnic_group_black_african_caribbean_black_british_african
ethnic_group_black_african_caribbean_black_british_caribbean
ethnic_group_black_african_caribbean_black_british_other_black
ethnic_group_other_ethnic_group_total
ethnic_group_other_ethnic_group_arab
ethnic_group_other_ethnic_group_any_other_ethnic_group
all_categories_general_health
very_good_or_good_health
fair_health
bad_or_very_bad_health
all_categories_long_term_health_problem_or_disability
day_to_day_activities_limited_a_lot
day_to_day_activities_limited_a_little
day_to_day_activities_not_limited
rural_urban_2011
population_all
age_band_all_0
age_band_all_5
age_band_all_10
age_band_all_15
age_band_all_20
age_band_all_25
age_band_all_30
age_band_all_35
age_band_all_40
age_band_all_45
age_band_all_50
age_band_all_55
age_band_all_60
age_band_all_65
age_band_all_70
age_band_all_75
age_band_all_80
age_band_all_85
age_band_all_90
population_females
age_band_females_0
age_band_females_5
age_band_females_10
age_band_females_15
age_band_females_20
age_band_females_25
age_band_females_30
age_band_females_35
age_band_females_40
age_band_females_45
age_band_females_50
age_band_females_55
age_band_females_60
age_band_females_65
age_band_females_70
age_band_females_75
age_band_females_80
age_band_females_85
age_band_females_90
population_males
age_band_males_0
age_band_males_5
age_band_males_10
age_band_males_15
age_band_males_20
age_band_males_25
age_band_males_30
age_band_males_35
age_band_males_40
age_band_males_45
age_band_males_50
age_band_males_55
age_band_males_60
age_band_males_65
age_band_males_70
age_band_males_75
age_band_males_80
age_band_males_85
age_band_males_90
ambulance_service
local_authority_district_22
LAD22NM
country
LSOA11NM
LSOA11CD
msoa11cd
lsoa_code
region
region_code
region_type
short_code
icb
icb_code
isdn
lhb
icb_lhb

Hospital data - load data on each hospital, with the aim of getting the locations of hospitals that provide IVT.

# Read in data on each hospital
gdf_units = gpd.read_file(
    os.path.join(paths.data, paths.hospitals, paths.stroke_hospitals))

# Create geometry from the Easting and Northing columns
gdf_units["geometry"] = gpd.points_from_xy(
        gdf_units.Easting, gdf_units.Northing)
gdf_units = gdf_units.set_crs(epsg=27700)

# Restrict to the units delivering thrombolysis
mask = gdf_units['Use_IVT'] == '1'
gdf_units = gdf_units[mask]

# Convert crs to epsg 3857
gdf_units = gdf_units.to_crs(epsg=3857)

# Preview dataframe
gdf_units.head(2)
Postcode Hospital_name Use_IVT Use_MT Use_MSU Country Strategic Clinical Network Health Board / Trust Stroke Team SSNAP name ... ivt_rate Easting Northing long lat Neuroscience 30 England Thrombectomy Example hospital_city Notes geometry
0 RM70AG RM70AG 1 1 1 England London SCN Barking Havering and Redbridge University Hospitals N... Queens Hospital Romford HASU ... 11.9 551118 187780 0.179030640661934 51.5686465521504 1 0 Romford POINT (19932.617 6722504.230)
1 E11BB E11BB 1 1 1 England London SCN Barts Health NHS Trust The Royal London Hospital Royal London Hospital HASU ... 13.4 534829 181798 -0.0581329916047372 51.5190178361295 1 1 Royal London POINT (-6468.377 6713620.832)

2 rows × 22 columns

Area data - import the shapefile of all LSOAs to calculate the area of each region.

(use the proper Office for National Statistics file, not the simplified one):

gdf_lsoa = gpd.read_file(os.path.join(
    paths.data, paths.shapefiles, paths.lsoa_shp),
    crs='EPSG:27700')

# Change the projection to a Cartesian system (EPSG:3857) and get area in
# square kilometers (through /10**6)
gdf_lsoa['polygon_area_km2'] = (gdf_lsoa.to_crs('EPSG:3857').area / 10**6)

# Merge with region information:
df_lsoa_regions = pd.read_csv('data/lsoa_2021/lsoa_regions.csv', index_col=0)

gdf_lsoa = pd.merge(gdf_lsoa, df_lsoa_regions.drop('LSOA11NM', axis='columns'),
                    on='LSOA11CD', how='left')

# Rename LSOA column to match df_lsoa:
gdf_lsoa = gdf_lsoa.rename(columns={'LSOA11NM': 'LSOA'})

gdf_lsoa.head()
LSOA11CD LSOA LSOA11NMW geometry polygon_area_km2 closest_ivt_unit closest_mt_unit closest_mt_transfer la_district_name_2019 rural_urban_2011 ... lsoa_code region region_code region_type short_code icb icb_code isdn lhb icb_lhb
0 E01000001 City of London 001A City of London 001A POLYGON ((532105.092 182011.230, 532162.491 18... 0.343907 E11BB E11BB E11BB City of London Urban major conurbation ... E01000001 NHS North East London ICB - A3A8R E38000255 SICBL NEL NHS North East London Integrated Care Board E54000029 London NaN NHS North East London Integrated Care Board
1 E01000002 City of London 001B City of London 001B POLYGON ((532746.813 181786.891, 532671.688 18... 0.583474 E11BB E11BB E11BB City of London Urban major conurbation ... E01000002 NHS North East London ICB - A3A8R E38000255 SICBL NEL NHS North East London Integrated Care Board E54000029 London NaN NHS North East London Integrated Care Board
2 E01000003 City of London 001C City of London 001C POLYGON ((532135.145 182198.119, 532158.250 18... 0.147840 E11BB E11BB E11BB City of London Urban major conurbation ... E01000003 NHS North East London ICB - A3A8R E38000255 SICBL NEL NHS North East London Integrated Care Board E54000029 London NaN NHS North East London Integrated Care Board
3 E01000005 City of London 001E City of London 001E POLYGON ((533807.946 180767.770, 533649.063 18... 0.491918 E11BB E11BB E11BB City of London Urban major conurbation ... E01000005 NHS North East London ICB - A3A8R E38000255 SICBL NEL NHS North East London Integrated Care Board E54000029 London NaN NHS North East London Integrated Care Board
4 E01000006 Barking and Dagenham 016A Barking and Dagenham 016A POLYGON ((545122.049 184314.931, 545271.917 18... 0.372257 RM70AG RM70AG RM70AG Barking and Dagenham Urban major conurbation ... E01000006 NHS North East London ICB - A3A8R E38000255 SICBL NEL NHS North East London Integrated Care Board E54000029 London NaN NHS North East London Integrated Care Board

5 rows × 25 columns

1.3 Define functions for calculating weighted mean and proportions#

Group data by a specified geography (e.g. closest IVT unit) and calculate the weighted average of specified columns (based on a Stack Overflow answer).

def weighted_av(df, group_by, col, new_col, weight='population_all'):
    '''
    Group by a specified column, then find the weighted average of another
    column. Returns both the plain mean and the weighted mean.
    Inputs:
    - df - dataframe, contains data and columns to do calculation
    - group_by - string, column to group by
    - col - string, column to find average of
    - new_col - string, name of column which will contain weighted mean
    - weight - string, name of column to weight by, default 'population_all'
    '''
    res = (df
           .groupby(group_by)
           .apply(lambda x: pd.Series([np.mean(x[col]),
                                       np.average(x[col], weights=x[weight])],
                                      index=['mean', new_col])))
    return (res)
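As a quick sanity check of the weighting logic (toy numbers, not real data), the weighted mean pulls towards the more populous LSOAs while the plain mean treats each LSOA equally:

```python
import numpy as np
import pandas as pd

# Toy data: two regions, each containing two "LSOAs".
df = pd.DataFrame({
    'region': ['X', 'X', 'Y', 'Y'],
    'score': [10.0, 30.0, 20.0, 40.0],
    'population_all': [100, 300, 500, 500],
})

# Mirror of the calculation inside weighted_av():
res = (df
       .groupby('region')
       .apply(lambda x: pd.Series(
           [np.mean(x['score']),
            np.average(x['score'], weights=x['population_all'])],
           index=['mean', 'weighted_mean'])))
# Region X: plain mean is 20, weighted mean is (10*100 + 30*300) / 400 = 25.
```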

Group data by a specified geography (e.g. ambulance trust) and calculate a proportion.

def find_proportion(subgroup_col, overall_col, proportion_col, group_col, df):
    '''
    Groupby a particular geography, and find a proportion from two provided
    columns
    Inputs:
    subgroup_col - string, numerator, has population counts for a subgroup
    overall_col - string, denominator, has population count overall
    proportion_col - string, name for the column produced
    group_col - string, column to group by when calculating proportions
    df - dataframe, with columns used above
    '''
    # Sum the columns for each group
    res = df.groupby(group_col).agg({
        subgroup_col: 'sum',
        overall_col: 'sum'
    })

    # Find the proportions for each group
    res[proportion_col] = res[subgroup_col] / res[overall_col]

    return res
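The group-then-divide logic can be checked on toy counts (illustrative numbers only). Note the result is the sum of numerators over the sum of denominators, not the mean of per-LSOA proportions:

```python
import pandas as pd

# Toy data: two "LSOAs" in region X, one in region Y.
df = pd.DataFrame({
    'region': ['X', 'X', 'Y'],
    'subgroup': [10, 20, 5],
    'overall': [100, 100, 50],
})

# Mirror of the calculation inside find_proportion():
res = df.groupby('region').agg({'subgroup': 'sum', 'overall': 'sum'})
res['proportion'] = res['subgroup'] / res['overall']
# Region X: (10 + 20) / (100 + 100) = 0.15
```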

Find counts of people in each category, then calculate proportion.

def find_count_and_proportion(df_lsoa, area, col, count, prop):
    '''
    Finds count of people in each category for each area (based on LSOA info
    and counts), and then calculates proportion in particular category.
    Requires col to contain the strings 'True' and 'False', where 'True'
    marks the group whose proportion you want (so it calculates
    'True' / ('True' + 'False'))
    Inputs:
    - df_lsoa - dataframe with information on each LSOA
    - area - string, column to group areas by
    - col - string, column with data of interest
    - count - string, column with population count
    - prop - string, name of new column with proportion
    '''
    # Find count for each area of people in each category
    counts = (df_lsoa.groupby([area, col])[count].sum())

    # Reformat dataframe
    counts = counts.reset_index(name='n')
    counts = counts.pivot(
        index=area,
        columns=col,
        values='n').reset_index().rename_axis(None, axis=1)

    # Set NaN to 0
    counts = counts.replace(np.nan, 0)

    # Find proportion
    counts[prop] = counts['True'] / (counts['True'] + counts['False'])

    # Set index to the area unit
    counts = counts.set_index(area)

    # Rename 'True' and 'False' columns to make their meanings clearer
    # when the data is merged into another dataframe:
    s = '_'.join(prop.split('_')[1:])
    counts = counts.rename(columns={'True': f'{s}_True', 'False': f'{s}_False'})

    return (counts)
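The pivot step can be seen on toy data (illustrative numbers only). Region Y has no 'True' rows, so its pivoted count is NaN until the replace-with-zero step:

```python
import numpy as np
import pandas as pd

# Toy data with the string flags 'True'/'False' expected by the function.
df = pd.DataFrame({
    'region': ['X', 'X', 'X', 'Y'],
    'rural': ['True', 'False', 'True', 'False'],
    'population_all': [50, 150, 50, 200],
})

# Mirror of the steps inside find_count_and_proportion():
counts = (df.groupby(['region', 'rural'])['population_all'].sum()
            .reset_index(name='n')
            .pivot(index='region', columns='rural', values='n')
            .reset_index().rename_axis(None, axis=1)
            .replace(np.nan, 0))
counts['proportion_rural'] = counts['True'] / (counts['True'] + counts['False'])
counts = counts.set_index('region')
# Region X: 100 rural of 250 total -> 0.4; region Y has no rural rows -> 0.0.
```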

2. Calculate new LSOA-level data#

Must create the following columns in df_lsoa:

  • age_65_plus_count

  • ethnic_group_other_than_white_british

  • long_term_health_count

  • rural

  • ivt_within_30

  • closest_hospital_is_mt

# Add count of "other than white British"
df_lsoa['ethnic_group_other_than_white_british'] = (
    df_lsoa['ethnic_group_all_categories_ethnic_group'] -
    df_lsoa['ethnic_group_white_english_welsh_scottish_northern_irish_british']
)

# Add column with count of long-term health problems (limited a little or a lot)
df_lsoa['long_term_health_count'] = (
    df_lsoa['all_categories_long_term_health_problem_or_disability'] -
    df_lsoa['day_to_day_activities_not_limited']
)

# Extract whether rural or urban (each classification string starts with that word)
df_lsoa['ruc_overall'] = df_lsoa['rural_urban_2011'].str[:5]
# Add column where True if rural
df_lsoa['rural'] = (df_lsoa['ruc_overall'] == 'Rural').map({
    True: 'True', False: 'False'})

# Find count of people age 65 plus
df_lsoa['age_65_plus_count'] = df_lsoa[[
    'age_band_all_65', 'age_band_all_70', 'age_band_all_75',
    'age_band_all_80', 'age_band_all_85', 'age_band_all_90']].sum(axis=1)

# Create column indicating if IVT unit is within 30 minutes
df_lsoa['ivt_within_30'] = (df_lsoa['closest_ivt_unit_time'] < 30).map(
    {True: 'True', False: 'False'})

# Add column with whether closest hospital offers thrombectomy
df_lsoa['closest_hospital_is_mt'] = (
    df_lsoa['closest_mt_transfer_time'] == 0).map({True: 'True',
                                                   False: 'False'})
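One detail worth noting: `.map({True: 'True', False: 'False'})` stores the flags as strings rather than booleans, so the later pivot in `find_count_and_proportion()` produces columns literally named 'True' and 'False'. A minimal illustration with hypothetical travel times (not real data):

```python
import pandas as pd

# Hypothetical travel times in minutes:
times = pd.Series([12.5, 45.0, 29.9])

# Boolean comparison, then map to the strings the later pivot expects:
ivt_within_30 = (times < 30).map({True: 'True', False: 'False'})
```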

3. Big function for region-level data creation#

Create one big function that does all of the calculations needed for averaging data across any given region.

def big_data_creation(df_lsoa, gdf_lsoa, group_by):
    df_lsoa = df_lsoa.copy()  # just in case
    gdf_lsoa = gdf_lsoa.copy()  # just in case
    # Find all of the possible options for this region type:
    index_values = list(set(df_lsoa[group_by]))
    try:
        index_values = sorted(index_values)
    except TypeError:
        pass
    
    # Empty dataframe to place the results into:
    df_results = pd.DataFrame(index=index_values)

    # Find population of each region:
    pop = df_lsoa[[group_by, 'population_all']].groupby(group_by)['population_all'].sum()
    # Find area of each region:
    gdf_lsoa = gdf_lsoa[[group_by, 'polygon_area_km2']].groupby(group_by).sum()
    gdf_lsoa = pd.merge(gdf_lsoa, pop, left_index=True, right_index=True)
    # Divide population by area to get population density
    gdf_lsoa['population_density'] = (
        gdf_lsoa['population_all'] / gdf_lsoa['polygon_area_km2'])
    
    df_results = pd.merge(df_results, gdf_lsoa.drop('population_all', axis='columns'),
                          left_index=True, right_index=True, how='left')
    
    # --- Weighted averages ---
    # Pre-requisites:
    # Must have already calculated the following columns in df_lsoa:
    # 'age_65_plus_count'
    
    cols_for_weighted_av = [
        # Population-weighted mean income domain score:
        dict(col='income_domain_score',
             new_col='income_domain_weighted_mean'),
        # Population-weighted mean IMD score:
        dict(col='imd_2019_score', new_col='imd_weighted_mean'),
        # Average travel time to IVT unit weighted by number of
        # over 65 year olds in catchment:
        dict(col='closest_ivt_unit_time', new_col='weighted_ivt_time',
             weight='age_65_plus_count'),
        # Population-weighted mean time to MT unit:
        dict(col='total_mt_time', new_col='mt_time_weighted_mean'),
        # Population-weighted mean time to IVT unit:
        dict(col='closest_ivt_unit_time',
             new_col='ivt_time_weighted_mean'),
        # Population-weighted mean transfer time between IVT and MT unit:
        dict(col='closest_mt_transfer_time',
             new_col='mt_transfer_time_weighted_mean'),   
    ]
    
    for kwargs in cols_for_weighted_av:
        # Calculate results.
        # 'weighted_av()' function returns a df with index named group_by and
        # columns named 'mean' and new_col.
        df_w = weighted_av(
            df_lsoa,
            group_by=group_by,
            **kwargs
        )
        # Store in big results df.
        # Discard 'mean', keep new_col.
        df_results = pd.merge(df_results, df_w[kwargs['new_col']],
                              left_index=True, right_index=True, how='left')
    
    # --- Find proportions ---
    # Pre-requisites:
    # Must have already calculated the following columns in df_lsoa:
    # 'ethnic_group_other_than_white_british'
    # 'long_term_health_count'
    # 'age_65_plus_count'
    cols_for_proportion = [
        # Ethnicity - proportion other than white British
        dict(subgroup_col='ethnic_group_other_than_white_british',
             overall_col='ethnic_group_all_categories_ethnic_group',
             proportion_col='ethnic_minority_proportion',
             df=df_lsoa),
        # General health - proportion with bad/very bad health
        dict(subgroup_col='bad_or_very_bad_health',
             overall_col='all_categories_general_health',
             proportion_col='bad_health_proportion',
             df=df_lsoa),
        # Long-term health problem or disability
        dict(subgroup_col='long_term_health_count',
             overall_col='all_categories_long_term_health_problem_or_disability',
             proportion_col='long_term_health_proportion',
             df=df_lsoa),
        # Age - proportion aged 65+
        dict(subgroup_col='age_65_plus_count',
             overall_col='population_all',
             proportion_col='age_65_plus_proportion',
             df=df_lsoa),
    ]
    # 'find_proportion()' function.
    # Notebook 3 stored the proportion_col column, discarded the rest.
    # Here, keep all resulting columns. First is number of True,
    # second is total number (True + False), third is proportion.
    for kwargs in cols_for_proportion:
        # Calculate results.
        df_p = find_proportion(group_col=group_by, **kwargs)
        
        # Store in big results df.
        df_results = pd.merge(df_results, df_p,
                              left_index=True, right_index=True, how='left')
    
    # --- Find count and proportions ---
    # Pre-requisites:
    # Must have already calculated the following columns in df_lsoa:
    # 'rural'
    # 'ivt_within_30'
    # 'closest_hospital_is_mt'
    cols_for_count_and_proportion = [
        # Rural/urban - proportion of people living in rural areas
        dict(col='rural', count='population_all', prop='proportion_rural'),
        # Proportion of 65+ year olds living within 30 minutes of their closest IVT unit
        dict(col='ivt_within_30', count='age_65_plus_count',
             prop='proportion_over_65_within_30'),
        # Proportion of patients whose nearest hospital is an MT unit
        dict(col='closest_hospital_is_mt', count='population_all',
             prop='proportion_closest_is_mt'),
    ]
    # 'find_count_and_proportion()' function.
    for kwargs in cols_for_count_and_proportion:
        # Calculate results.
        df_c = find_count_and_proportion(df_lsoa=df_lsoa, area=group_by, **kwargs)
        
        # Store in big results df.
        df_results = pd.merge(df_results, df_c,
                              left_index=True, right_index=True, how='left')
    return df_results

Run the big data collection function for each different region type:

region_types = [
    'LSOA',
    'closest_ivt_unit',
    'closest_mt_unit',
    'closest_mt_transfer',
    'ambulance_service',
    # 'LAD22NM',
    'icb_code',
    'isdn',
    'lhb',
    'icb_lhb'
]

for region_type in region_types:
    df_results = big_data_creation(df_lsoa, gdf_lsoa, region_type)

    if region_type == 'closest_ivt_unit':
        # Do bonus calculations.
        # Extract rate for each unit, with unit as index
        ivt_rate = (df_lsoa[['closest_ivt_unit', 'ivt_rate']]
                    .drop_duplicates()
                    .set_index('closest_ivt_unit')['ivt_rate'])
        df_results = pd.merge(df_results, ivt_rate,
                              left_index=True, right_index=True, how='left')

        # Get admissions to each unit from 2021/22
        admissions_2122 = (
            gdf_units[['Postcode', 'Admissions 21/22']]
               .set_index('Postcode')
               .rename(columns={'Admissions 21/22': 'admissions_2122'})
               .astype(int)
        )
        df_results = pd.merge(df_results, admissions_2122,
                              left_index=True, right_index=True, how='left')
    else:
        # Don't do anything extra.
        pass

    # Make sure data is not overly-precise before saving:
    cols_to_round = df_results.select_dtypes(include=['float']).columns
    df_results[cols_to_round] = df_results[cols_to_round].round(4)

    # Set index name explicitly to make sure something gets saved:
    df_results.index.name = region_type
    
    # Save to file:
    df_results.to_csv(f'data/collated_data_by_region/collated_data_regional_{region_type}.csv')