Create region-averaged data#
This is a copy of notebook 3 (“Create maps”) with the mapping elements removed.
Average the data for all LSOAs in a given region to find new values for e.g. population weighted mean IMD score per IVT catchment, or mean transfer time by ambulance trust.
Most of this notebook involves:
Load the data for all LSOAs
Pick out only the LSOAs in a given region
Average or sum the data for those LSOAs
Notes:
Change from Notebook 3: ethnicity now looks at “white British” vs. “other than white British” groups rather than “white” vs “other than white”.
1 Set up#
1.1 Import libraries and define file paths#
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
from pandas.api.types import is_numeric_dtype
# Define file paths
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Frozen singleton-style container of paths to the data files.'''
    # NOTE: without the type annotations these would be plain class
    # variables and @dataclass would generate no fields at all (no
    # __init__ fields, no __eq__/__repr__ content, frozen-ness moot).
    # Annotating them makes them proper defaulted, frozen fields.
    data: str = './data'
    collated: str = 'collated_data_amb.csv'
    hospitals: str = 'hospitals'
    stroke_hospitals: str = 'stroke_hospitals_2022.csv'
    shapefiles: str = 'shapefiles'
    lsoa_shp: str = ('Lower_layer_super_output_areas_(E+W)_2011_Boundaries_' +
                     '(Generalised_Clipped)_V2.zip')

paths = Paths()
1.2 Load data#
LSOA information - load collated_data_amb.csv
produced by 01_combine_demographic_data.ipynb
which has information on each LSOA.
# LSOA information: one row per LSOA, produced by
# 01_combine_demographic_data.ipynb (file name held in paths.collated).
df_lsoa = pd.read_csv(os.path.join(paths.data, paths.collated))
# Load in extra region information:
df_lsoa_regions = pd.read_csv('data/lsoa_2021/lsoa_regions.csv', index_col=0)
# Remove any repeated columns so the merge does not duplicate them:
cols_to_keep = [c for c in df_lsoa_regions if c not in df_lsoa.columns]
# Merge with region information. Left join on LSOA name keeps every LSOA
# from the collated data even if it has no region match:
df_lsoa = pd.merge(df_lsoa, df_lsoa_regions[cols_to_keep],
                   left_on='LSOA', right_on='LSOA11NM', how='left')
df_lsoa.head(2)
LSOA | admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | ... | lsoa_code | region | region_code | region_type | short_code | icb | icb_code | isdn | lhb | icb_lhb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | ... | E01033311 | NHS Hertfordshire and West Essex ICB - 06K | E38000049 | SICBL | HE1 | NHS Hertfordshire and West Essex Integrated Ca... | E54000025 | East of England (South) | NaN | NHS Hertfordshire and West Essex Integrated Ca... |
1 | Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | ... | E01023920 | NHS Hertfordshire and West Essex ICB - 06K | E38000049 | SICBL | HE1 | NHS Hertfordshire and West Essex Integrated Ca... | E54000025 | East of England (South) | NaN | NHS Hertfordshire and West Essex Integrated Ca... |
2 rows × 128 columns
Check the contents of this dataframe:
# List every column of the merged LSOA dataframe, one per line.
for column_name in df_lsoa.columns:
    print(column_name)
LSOA
admissions
closest_ivt_unit
closest_ivt_unit_time
closest_mt_unit
closest_mt_unit_time
closest_mt_transfer
closest_mt_transfer_time
total_mt_time
ivt_rate
imd_2019_score
la_district_name_2019
income_domain_score
income_domain_rank
idaci_score
idaci_rank
idaopi_score
idaopi_rank
ethnic_group_all_categories_ethnic_group
ethnic_group_white_total
ethnic_group_white_english_welsh_scottish_northern_irish_british
ethnic_group_white_irish
ethnic_group_white_gypsy_or_irish_traveller
ethnic_group_white_other_white
ethnic_group_mixed_multiple_ethnic_group_total
ethnic_group_mixed_multiple_ethnic_group_white_and_black_caribbean
ethnic_group_mixed_multiple_ethnic_group_white_and_black_african
ethnic_group_mixed_multiple_ethnic_group_white_and_asian
ethnic_group_mixed_multiple_ethnic_group_other_mixed
ethnic_group_asian_asian_british_total
ethnic_group_asian_asian_british_indian
ethnic_group_asian_asian_british_pakistani
ethnic_group_asian_asian_british_bangladeshi
ethnic_group_asian_asian_british_chinese
ethnic_group_asian_asian_british_other_asian
ethnic_group_black_african_caribbean_black_british_total
ethnic_group_black_african_caribbean_black_british_african
ethnic_group_black_african_caribbean_black_british_caribbean
ethnic_group_black_african_caribbean_black_british_other_black
ethnic_group_other_ethnic_group_total
ethnic_group_other_ethnic_group_arab
ethnic_group_other_ethnic_group_any_other_ethnic_group
all_categories_general_health
very_good_or_good_health
fair_health
bad_or_very_bad_health
all_categories_long_term_health_problem_or_disability
day_to_day_activities_limited_a_lot
day_to_day_activities_limited_a_little
day_to_day_activities_not_limited
rural_urban_2011
population_all
age_band_all_0
age_band_all_5
age_band_all_10
age_band_all_15
age_band_all_20
age_band_all_25
age_band_all_30
age_band_all_35
age_band_all_40
age_band_all_45
age_band_all_50
age_band_all_55
age_band_all_60
age_band_all_65
age_band_all_70
age_band_all_75
age_band_all_80
age_band_all_85
age_band_all_90
population_females
age_band_females_0
age_band_females_5
age_band_females_10
age_band_females_15
age_band_females_20
age_band_females_25
age_band_females_30
age_band_females_35
age_band_females_40
age_band_females_45
age_band_females_50
age_band_females_55
age_band_females_60
age_band_females_65
age_band_females_70
age_band_females_75
age_band_females_80
age_band_females_85
age_band_females_90
population_males
age_band_males_0
age_band_males_5
age_band_males_10
age_band_males_15
age_band_males_20
age_band_males_25
age_band_males_30
age_band_males_35
age_band_males_40
age_band_males_45
age_band_males_50
age_band_males_55
age_band_males_60
age_band_males_65
age_band_males_70
age_band_males_75
age_band_males_80
age_band_males_85
age_band_males_90
ambulance_service
local_authority_district_22
LAD22NM
country
LSOA11NM
LSOA11CD
msoa11cd
lsoa_code
region
region_code
region_type
short_code
icb
icb_code
isdn
lhb
icb_lhb
Hospital data - load data on each hospital, with aim of getting locations of hospitals that provide IVT.
# Read in data on each hospital.
gdf_units = gpd.read_file(
    os.path.join(paths.data, paths.hospitals, paths.stroke_hospitals))
# Build the geometry from the Easting and Northing columns
# (British National Grid coordinates):
gdf_units["geometry"] = gpd.points_from_xy(
    gdf_units.Easting, gdf_units.Northing)
gdf_units = gdf_units.set_crs(epsg=27700)
# Restrict to the units delivering thrombolysis (IVT).
# NOTE(review): 'Use_IVT' is compared as the string '1' — presumably the
# file stores it as text; confirm the column dtype.
mask = gdf_units['Use_IVT'] == '1'
gdf_units = gdf_units[mask]
# Convert crs to epsg 3857 (Web Mercator):
gdf_units = gdf_units.to_crs(epsg=3857)
# Preview dataframe
gdf_units.head(2)
Postcode | Hospital_name | Use_IVT | Use_MT | Use_MSU | Country | Strategic Clinical Network | Health Board / Trust | Stroke Team | SSNAP name | ... | ivt_rate | Easting | Northing | long | lat | Neuroscience | 30 England Thrombectomy Example | hospital_city | Notes | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RM70AG | RM70AG | 1 | 1 | 1 | England | London SCN | Barking | Havering and Redbridge University Hospitals N... | Queens Hospital Romford HASU | ... | 11.9 | 551118 | 187780 | 0.179030640661934 | 51.5686465521504 | 1 | 0 | Romford | POINT (19932.617 6722504.230) | |
1 | E11BB | E11BB | 1 | 1 | 1 | England | London SCN | Barts Health NHS Trust | The Royal London Hospital | Royal London Hospital HASU | ... | 13.4 | 534829 | 181798 | -0.0581329916047372 | 51.5190178361295 | 1 | 1 | Royal London | POINT (-6468.377 6713620.832) |
2 rows × 22 columns
Area data - Import geojson of all LSOA to calculate area of each region.
(use the proper Office for National Statistics file, not the simplified one):
# Load the LSOA boundary shapefile (British National Grid, EPSG:27700)
# to calculate the area of each LSOA.
gdf_lsoa = gpd.read_file(os.path.join(
    paths.data, paths.shapefiles, paths.lsoa_shp),
    crs='EPSG:27700')
# Change the projection to a Cartesian system (EPSG:3857) and get area in
# square kilometers (through /10**6).
# NOTE(review): EPSG:3857 (Web Mercator) distorts areas away from the
# equator, so these values are approximate — confirm this is acceptable.
gdf_lsoa['polygon_area_km2'] = (gdf_lsoa.to_crs('EPSG:3857').area / 10**6)
# Merge with region information (drop the region file's LSOA11NM so the
# shapefile's own LSOA11NM column survives the merge):
df_lsoa_regions = pd.read_csv('data/lsoa_2021/lsoa_regions.csv', index_col=0)
gdf_lsoa = pd.merge(gdf_lsoa, df_lsoa_regions.drop('LSOA11NM', axis='columns'),
                    on='LSOA11CD', how='left')
# Rename LSOA column to match df_lsoa:
gdf_lsoa = gdf_lsoa.rename(columns={'LSOA11NM': 'LSOA'})
gdf_lsoa.head()
LSOA11CD | LSOA | LSOA11NMW | geometry | polygon_area_km2 | closest_ivt_unit | closest_mt_unit | closest_mt_transfer | la_district_name_2019 | rural_urban_2011 | ... | lsoa_code | region | region_code | region_type | short_code | icb | icb_code | isdn | lhb | icb_lhb | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | E01000001 | City of London 001A | City of London 001A | POLYGON ((532105.092 182011.230, 532162.491 18... | 0.343907 | E11BB | E11BB | E11BB | City of London | Urban major conurbation | ... | E01000001 | NHS North East London ICB - A3A8R | E38000255 | SICBL | NEL | NHS North East London Integrated Care Board | E54000029 | London | NaN | NHS North East London Integrated Care Board |
1 | E01000002 | City of London 001B | City of London 001B | POLYGON ((532746.813 181786.891, 532671.688 18... | 0.583474 | E11BB | E11BB | E11BB | City of London | Urban major conurbation | ... | E01000002 | NHS North East London ICB - A3A8R | E38000255 | SICBL | NEL | NHS North East London Integrated Care Board | E54000029 | London | NaN | NHS North East London Integrated Care Board |
2 | E01000003 | City of London 001C | City of London 001C | POLYGON ((532135.145 182198.119, 532158.250 18... | 0.147840 | E11BB | E11BB | E11BB | City of London | Urban major conurbation | ... | E01000003 | NHS North East London ICB - A3A8R | E38000255 | SICBL | NEL | NHS North East London Integrated Care Board | E54000029 | London | NaN | NHS North East London Integrated Care Board |
3 | E01000005 | City of London 001E | City of London 001E | POLYGON ((533807.946 180767.770, 533649.063 18... | 0.491918 | E11BB | E11BB | E11BB | City of London | Urban major conurbation | ... | E01000005 | NHS North East London ICB - A3A8R | E38000255 | SICBL | NEL | NHS North East London Integrated Care Board | E54000029 | London | NaN | NHS North East London Integrated Care Board |
4 | E01000006 | Barking and Dagenham 016A | Barking and Dagenham 016A | POLYGON ((545122.049 184314.931, 545271.917 18... | 0.372257 | RM70AG | RM70AG | RM70AG | Barking and Dagenham | Urban major conurbation | ... | E01000006 | NHS North East London ICB - A3A8R | E38000255 | SICBL | NEL | NHS North East London Integrated Care Board | E54000029 | London | NaN | NHS North East London Integrated Care Board |
5 rows × 25 columns
1.3 Define functions for calculating weighted mean and proportions#
Group data by specified geography (e.g. closest IVT unit) and calculate the weighted average of specified columns (this follows a commonly-used Stack Overflow recipe for group-wise weighted averages; the original link was lost in export).
def weighted_av(df, group_by, col, new_col, weight='population_all'):
    '''
    Group by one column and find both the plain mean and the weighted
    mean of another column.

    Inputs:
    - df - dataframe containing all of the columns named below
    - group_by - string, column to group by
    - col - string, column to average
    - new_col - string, name for the weighted-mean column in the result
    - weight - string, column of weights, default 'population_all'

    Returns a dataframe indexed by the group_by values, with columns
    'mean' (unweighted) and new_col (weighted).
    '''
    def _both_means(group):
        # One result row per group: plain mean and weight-adjusted mean.
        return pd.Series(
            [np.mean(group[col]),
             np.average(group[col], weights=group[weight])],
            index=['mean', new_col])

    return df.groupby(group_by).apply(_both_means)
Group data by specific geography (e.g. ambulance trust) and calculate proportion.
def find_proportion(subgroup_col, overall_col, proportion_col, group_col, df):
    '''
    Group by a geography column and compute a proportion from the summed
    numerator and denominator columns.

    Inputs:
    subgroup_col - string, numerator: population count for a subgroup
    overall_col - string, denominator: population count overall
    proportion_col - string, name for the new proportion column
    group_col - string, column to group by
    df - dataframe containing the columns above

    Returns a dataframe indexed by group with the two summed count
    columns plus the proportion column.
    '''
    # Total up numerator and denominator within each group.
    sums = df.groupby(group_col)[[subgroup_col, overall_col]].sum()
    # The ratio of the two totals is the group-level proportion.
    sums[proportion_col] = sums[subgroup_col] / sums[overall_col]
    return sums
Find counts of people in each category, then calculate proportion.
def find_count_and_proportion(df_lsoa, area, col, count, prop):
    '''
    Finds count of people in each category for each area (based on LSOA info
    and counts), and then calculates proportion in particular category.
    Requires col to contain 'True' 'False' where 'True' is the proportion you
    want to find (so calculates 'True' / 'True'+'False')

    Inputs:
    - df_lsoa - dataframe with information on each LSOA
    - area - string, column to group areas by
    - col - string, column with data of interest ('True'/'False' strings)
    - count - string, column with population count
    - prop - string, name of new column with proportion

    Returns a dataframe indexed by area with the 'True' and 'False'
    counts (renamed with a suffix derived from prop) and the proportion.
    '''
    # Find count for each area of people in each category
    counts = df_lsoa.groupby([area, col])[count].sum()
    # Reformat dataframe: one row per area, one column per category
    counts = counts.reset_index(name='n')
    counts = counts.pivot(
        index=area,
        columns=col,
        values='n').rename_axis(None, axis=1)
    # Guard: if the whole column is one category (e.g. every LSOA is
    # 'False'), the pivot has no column for the other category and the
    # proportion below would raise KeyError. Add the missing column(s).
    for category in ('True', 'False'):
        if category not in counts.columns:
            counts[category] = 0.0
    # Areas with no people in a category get NaN from the pivot - set to 0
    counts = counts.fillna(0)
    # Find proportion
    counts[prop] = counts['True'] / (counts['True'] + counts['False'])
    # Rename 'True' and 'False' columns to make their meanings clearer
    # when the data is merged into another dataframe
    # (e.g. prop='proportion_rural' gives 'rural_True'/'rural_False'):
    s = '_'.join(prop.split('_')[1:])
    counts = counts.rename(columns={'True': f'{s}_True', 'False': f'{s}_False'})
    return counts
2. Calculate new LSOA-level data#
Must create the following columns in df_lsoa:
age_65_plus_count
ethnic_group_other_than_white_british
long_term_health_count
rural
ivt_within_30
closest_hospital_is_mt
# Add count of "other than white British"
df_lsoa['ethnic_group_other_than_white_british'] = (
df_lsoa['ethnic_group_all_categories_ethnic_group'] -
df_lsoa['ethnic_group_white_english_welsh_scottish_northern_irish_british']
)
# Add column with count of long-term health problem (limited a little or alot)
df_lsoa['long_term_health_count'] = (
df_lsoa['all_categories_long_term_health_problem_or_disability'] -
df_lsoa['day_to_day_activities_not_limited']
)
# Extract if is rural or if is urban (each classification starts with that)
df_lsoa['ruc_overall'] = df_lsoa['rural_urban_2011'].str[:5]
# Add column where True if rural
df_lsoa['rural'] = (df_lsoa['ruc_overall'] == 'Rural').map({
True: 'True', False: 'False'})
# Find count of people age 65 plus
df_lsoa['age_65_plus_count'] = df_lsoa[[
'age_band_all_65', 'age_band_all_70', 'age_band_all_75',
'age_band_all_80', 'age_band_all_85', 'age_band_all_90']].sum(axis=1)
# Create column indicating if IVT unit is within 30 minutes
df_lsoa['ivt_within_30'] = (df_lsoa['closest_ivt_unit_time'] < 30).map(
{True: 'True', False: 'False'})
# Add column with whether closest hospital offers thrombectomy
df_lsoa['closest_hospital_is_mt'] = (
df_lsoa['closest_mt_transfer_time'] == 0).map({True: 'True',
False: 'False'})
3. Big function for region-level data creation#
Create one big function that does all of the calculations needed for averaging data across any given region.
def big_data_creation(df_lsoa, gdf_lsoa, group_by):
    '''
    Average / sum the LSOA-level data across all regions of one type.

    Inputs:
    - df_lsoa - dataframe of LSOA-level data; must already contain the
      derived columns listed in the pre-requisite comments below
    - gdf_lsoa - (geo)dataframe with 'polygon_area_km2' per LSOA
    - group_by - string, column whose values define the regions

    Returns a dataframe with one row per region and columns for area,
    population density, population-weighted means and proportions.
    '''
    df_lsoa = df_lsoa.copy()  # just in case - don't mutate caller's data
    gdf_lsoa = gdf_lsoa.copy()  # just in case
    # Find all of the possible options for this region type:
    index_values = list(set(df_lsoa[group_by]))
    try:
        index_values = sorted(index_values)
    except TypeError:
        # Mixed types (e.g. NaN among strings) cannot be sorted - keep
        # the unsorted order instead.
        pass
    # Empty dataframe to place the results into:
    df_results = pd.DataFrame(index=index_values)
    # Find population of each region:
    pop = df_lsoa[[group_by, 'population_all']].groupby(group_by)['population_all'].sum()
    # Find area of each region:
    gdf_lsoa = gdf_lsoa[[group_by, 'polygon_area_km2']].groupby(group_by).sum()
    gdf_lsoa = pd.merge(gdf_lsoa, pop, left_index=True, right_index=True)
    # Divide population by area to get population density
    gdf_lsoa['population_density'] = (
        gdf_lsoa['population_all'] / gdf_lsoa['polygon_area_km2'])
    df_results = pd.merge(df_results, gdf_lsoa.drop('population_all', axis='columns'),
                          left_index=True, right_index=True, how='left')

    # --- Weighted averages ---
    # Pre-requisites:
    # Must have already calculated the following columns in df_lsoa:
    # 'age_65_plus_count'
    # Each dict below is passed as keyword arguments to weighted_av().
    cols_for_weighted_av = [
        # Population-weighted mean income domain score:
        dict(col='income_domain_score',
             new_col='income_domain_weighted_mean'),
        # Population-weighted mean IMD score:
        dict(col='imd_2019_score', new_col='imd_weighted_mean'),
        # Average travel time to IVT unit weighted by number of
        # over 65 year olds in catchment:
        dict(col='closest_ivt_unit_time', new_col='weighted_ivt_time',
             weight='age_65_plus_count'),
        # Population-weighted mean time to MT unit:
        dict(col='total_mt_time', new_col='mt_time_weighted_mean'),
        # Population-weighted mean time to IVT unit:
        dict(col='closest_ivt_unit_time',
             new_col='ivt_time_weighted_mean'),
        # Population-weighted mean transfer time between IVT and MT unit:
        dict(col='closest_mt_transfer_time',
             new_col='mt_transfer_time_weighted_mean'),
    ]
    for kwargs in cols_for_weighted_av:
        # Calculate results.
        # 'weighted_av()' function returns a df with index named group_by and
        # columns named 'mean' and new_col.
        df_w = weighted_av(
            df_lsoa,
            group_by=group_by,
            **kwargs
        )
        # Store in big results df.
        # Discard 'mean', keep new_col.
        df_results = pd.merge(df_results, df_w[kwargs['new_col']],
                              left_index=True, right_index=True, how='left')

    # --- Find proportions ---
    # Pre-requisites:
    # Must have already calculated the following columns in df_lsoa:
    # 'ethnic_group_other_than_white_british'
    # 'long_term_health_count'
    # 'age_65_plus_count'
    cols_for_proportion = [
        # Ethnicity - proportion other than white British
        dict(subgroup_col='ethnic_group_other_than_white_british',
             overall_col='ethnic_group_all_categories_ethnic_group',
             proportion_col='ethnic_minority_proportion',
             df=df_lsoa),
        # General health - proportion with bad/very bad health
        dict(subgroup_col='bad_or_very_bad_health',
             overall_col='all_categories_general_health',
             proportion_col='bad_health_proportion',
             df=df_lsoa),
        # Long-term health problem or disability
        dict(subgroup_col='long_term_health_count',
             overall_col='all_categories_long_term_health_problem_or_disability',
             proportion_col='long_term_health_proportion',
             df=df_lsoa),
        # Age - proportion aged 65+
        dict(subgroup_col='age_65_plus_count',
             overall_col='population_all',
             proportion_col='age_65_plus_proportion',
             df=df_lsoa),
    ]
    # 'find_proportion()' function.
    # Notebook 3 stored the proportion_col column, discarded the rest.
    # Here, keep all resulting columns. First is number of True,
    # second is total number (True + False), third is proportion.
    for kwargs in cols_for_proportion:
        # Calculate results.
        df_p = find_proportion(group_col=group_by, **kwargs)
        # Store in big results df.
        df_results = pd.merge(df_results, df_p,
                              left_index=True, right_index=True, how='left')

    # --- Find count and proportions ---
    # Pre-requisites:
    # Must have already calculated the following columns in df_lsoa:
    # 'rural'
    # 'ivt_within_30'
    # 'closest_hospital_is_mt'
    cols_for_count_and_proportion = [
        # Rural/urban - proportion of people living in rural areas
        dict(col='rural', count='population_all', prop='proportion_rural'),
        # Proportion of 65+ year olds living within 30 minutes of their closest IVT unit
        dict(col='ivt_within_30', count='age_65_plus_count',
             prop='proportion_over_65_within_30'),
        # Proportion of patients whose nearest hospital is an MT unit
        dict(col='closest_hospital_is_mt', count='population_all',
             prop='proportion_closest_is_mt'),
    ]
    # 'find_count_and_proportion()' function.
    for kwargs in cols_for_count_and_proportion:
        # Calculate results.
        df_c = find_count_and_proportion(df_lsoa=df_lsoa, area=group_by, **kwargs)
        # Store in big results df.
        df_results = pd.merge(df_results, df_c,
                              left_index=True, right_index=True, how='left')
    return df_results
Run the big data collection function for each different region type:
# Region types to create averaged data for. Each entry is a column of
# df_lsoa whose values define the regions.
region_types = [
    'LSOA',
    'closest_ivt_unit',
    'closest_mt_unit',
    'closest_mt_transfer',
    'ambulance_service',
    # 'LAD22NM',
    'icb_code',
    'isdn',
    'lhb',
    'icb_lhb'
]

for region_type in region_types:
    df_results = big_data_creation(df_lsoa, gdf_lsoa, region_type)
    if region_type == 'closest_ivt_unit':
        # Do bonus calculations that only make sense per stroke unit.
        # Extract rate for each unit, with unit as index
        ivt_rate = (df_lsoa[['closest_ivt_unit', 'ivt_rate']]
                    .drop_duplicates()
                    .set_index('closest_ivt_unit')['ivt_rate'])
        df_results = pd.merge(df_results, ivt_rate,
                              left_index=True, right_index=True, how='left')
        # Get admissions to each unit from 2021/22
        admissions_2122 = (
            gdf_units[['Postcode', 'Admissions 21/22']]
            .set_index('Postcode')
            .rename(columns={'Admissions 21/22': 'admissions_2122'})
            .astype(int)
        )
        df_results = pd.merge(df_results, admissions_2122,
                              left_index=True, right_index=True, how='left')
    else:
        # Don't do anything extra.
        pass
    # Make sure data is not overly-precise before saving:
    cols_to_round = df_results.select_dtypes(include=['float']).columns
    df_results[cols_to_round] = df_results[cols_to_round].round(4)
    # Set index name explicitly to make sure something gets saved:
    df_results.index.name = region_type
    # Save to file:
    df_results.to_csv(f'data/collated_data_by_region/collated_data_regional_{region_type}.csv')