Collate demographic data for each LSOA#

This notebook collates various data sources and saves as collated_data.csv in data folder. For each LSOA, it finds:

  • Hospital information - yearly stroke admissions, travel time to and thrombolysis rate of the nearest IVT unit, and travel time to the nearest MT unit (with or without transfer via the nearest IVT unit)

  • Demographic information - IMD, IoD Income/IDACI/IDAOIP, ethnicity, health, rural/urban, age

For source of demographic data, see data/demographic/demographic_soucres.md

Import libraries and set file paths#

# Import required libraries
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import zipfile
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Singleton object for storing paths to data and database.

    Each attribute is annotated as `str` so that @dataclass treats it as a
    true field with a default (unannotated class attributes are ignored by
    the dataclass machinery); frozen fields give immutability, equality and
    a readable repr. All values are folder or file names used under `data`.
    '''

    # Top-level data folder
    data: str = './data'

    # Hospital data: folder, travel matrices, admissions and unit details
    hospitals: str = 'hospitals'
    travel_matrix_zip: str = 'lsoa_travel_time_matrix_calibrated.zip'
    travel_matrix_csv: str = 'lsoa_travel_time_matrix_calibrated.csv'
    admissions: str = 'admissions_2017-2019.csv'
    units: str = 'stroke_hospitals_2022.csv'
    inter_hospital: str = 'inter_hospital_time_calibrated.csv'

    # Demographic data: folder and individual source files
    demographic: str = 'demographic'
    welsh_imd: str = 'WIMD2019_Scores.csv'
    english_imd: str = 'IMD_2019.shp'
    deprivation: str = ('File_7a_-_All_IoD2019_Scores__Ranks__Deciles' +
                        '_and_Population_Denominators_3.csv')
    ethnicity: str = 'ethnicity.csv'
    health: str = 'general_health.csv'
    long_term_health: str = 'long_term_health_by_general_health.csv'
    rural_urban: str = ('Rural_Urban_Classification_2011_of_Lower_Layer_Super' +
                        '_Output_Areas_in_England_and_Wales.csv')
    age_all: str = 'mid_2020_persons.csv'
    age_female: str = 'mid_2020_females.csv'
    age_male: str = 'mid_2020_males.csv'

    # Look-up tables
    look_ups: str = 'look_ups'
    welsh_lsoa: str = 'wales_lsoa_code_name.csv'

    # Output file
    collated: str = 'collated_data.csv'


paths = Paths()

Hospital information#

Import admissions, units and travel times#

Admissions#

This is the average yearly admissions for each LSOA from 2017 to 2019.

# Import average yearly stroke admissions per LSOA (2017-2019),
# indexed by LSOA name ('area' column)
data = pd.read_csv(os.path.join(
    paths.data, paths.hospitals, paths.admissions), index_col='area')
# Standardise column name to snake_case
data.rename(columns={'Admissions': 'admissions'}, inplace=True)

# Preview first rows
data.head(2)
admissions
area
Welwyn Hatfield 010F 0.666667
Welwyn Hatfield 012A 4.000000

Units#

This has a record of information about every hospital - including whether they provide thrombolysis (IVT) and thrombectomy (MT).

# Import record of stroke units, including whether each provides
# thrombolysis (Use_IVT) and/or thrombectomy (Use_MT)
units = pd.read_csv(os.path.join(
    paths.data, paths.hospitals, paths.units))

# Show shape (rows = units, columns = attributes) and preview
display(units.shape)
units.head(2)
(142, 21)
Postcode Hospital_name Use_IVT Use_MT Use_MSU Country Strategic Clinical Network Health Board / Trust Stroke Team SSNAP name ... Thrombolysis ivt_rate Easting Northing long lat Neuroscience 30 England Thrombectomy Example hospital_city Notes
0 RM70AG RM70AG 1 1 1 England London SCN Barking Havering and Redbridge University Hospitals N... Queens Hospital Romford HASU ... 117.0 11.9 551118 187780 0.179031 51.568647 1 0 Romford NaN
1 E11BB E11BB 1 1 1 England London SCN Barts Health NHS Trust The Royal London Hospital Royal London Hospital HASU ... 115.0 13.4 534829 181798 -0.058133 51.519018 1 1 Royal London NaN

2 rows × 21 columns

LSOA to hospital travel times#

This is a matrix with travel times (in minutes) from every LSOA to every hospital.

# Unzip travel matrix folder and save the csv file to data path
travel_path = os.path.join(
    paths.data, paths.hospitals, paths.travel_matrix_zip)
with zipfile.ZipFile(travel_path, 'r') as zip_ref:
    zip_ref.extractall(os.path.join(paths.data, paths.hospitals))

# Import the csv file produced: travel times (minutes) from every LSOA
# (rows) to every hospital (columns, keyed by postcode)
travel_time = pd.read_csv(
    os.path.join(paths.data, paths.hospitals, paths.travel_matrix_csv),
    index_col='LSOA')

# Remove any leading spaces in column titles
travel_time.columns = travel_time.columns.str.lstrip()

travel_time.head(2)
B152TH B714HJ B95SS BA13NG BA214AT BB23HH BD96RJ BH77DW BL97TD BN112DH ... TN240LZ TQ27AA TR13LQ TS198PE TS43BW WD180HB WF14DG WR51DD WV100QP YO318HE
LSOA
Adur 001A 173.3 179.8 171.2 161.5 152.9 274.3 261.4 100.3 262.5 18.7 ... 89.6 212.0 291.5 310.8 306.5 92.8 242.1 178.7 186.2 267.9
Adur 001B 173.3 179.8 172.3 161.5 152.9 274.3 261.4 99.2 263.6 18.7 ... 90.6 212.0 290.4 311.9 306.5 92.8 242.1 178.7 186.2 267.9

2 rows × 125 columns

Inter-hospital travel times#

This is a matrix with travel times (in minutes) from each hospital to every other hospital.

# Import inter-hospital travel time matrix (minutes); rows are origin
# hospital postcodes, columns are destination hospital postcodes
inter_hospital_times = pd.read_csv(
    os.path.join(paths.data, paths.hospitals, paths.inter_hospital),
    index_col='from_postcode')

# Remove any leading spaces in column titles
inter_hospital_times.columns = inter_hospital_times.columns.str.lstrip()

inter_hospital_times.head(2)
B152TH B714HJ B95SS BA13NG BA214AT BB23HH BD96RJ BH77DW BL97TD BN112DH ... TN240LZ TQ27AA TR13LQ TS198PE TS43BW WD180HB WF14DG WR51DD WV100QP YO318HE
from_postcode
B152TH 0.0 18.7 18.7 107.8 155.1 117.5 135.8 173.3 106.8 176.6 ... 184.1 176.6 256.0 191.6 187.3 114.3 122.9 36.9 29.4 148.6
B714HJ 18.7 0.0 24.1 106.8 154.0 110.0 128.2 179.8 98.2 183.0 ... 184.1 176.6 255.0 188.4 184.1 111.1 119.6 35.9 21.9 145.4

2 rows × 125 columns

Get postcodes of units providing IVT or MT#

Get postcodes of unit in use for IVT

# Postcodes of hospitals flagged as providing thrombolysis (IVT)
ivt_units = units.loc[units['Use_IVT'] == 1, 'Postcode'].tolist()
display(len(ivt_units))
ivt_units[:5]
113
['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'BR68ND']

Get postcodes of units providing MT.

# Postcodes of hospitals flagged as providing thrombectomy (MT)
mt_units = units.loc[units['Use_MT'] == 1, 'Postcode'].tolist()
display(len(mt_units))
mt_units[:5]
24
['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'SW170QT']

Travel time to nearest thrombolysis unit#

Limit to units that use IVT

# Restrict the LSOA-to-hospital travel matrix to IVT-providing units only
travel_time_ivt = travel_time[ivt_units]
display(travel_time_ivt.shape)
travel_time_ivt.head(2)
(34752, 113)
RM70AG E11BB SW66SX SE59RW BR68ND HA13UJ SW170QT NW12BU DE223NE NN15BD ... LL137TD LL572PW CF144XW CF479DT CF311RQ SY231ER SA148QF SA312AF SA612PZ SA66NL
LSOA
Adur 001A 85.3 87.4 76.7 74.5 67.0 87.4 69.2 86.4 191.6 131.5 ... 248.5 321.6 192.7 217.4 206.6 297.9 238.9 255.0 287.2 230.3
Adur 001B 85.3 87.4 76.7 74.5 67.0 87.4 69.2 86.4 191.6 131.5 ... 248.5 321.6 192.7 217.4 207.7 297.9 238.9 255.0 287.2 230.3

2 rows × 113 columns

Get closest unit and time for IVT (convert series to dataframes)

# For each LSOA row: the postcode of the nearest IVT unit (idxmin over
# columns) and the corresponding travel time (min over columns), each
# converted from a Series into a single-column dataframe
closest_ivt_unit = travel_time_ivt.idxmin(axis=1).to_frame('closest_ivt_unit')
closest_ivt_unit_time = travel_time_ivt.min(axis=1).to_frame('closest_ivt_unit_time')

Merge into admissions

# Left-join closest IVT unit and its travel time onto the main dataframe,
# matching on the LSOA index (keeps every LSOA with admissions)
data = data.merge(closest_ivt_unit, left_index=True, right_index=True, how='left')
data = data.merge(closest_ivt_unit_time, left_index=True, right_index=True,  how='left')
data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7
Welwyn Hatfield 012A 4.000000 SG14AB 19.8

Travel time to nearest thrombectomy unit from home (ignoring transfers)#

Limit to units that use MT.

# Restrict the LSOA-to-hospital travel matrix to MT-providing units only
travel_time_mt = travel_time[mt_units]
travel_time_mt.head(2)
RM70AG E11BB SW66SX SE59RW SW170QT NW12BU NG72UH CB20QQ B152TH CV22DX ... NE14LP HU32JZ LS13EX S102JF BN25BE BS105NB PL68DH OX39DU SO166YD CF144XW
LSOA
Adur 001A 85.3 87.4 76.7 74.5 69.2 86.4 188.4 122.9 173.3 159.4 ... 332.3 267.9 251.7 223.8 17.6 162.6 237.8 112.1 76.7 192.7
Adur 001B 85.3 87.4 76.7 74.5 69.2 86.4 188.4 122.9 173.3 159.4 ... 333.4 267.9 251.7 224.9 18.7 162.6 237.8 112.1 76.7 192.7

2 rows × 24 columns

Get closest unit and time for MT (converting series to dataframes).

# For each LSOA row: the postcode of the nearest MT unit and the travel
# time to it, each converted from a Series into a single-column dataframe
closest_mt_unit = travel_time_mt.idxmin(axis=1).to_frame('closest_mt_unit')
closest_mt_unit_time = travel_time_mt.min(axis=1).to_frame('closest_mt_unit_time')

Merge into admissions.

# Left-join closest MT unit and its travel time onto the main dataframe,
# matching on the LSOA index
data = data.merge(closest_mt_unit, left_index=True, right_index=True, how='left')
data = data.merge(closest_mt_unit_time, left_index=True, right_index=True, how='left')
data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9

Travel time from closest IVT unit to its closest MT unit (considers transfers)#

Find MT unit that is closest to each IVT unit, and the travel time between them

# Filter to just rows of IVT units, and columns of MT units
inter_ivt = inter_hospital_times.loc[
    inter_hospital_times.index.isin(ivt_units), mt_units]
inter_ivt.head(2)
RM70AG E11BB SW66SX SE59RW SW170QT NW12BU NG72UH CB20QQ B152TH CV22DX ... NE14LP HU32JZ LS13EX S102JF BN25BE BS105NB PL68DH OX39DU SO166YD CF144XW
from_postcode
B152TH 143.3 140.0 128.2 141.1 141.1 130.4 69.2 122.9 0.0 42.3 ... 213.1 147.6 131.5 104.6 178.7 92.8 203.4 81.0 149.7 116.4
B714HJ 140.0 137.9 134.7 141.1 148.6 129.3 65.9 119.6 18.7 39.1 ... 209.9 145.4 129.3 101.4 185.2 91.7 202.3 87.4 156.2 116.4

2 rows × 24 columns

# Find nearest MT unit to each IVT unit and the travel times
# (idxmin gives the postcode of the minimum column, min gives its time)
ivt_nearest_mt = pd.DataFrame(
    {'closest_mt_transfer': inter_ivt.idxmin(axis=1),
     'closest_mt_transfer_time': inter_ivt.min(axis=1)})
ivt_nearest_mt.head(2)
closest_mt_transfer closest_mt_transfer_time
from_postcode
B152TH B152TH 0.0
B714HJ B152TH 18.7

Merge into admissions (based on closest IVT unit for each LSOA)

# Merge transfer destination and time into data, keyed on each LSOA's
# closest IVT unit (not on the index)
data = pd.merge(data, ivt_nearest_mt, how='left', left_on='closest_ivt_unit', right_index=True)

Add column with total time (for travel from LSOA to closest IVT to then the MT closest to that)

# Total MT time via transfer: LSOA -> closest IVT unit -> that unit's
# closest MT unit
data['total_mt_time'] = (data['closest_ivt_unit_time'] +
                         data['closest_mt_transfer_time'])
data.head(5)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9
Welwyn Hatfield 002F 2.000000 SG14AB 18.7 NW12BU 38.0 CB20QQ 39.1 57.8
Welwyn Hatfield 002E 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8
Welwyn Hatfield 010A 3.333333 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8

Thrombolysis rate of closest IVT unit#

# Get index from data (which is originally index from admissions file);
# stored so it can be restored after merges that drop the LSOA index
lsoa_index = data.index
lsoa_index
Index(['Welwyn Hatfield 010F', 'Welwyn Hatfield 012A', 'Welwyn Hatfield 002F',
       'Welwyn Hatfield 002E', 'Welwyn Hatfield 010A', 'Welwyn Hatfield 010B',
       'Welwyn Hatfield 012B', 'Welwyn Hatfield 011D', 'Welwyn Hatfield 011A',
       'Welwyn Hatfield 012C',
       ...
       'Cardiff 005F', 'Cardiff 049D', 'Cardiff 049E', 'Cardiff 049F',
       'Cardiff 005G', 'Cardiff 006F', 'Swansea 025F', 'Swansea 023E',
       'Swansea 025G', 'Swansea 025H'],
      dtype='object', name='area', length=34752)
# Get postcode and IVT rate of units that offer IVT
mask = units['Use_IVT'] == 1
ivt_rate = units[mask][['Postcode', 'ivt_rate']]

# Merge into admissions (adding rate for closest IVT unit to each LSOA);
# merging on a column resets the index, hence the set_index below
data = data.merge(
    ivt_rate, left_on='closest_ivt_unit', right_on='Postcode', how='left')

# Drop the postcode column (duplicates closest_IVT_unit)
data.drop('Postcode', axis=1, inplace=True)

# Replace index (as was lost when merged)
data.set_index(lsoa_index, inplace=True)
data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8

Demographic information#

Index of multiple deprivation (IMD)#

These are imported separately:

They are then joined and added to the main dataframe.

# Import common Welsh LSOA look-up (maps Welsh LSOA code to the common
# LSOA name used as the index of the main dataframe)
wales_lsoa = pd.read_csv(
    os.path.join(paths.data, paths.look_ups, paths.welsh_lsoa),
    index_col='LSOA11Code')

# Import welsh IMD scores (indexed by Welsh LSOA code)
imd_wales = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.welsh_imd),
    index_col='LSOA_Code')

# Add common LSOA name to the dataframe with Welsh IMD scores
imd_wales = pd.merge(imd_wales, wales_lsoa,
                     left_index=True, right_index=True, how='left')

# Use common LSOA name as index
imd_wales.set_index('lsoa11name', inplace=True)

# Set index as LSOA, then just keep the index and IMD column (renamed)
imd_wales = imd_wales[['WIMD2019_Score']]
imd_wales = imd_wales.rename(columns={'WIMD2019_Score': 'imd_2019_score'})

imd_wales.head()
imd_2019_score
lsoa11name
Isle of Anglesey 007A 19.2
Isle of Anglesey 007B 12.2
Isle of Anglesey 001A 26.8
Isle of Anglesey 001B 15.5
Isle of Anglesey 005A 16.0
# Display the constructed path to the English IMD shapefile (sanity check)
os.path.join(paths.data, paths.demographic, paths.english_imd)
'./data/demographic/IMD_2019.shp'
# Import dataframe with English LSOA and IMD from shapefile
# (EPSG:27700 is the British National Grid projection)
imd_eng = gpd.read_file(
    os.path.join(paths.data, paths.demographic, paths.english_imd),
    crs='EPSG:27700')

# Set index as LSOA, then just keep the index and IMD column (renamed)
imd_eng.set_index('lsoa11nm', inplace=True)
imd_eng = imd_eng[['IMDScore']]
imd_eng = imd_eng.rename(columns={'IMDScore': 'imd_2019_score'})

imd_eng.head()
imd_2019_score
lsoa11nm
City of London 001A 6.208
City of London 001B 5.143
City of London 001C 19.402
City of London 001E 28.652
Barking and Dagenham 016A 19.837
# Create list containing the two dataframes (England then Wales)
gdf_list = [imd_eng, imd_wales]

# Combine two dataframes and convert into GeoDataFrame
imd_eng_wales = gpd.GeoDataFrame(pd.concat(gdf_list))

# Merge on index (which is the LSOA name)
data = data.merge(imd_eng_wales, left_index=True, right_index=True,  how='left')

data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313

Deprivation#

The Indices of Deprivation (IoD) is calculated from thirty indicators grouped into seven domains. One such domain is Income.

  • Income domain - proportion of population experiencing deprivation relating to low levels of income

  • Income Deprivation Affecting Children Index (IDACI) - proportion of children aged 0-15 living in income deprived households - subset of Income Deprivation domain

  • Income Deprivation Affecting Older People Index (IDAOPI) - proportion of adults aged 60 or older who experience income deprivation - subset of Income Deprivation domain

# Import deprivation data (IoD income-domain scores and ranks), indexed
# by LSOA name
deprivation = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.deprivation),
    index_col='LSOA name (2011)')

# Drop unwanted columns (code columns not needed once named index is set)
cols_to_drop = ['LSOA Code (2011)','Local Authority District code (2019)']
deprivation.drop(cols_to_drop, axis=1, inplace=True)

# Rename columns to snake_case
deprivation = deprivation.rename(columns={
    'Local Authority District name (2019)': 'la_district_name_2019',
    'Income Domain Score': 'income_domain_score',
    'Income Domain Rank (where 1 is most deprived)': 'income_domain_rank',
    'IDACI Score': 'idaci_score',
    'IDACI Rank (where 1 is most deprived)': 'idaci_rank',
    'IDAOPI Score': 'idaopi_score',
    'IDAOPI Rank (where 1 is most deprived)': 'idaopi_rank'})

deprivation.head(2)
la_district_name_2019 income_domain_score income_domain_rank idaci_score idaci_rank idaopi_score idaopi_rank
LSOA name (2011)
City of London 001A City of London 0.007 34740 0.006 34715 0.012 34729
City of London 001B City of London 0.034 31762 0.037 31529 0.030 33839
# Merge into data (left join on LSOA name index)
data = data.merge(deprivation, left_index=True, right_index=True,  how='left')

# Reset index back to LSOA (as lost when merged)
data.set_index(lsoa_index, inplace=True)

data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score la_district_name_2019 income_domain_score income_domain_rank idaci_score idaci_rank idaopi_score idaopi_rank
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 Welwyn Hatfield 0.104 17062 0.125 17700 0.186 11760
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 Welwyn Hatfield 0.176 9058 0.270 6329 0.269 6337

Ethnicity and health#

This is the number of people in each ethnic or health group in each LSOA.

def clean_ethnic_or_health(path, prefix, suffix, data, lsoa_index=None):
    '''
    Cleans an ethnicity or health census extract and merges it into the
    main dataframe.

    Inputs:
    - path: string - filename of the dataset within the demographic folder
    - prefix: string - prefix identifying (and stripped from) the wanted
      column names
    - suffix: string - suffix identifying (and stripped from) the wanted
      column names
    - data: dataframe - combined hospital and demographic results by LSOA
    - lsoa_index: index or None - index to restore after the merge; defaults
      to the current index of `data` (the LSOA names), which matches the
      module-level `lsoa_index` at every call site
    Outputs:
    - data: dataframe - with addition of ethnicity or health columns
    '''
    # Default to the LSOA index already on the main dataframe rather than
    # relying on a module-level global
    if lsoa_index is None:
        lsoa_index = data.index

    # Import data, indexed by LSOA name
    df = pd.read_csv(
        os.path.join(paths.data, paths.demographic, path),
        index_col='geography')

    # Keep only columns carrying both the requested prefix and suffix
    # (e.g. the "all persons, all ages" counts); copy so renaming below
    # cannot affect the source frame
    cols_to_include = [c for c in df.columns if prefix in c and suffix in c]
    extract = df[cols_to_include].copy()

    # Strip the prefix/suffix and normalise to snake_case column names
    col_names = (extract.columns
                 .str.removeprefix(prefix)
                 .str.removesuffix(suffix)
                 .str.lower()
                 .str.replace(' ', '_')
                 .str.replace('/', '_')
                 .str.replace('-', '_')
                 .str.replace(':', ''))
    extract.columns = col_names

    # Merge into the main dataframe on LSOA name, then restore the index
    # (lost on merge)
    data = data.merge(extract, left_index=True, right_index=True, how='left')
    data.set_index(lsoa_index, inplace=True)

    return data
# Add ethnicity data (counts per ethnic group, all persons and all ages;
# prefix/suffix select and then strip the census column decoration)
data = clean_ethnic_or_health(
    path=paths.ethnicity,
    prefix='Sex: All persons; Age: All categories: Age; ',
    suffix='; measures: Value',
    data=data)

data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... ethnic_group_asian_asian_british_bangladeshi ethnic_group_asian_asian_british_chinese ethnic_group_asian_asian_british_other_asian ethnic_group_black_african_caribbean_black_british_total ethnic_group_black_african_caribbean_black_british_african ethnic_group_black_african_caribbean_black_british_caribbean ethnic_group_black_african_caribbean_black_british_other_black ethnic_group_other_ethnic_group_total ethnic_group_other_ethnic_group_arab ethnic_group_other_ethnic_group_any_other_ethnic_group
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 14 23 35 105 73 30 2 14 6 8
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 28 37 23 151 127 11 13 25 11 14

2 rows × 41 columns

Health is general health from the 2011 census. Each person in the household was asked to rate their health in general; the possible responses were ‘Very good’, ‘Good’, ‘Fair’, ‘Bad’ and ‘Very bad’. For more information, see ONS report on 2011 general health.

# Add general health data (counts per self-rated health category, all
# persons, all ages, all ethnic groups)
data = clean_ethnic_or_health(
    path=paths.health,
    prefix='Sex: All persons; Age: All categories: Age; General Health: ',
    suffix='; Ethnic Group: All categories: Ethnic group; measures: Value',
    data=data)

data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... ethnic_group_black_african_caribbean_black_british_african ethnic_group_black_african_caribbean_black_british_caribbean ethnic_group_black_african_caribbean_black_british_other_black ethnic_group_other_ethnic_group_total ethnic_group_other_ethnic_group_arab ethnic_group_other_ethnic_group_any_other_ethnic_group all_categories_general_health very_good_or_good_health fair_health bad_or_very_bad_health
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 73 30 2 14 6 8 1523 1284 175 64
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 127 11 13 25 11 14 1637 1269 265 103

2 rows × 45 columns

Long-term health problem or disability is from the 2011 census. People were asked “Are your day-to-day activities limited because of a health problem or disability which has lasted, or is expected to last, at least 12 months, include problems related to old age”. Respondents could choose not, or “limited a little”, or “limited a lot”. For more information, see this surrey report on general health and long-term health.

# Add long-term health problem or disability data (counts per level of
# day-to-day activity limitation, all persons and ages)
data = clean_ethnic_or_health(
    path=paths.long_term_health,
    prefix='Sex: All persons; Age: All categories: Age; Disability: ',
    suffix='; General Health: All categories: General health; measures: Value',
    data=data)

data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... ethnic_group_other_ethnic_group_arab ethnic_group_other_ethnic_group_any_other_ethnic_group all_categories_general_health very_good_or_good_health fair_health bad_or_very_bad_health all_categories_long_term_health_problem_or_disability day_to_day_activities_limited_a_lot day_to_day_activities_limited_a_little day_to_day_activities_not_limited
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 6 8 1523 1284 175 64 1523 81 113 1329
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 11 14 1637 1269 265 103 1559 118 129 1312

2 rows × 49 columns

Rural vs. urban#

# Import rural/urban classification, indexed by LSOA name
rural_urban = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.rural_urban),
    index_col='LSOA11NM')

# Drop unwanted columns (codes and feature ID)
cols_to_drop = ['LSOA11CD','RUC11CD', 'FID']
rural_urban.drop(cols_to_drop, axis=1, inplace=True)

# Rename column to snake_case
rural_urban = rural_urban.rename(columns={'RUC11': 'rural_urban_2011'})

# Merge on LSOA name, then restore the index (lost on merge)
data = data.merge(rural_urban, left_index=True, right_index=True,  how='left')
data.set_index(lsoa_index, inplace=True)

data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... ethnic_group_other_ethnic_group_any_other_ethnic_group all_categories_general_health very_good_or_good_health fair_health bad_or_very_bad_health all_categories_long_term_health_problem_or_disability day_to_day_activities_limited_a_lot day_to_day_activities_limited_a_little day_to_day_activities_not_limited rural_urban_2011
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 8 1523 1284 175 64 1523 81 113 1329 Urban city and town
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 14 1637 1269 265 103 1559 118 129 1312 Urban city and town

2 rows × 50 columns

Age#

When using these age columns, be careful to ensure that you remember how they were created - e.g.

  • age_band_75 is count of ages 75 to 79

  • age_band_80 is count of ages 80 to 84

  • age_band_85 is count of ages 85 to 89

  • age_band_90 is count of everyone aged 90 or above

def age_bands(file_path, data, group, all_col, lsoa_index=None):
    '''
    Finds count of individuals in each five-year age band in each LSOA,
    using data provided (which has all persons, all males or all females),
    and adds these columns to data.

    Bands run 0-4, 5-9, ... 85-89; the final band (labelled 90) holds
    everyone aged 90 or above, because the source data ends in a '90+'
    column.

    Input:
    - file_path: string - path to csv of single-year age counts by LSOA
      (must have 'LSOA Name', 'All Ages', '0'...'89' and '90+' columns)
    - data: dataframe - contains hospital and demographic information by LSOA
    - group: string - group of people who the age data is from
      (used in the new column names, e.g. 'all', 'females', 'males')
    - all_col: string - name of new column holding the all-ages total
    - lsoa_index: index or None - index to restore after the merge; defaults
      to the current index of `data` (avoids snapshotting a module-level
      global at definition time)
    Output:
    - data: dataframe - as before, but with added age counts
    '''
    if lsoa_index is None:
        lsoa_index = data.index

    # Import age data, setting index as LSOA
    ages = pd.read_csv(file_path, index_col='LSOA Name')

    # Save total person count to data (aligned on the LSOA index)
    data[all_col] = ages['All Ages']

    # Drop 'All Ages' so only single-year columns remain
    ages = ages.drop('All Ages', axis=1)

    # Treat the open-ended '90+' column as age 90
    ages = ages.rename({'90+': '90'}, axis='columns')

    # Sum the single-year columns into five-year bands (0, 5, ... 90)
    bands = pd.DataFrame()
    for band in range(0, 91, 5):
        # Ages in this band, e.g. band 10 -> '10'..'14'; band 90 -> '90'
        cols_to_get = [str(age) for age in range(band, min(band + 5, 91))]
        bands[f'age_band_{group}_{band}'] = ages[cols_to_get].sum(axis=1)

    # Merge age band sums with data on LSOA, then restore the index
    # (lost on merge)
    data = data.merge(bands, left_index=True, right_index=True, how='left')
    data.set_index(lsoa_index, inplace=True)

    return data
# For all people, add count in each age band (plus 'population_all' total)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_all),
    data, 'all', 'population_all')
data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... age_band_all_45 age_band_all_50 age_band_all_55 age_band_all_60 age_band_all_65 age_band_all_70 age_band_all_75 age_band_all_80 age_band_all_85 age_band_all_90
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 88 115 102 74 72 55 44 22 10 9
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 117 122 127 87 47 45 39 29 20 23

2 rows × 70 columns

# For all females, add count in each age band (plus 'population_females')
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_female),
    data, 'females', 'population_females')
data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... age_band_females_45 age_band_females_50 age_band_females_55 age_band_females_60 age_band_females_65 age_band_females_70 age_band_females_75 age_band_females_80 age_band_females_85 age_band_females_90
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 41 49 43 46 39 27 18 8 5 6
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 55 62 47 34 23 27 18 17 15 19

2 rows × 90 columns

# For all males, add count in each age band (plus 'population_males')
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_male),
    data, 'males', 'population_males')
data.head(2)
admissions closest_ivt_unit closest_ivt_unit_time closest_mt_unit closest_mt_unit_time closest_mt_transfer closest_mt_transfer_time total_mt_time ivt_rate imd_2019_score ... age_band_males_45 age_band_males_50 age_band_males_55 age_band_males_60 age_band_males_65 age_band_males_70 age_band_males_75 age_band_males_80 age_band_males_85 age_band_males_90
area
Welwyn Hatfield 010F 0.666667 SG14AB 18.7 NW12BU 36.9 CB20QQ 39.1 57.8 6.8 15.616 ... 47 66 59 28 33 28 26 14 5 3
Welwyn Hatfield 012A 4.000000 SG14AB 19.8 NW12BU 36.9 CB20QQ 39.1 58.9 6.8 33.313 ... 62 60 80 53 24 18 21 12 5 4

2 rows × 110 columns

Save output#

# Save the collated dataframe, one row per LSOA, to the data folder
data.to_csv(os.path.join(paths.data, paths.collated), index_label='LSOA')