Collate demographic data for each LSOA#
This notebook collates various data sources and saves as collated_data.csv
in data folder. For each LSOA, it finds:
Hospital information - yearly stroke admissions, travel time to and thrombolysis rate of nearest IVT, and travel time to nearest IVT (considering transfers or not)
Demographic information - IMD, IoD Income/IDACI/IDAOIP, ethnicity, health, rural/urban, age
For source of demographic data, see data/demographic/demographic_soucres.md
Import libraries and set file paths#
# Import required libraries
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import zipfile
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Singleton object for storing paths to data and database.'''
    # NOTE(review): attributes below are unannotated, so the dataclass has
    # no fields - they behave as shared class-level constants
    # Top-level data folder and hospital sub-folder
    data = './data'
    hospitals = 'hospitals'
    # Hospital / travel-time inputs
    travel_matrix_zip = 'lsoa_travel_time_matrix_calibrated.zip'
    travel_matrix_csv = 'lsoa_travel_time_matrix_calibrated.csv'
    admissions = 'admissions_2017-2019.csv'
    units = 'stroke_hospitals_2022.csv'
    inter_hospital = 'inter_hospital_time_calibrated.csv'
    # Demographic inputs
    demographic = 'demographic'
    welsh_imd = 'WIMD2019_Scores.csv'
    english_imd = 'IMD_2019.shp'
    deprivation = ('File_7a_-_All_IoD2019_Scores__Ranks__Deciles' +
                   '_and_Population_Denominators_3.csv')
    ethnicity = 'ethnicity.csv'
    health = 'general_health.csv'
    long_term_health = 'long_term_health_by_general_health.csv'
    rural_urban = ('Rural_Urban_Classification_2011_of_Lower_Layer_Super' +
                   '_Output_Areas_in_England_and_Wales.csv')
    age_all = 'mid_2020_persons.csv'
    age_female = 'mid_2020_females.csv'
    age_male = 'mid_2020_males.csv'
    # Look-up files and collated output
    look_ups = 'look_ups'
    welsh_lsoa = 'wales_lsoa_code_name.csv'
    collated = 'collated_data.csv'


paths = Paths()
Hospital information#
Import admissions, units and travel times#
Admissions#
This is the average yearly admissions for each LSOA from 2017 to 2019.
# Load average yearly stroke admissions per LSOA, indexed by LSOA name
admissions_path = os.path.join(paths.data, paths.hospitals, paths.admissions)
data = pd.read_csv(admissions_path, index_col='area').rename(
    columns={'Admissions': 'admissions'})
data.head(2)
admissions | |
---|---|
area | |
Welwyn Hatfield 010F | 0.666667 |
Welwyn Hatfield 012A | 4.000000 |
Units#
This has a record of information about every hospital - including whether they provide thrombolysis (IVT) and thrombectomy (MT).
# Load the record of stroke units (including IVT/MT provision flags)
units_path = os.path.join(paths.data, paths.hospitals, paths.units)
units = pd.read_csv(units_path)
display(units.shape)
units.head(2)
(142, 21)
Postcode | Hospital_name | Use_IVT | Use_MT | Use_MSU | Country | Strategic Clinical Network | Health Board / Trust | Stroke Team | SSNAP name | ... | Thrombolysis | ivt_rate | Easting | Northing | long | lat | Neuroscience | 30 England Thrombectomy Example | hospital_city | Notes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RM70AG | RM70AG | 1 | 1 | 1 | England | London SCN | Barking | Havering and Redbridge University Hospitals N... | Queens Hospital Romford HASU | ... | 117.0 | 11.9 | 551118 | 187780 | 0.179031 | 51.568647 | 1 | 0 | Romford | NaN |
1 | E11BB | E11BB | 1 | 1 | 1 | England | London SCN | Barts Health NHS Trust | The Royal London Hospital | Royal London Hospital HASU | ... | 115.0 | 13.4 | 534829 | 181798 | -0.058133 | 51.519018 | 1 | 1 | Royal London | NaN |
2 rows × 21 columns
LSOA to hospital travel times#
This is a matrix with travel times (in minutes) from every LSOA to every hospital.
# Unzip travel matrix folder and save the csv file to data path
travel_path = os.path.join(
    paths.data, paths.hospitals, paths.travel_matrix_zip)
with zipfile.ZipFile(travel_path, 'r') as zip_ref:
    zip_ref.extractall(os.path.join(paths.data, paths.hospitals))
# Import the csv file produced (LSOA rows x hospital-postcode columns)
travel_time = pd.read_csv(
    os.path.join(paths.data, paths.hospitals, paths.travel_matrix_csv),
    index_col='LSOA')
# Remove any leading spaces in column titles (hospital postcodes)
travel_time.columns = travel_time.columns.str.lstrip()
travel_time.head(2)
B152TH | B714HJ | B95SS | BA13NG | BA214AT | BB23HH | BD96RJ | BH77DW | BL97TD | BN112DH | ... | TN240LZ | TQ27AA | TR13LQ | TS198PE | TS43BW | WD180HB | WF14DG | WR51DD | WV100QP | YO318HE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LSOA | |||||||||||||||||||||
Adur 001A | 173.3 | 179.8 | 171.2 | 161.5 | 152.9 | 274.3 | 261.4 | 100.3 | 262.5 | 18.7 | ... | 89.6 | 212.0 | 291.5 | 310.8 | 306.5 | 92.8 | 242.1 | 178.7 | 186.2 | 267.9 |
Adur 001B | 173.3 | 179.8 | 172.3 | 161.5 | 152.9 | 274.3 | 261.4 | 99.2 | 263.6 | 18.7 | ... | 90.6 | 212.0 | 290.4 | 311.9 | 306.5 | 92.8 | 242.1 | 178.7 | 186.2 | 267.9 |
2 rows × 125 columns
Inter-hospital travel times#
This is a matrix with travel times (in minutes) from each hospital to every other hospital.
# Import inter-hospital travel times (hospital rows x hospital columns)
inter_hospital_times = pd.read_csv(
    os.path.join(paths.data, paths.hospitals, paths.inter_hospital),
    index_col='from_postcode')
# Remove any leading spaces in column titles (hospital postcodes)
inter_hospital_times.columns = inter_hospital_times.columns.str.lstrip()
inter_hospital_times.head(2)
B152TH | B714HJ | B95SS | BA13NG | BA214AT | BB23HH | BD96RJ | BH77DW | BL97TD | BN112DH | ... | TN240LZ | TQ27AA | TR13LQ | TS198PE | TS43BW | WD180HB | WF14DG | WR51DD | WV100QP | YO318HE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
from_postcode | |||||||||||||||||||||
B152TH | 0.0 | 18.7 | 18.7 | 107.8 | 155.1 | 117.5 | 135.8 | 173.3 | 106.8 | 176.6 | ... | 184.1 | 176.6 | 256.0 | 191.6 | 187.3 | 114.3 | 122.9 | 36.9 | 29.4 | 148.6 |
B714HJ | 18.7 | 0.0 | 24.1 | 106.8 | 154.0 | 110.0 | 128.2 | 179.8 | 98.2 | 183.0 | ... | 184.1 | 176.6 | 255.0 | 188.4 | 184.1 | 111.1 | 119.6 | 35.9 | 21.9 | 145.4 |
2 rows × 125 columns
Get postcodes of units providing IVT or MT#
Get postcodes of units in use for IVT.
# Postcodes of all units currently providing thrombolysis (IVT)
ivt_units = units.loc[units['Use_IVT'] == 1, 'Postcode'].tolist()
display(len(ivt_units))
ivt_units[:5]
113
['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'BR68ND']
Get postcodes of units providing MT.
# Postcodes of all units currently providing thrombectomy (MT)
mt_units = units.loc[units['Use_MT'] == 1, 'Postcode'].tolist()
display(len(mt_units))
mt_units[:5]
24
['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'SW170QT']
Travel time to nearest thrombolysis unit#
Limit to units that use IVT
# Keep only the travel-time columns for units that provide IVT
travel_time_ivt = travel_time[ivt_units]
display(travel_time_ivt.shape)
travel_time_ivt.head(2)
(34752, 113)
RM70AG | E11BB | SW66SX | SE59RW | BR68ND | HA13UJ | SW170QT | NW12BU | DE223NE | NN15BD | ... | LL137TD | LL572PW | CF144XW | CF479DT | CF311RQ | SY231ER | SA148QF | SA312AF | SA612PZ | SA66NL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LSOA | |||||||||||||||||||||
Adur 001A | 85.3 | 87.4 | 76.7 | 74.5 | 67.0 | 87.4 | 69.2 | 86.4 | 191.6 | 131.5 | ... | 248.5 | 321.6 | 192.7 | 217.4 | 206.6 | 297.9 | 238.9 | 255.0 | 287.2 | 230.3 |
Adur 001B | 85.3 | 87.4 | 76.7 | 74.5 | 67.0 | 87.4 | 69.2 | 86.4 | 191.6 | 131.5 | ... | 248.5 | 321.6 | 192.7 | 217.4 | 207.7 | 297.9 | 238.9 | 255.0 | 287.2 | 230.3 |
2 rows × 113 columns
Get closest unit and time for IVT (convert series to dataframes)
# Closest IVT unit (postcode) and its travel time for each LSOA,
# converted from series to single-column dataframes ready for merging
closest_ivt_unit = travel_time_ivt.idxmin(axis=1).to_frame('closest_ivt_unit')
closest_ivt_unit_time = travel_time_ivt.min(axis=1).to_frame('closest_ivt_unit_time')
Merge into admissions
# Add the closest-IVT-unit columns to the admissions dataframe,
# matching on the shared LSOA index
data = data.join(closest_ivt_unit, how='left')
data = data.join(closest_ivt_unit_time, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | |
---|---|---|---|
area | |||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 |
Travel time to nearest thrombectomy unit from home (ignoring transfers)#
Limit to units that use MT.
# Keep only the travel-time columns for units that provide MT
travel_time_mt = travel_time[mt_units]
travel_time_mt.head(2)
RM70AG | E11BB | SW66SX | SE59RW | SW170QT | NW12BU | NG72UH | CB20QQ | B152TH | CV22DX | ... | NE14LP | HU32JZ | LS13EX | S102JF | BN25BE | BS105NB | PL68DH | OX39DU | SO166YD | CF144XW | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LSOA | |||||||||||||||||||||
Adur 001A | 85.3 | 87.4 | 76.7 | 74.5 | 69.2 | 86.4 | 188.4 | 122.9 | 173.3 | 159.4 | ... | 332.3 | 267.9 | 251.7 | 223.8 | 17.6 | 162.6 | 237.8 | 112.1 | 76.7 | 192.7 |
Adur 001B | 85.3 | 87.4 | 76.7 | 74.5 | 69.2 | 86.4 | 188.4 | 122.9 | 173.3 | 159.4 | ... | 333.4 | 267.9 | 251.7 | 224.9 | 18.7 | 162.6 | 237.8 | 112.1 | 76.7 | 192.7 |
2 rows × 24 columns
Get closest unit and time for MT (converting series to dataframes).
# Closest MT unit (postcode) and its travel time for each LSOA,
# converted from series to single-column dataframes ready for merging
closest_mt_unit = travel_time_mt.idxmin(axis=1).to_frame('closest_mt_unit')
closest_mt_unit_time = travel_time_mt.min(axis=1).to_frame('closest_mt_unit_time')
Merge into admissions.
# Add the closest-MT-unit columns to the admissions dataframe,
# matching on the shared LSOA index
data = data.join(closest_mt_unit, how='left')
data = data.join(closest_mt_unit_time, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | |
---|---|---|---|---|---|
area | |||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 |
Travel time from closest IVT unit to its closest MT unit (considers transfers)#
Find MT unit that is closest to each IVT unit, and the travel time between them
# Filter the inter-hospital matrix to rows of IVT units and columns of
# MT units (i.e. candidate transfer journeys)
ivt_rows = inter_hospital_times.index.isin(ivt_units)
inter_ivt = inter_hospital_times.loc[ivt_rows, mt_units]
inter_ivt.head(2)
RM70AG | E11BB | SW66SX | SE59RW | SW170QT | NW12BU | NG72UH | CB20QQ | B152TH | CV22DX | ... | NE14LP | HU32JZ | LS13EX | S102JF | BN25BE | BS105NB | PL68DH | OX39DU | SO166YD | CF144XW | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
from_postcode | |||||||||||||||||||||
B152TH | 143.3 | 140.0 | 128.2 | 141.1 | 141.1 | 130.4 | 69.2 | 122.9 | 0.0 | 42.3 | ... | 213.1 | 147.6 | 131.5 | 104.6 | 178.7 | 92.8 | 203.4 | 81.0 | 149.7 | 116.4 |
B714HJ | 140.0 | 137.9 | 134.7 | 141.1 | 148.6 | 129.3 | 65.9 | 119.6 | 18.7 | 39.1 | ... | 209.9 | 145.4 | 129.3 | 101.4 | 185.2 | 91.7 | 202.3 | 87.4 | 156.2 | 116.4 |
2 rows × 24 columns
# Find nearest MT unit to each IVT unit and the travel time between them
ivt_nearest_mt = pd.DataFrame(
    {'closest_mt_transfer': inter_ivt.idxmin(axis=1),
     'closest_mt_transfer_time': inter_ivt.min(axis=1)})
ivt_nearest_mt.head(2)
closest_mt_transfer | closest_mt_transfer_time | |
---|---|---|
from_postcode | ||
B152TH | B152TH | 0.0 |
B714HJ | B152TH | 18.7 |
Merge into admissions (based on closest IVT unit for each LSOA)
data = pd.merge(data, ivt_nearest_mt, how='left', left_on='closest_ivt_unit', right_index=True)
Add column with total time (for travel from LSOA to closest IVT to then the MT closest to that)
# Total transfer-pathway time: LSOA -> closest IVT unit -> that unit's
# closest MT unit
data['total_mt_time'] = (data['closest_ivt_unit_time'] +
                         data['closest_mt_transfer_time'])
data.head(5)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | |
---|---|---|---|---|---|---|---|---|
area | ||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 |
Welwyn Hatfield 002F | 2.000000 | SG14AB | 18.7 | NW12BU | 38.0 | CB20QQ | 39.1 | 57.8 |
Welwyn Hatfield 002E | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 |
Welwyn Hatfield 010A | 3.333333 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 |
Thrombolysis rate of closest IVT unit#
# Get index from data (which is originally index from admissions file);
# reused below to restore the index after merges that would discard it
lsoa_index = data.index
lsoa_index
Index(['Welwyn Hatfield 010F', 'Welwyn Hatfield 012A', 'Welwyn Hatfield 002F',
'Welwyn Hatfield 002E', 'Welwyn Hatfield 010A', 'Welwyn Hatfield 010B',
'Welwyn Hatfield 012B', 'Welwyn Hatfield 011D', 'Welwyn Hatfield 011A',
'Welwyn Hatfield 012C',
...
'Cardiff 005F', 'Cardiff 049D', 'Cardiff 049E', 'Cardiff 049F',
'Cardiff 005G', 'Cardiff 006F', 'Swansea 025F', 'Swansea 023E',
'Swansea 025G', 'Swansea 025H'],
dtype='object', name='area', length=34752)
# Get postcode and IVT rate of units that offer IVT, keyed by postcode
# so the merge below can match on the right-hand index
mask = units['Use_IVT'] == 1
ivt_rate = units[mask][['Postcode', 'ivt_rate']].set_index('Postcode')
# Merge into admissions (adding rate for closest IVT unit to each LSOA).
# Merging with right_index=True preserves data's LSOA index, so there is
# no duplicate Postcode column to drop and no need to reassign the index
data = data.merge(
    ivt_rate, left_on='closest_ivt_unit', right_index=True, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | |
---|---|---|---|---|---|---|---|---|---|
area | |||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 |
Demographic information#
Index of multiple deprivation (IMD)#
These are imported separately:
English - IMD scores extracted from shapefile
Welsh - csv file with IMD scores has uncommon LSOA names, so the common LSOA names are added (as from ‘wales_lsoa_code_name.csv’ which was created from https://datamap.gov.wales/layers/appdata-ons:lsoa_wales_2011)
They are then joined and added to the main dataframe.
# Import common Welsh LSOA names, keyed by LSOA code
wales_lsoa = pd.read_csv(
    os.path.join(paths.data, paths.look_ups, paths.welsh_lsoa),
    index_col='LSOA11Code')
# Import welsh IMD scores, keyed by the same LSOA code
imd_wales = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.welsh_imd),
    index_col='LSOA_Code')
# Add common LSOA name to the dataframe with Welsh IMD scores
imd_wales = pd.merge(imd_wales, wales_lsoa,
                     left_index=True, right_index=True, how='left')
# Use common LSOA name as index
imd_wales.set_index('lsoa11name', inplace=True)
# Just keep the IMD column, renamed to match the English data below
imd_wales = imd_wales[['WIMD2019_Score']]
imd_wales = imd_wales.rename(columns={'WIMD2019_Score': 'imd_2019_score'})
imd_wales.head()
imd_2019_score | |
---|---|
lsoa11name | |
Isle of Anglesey 007A | 19.2 |
Isle of Anglesey 007B | 12.2 |
Isle of Anglesey 001A | 26.8 |
Isle of Anglesey 001B | 15.5 |
Isle of Anglesey 005A | 16.0 |
os.path.join(paths.data, paths.demographic, paths.english_imd)
'./data/demographic/IMD_2019.shp'
# Import dataframe with English LSOA and IMD from the shapefile
# (EPSG:27700 is the British National Grid)
imd_eng = gpd.read_file(
    os.path.join(paths.data, paths.demographic, paths.english_imd),
    crs='EPSG:27700')
# Set index as LSOA name, then just keep the IMD score column (renamed)
imd_eng.set_index('lsoa11nm', inplace=True)
imd_eng = imd_eng[['IMDScore']]
imd_eng = imd_eng.rename(columns={'IMDScore': 'imd_2019_score'})
imd_eng.head()
imd_2019_score | |
---|---|
lsoa11nm | |
City of London 001A | 6.208 |
City of London 001B | 5.143 |
City of London 001C | 19.402 |
City of London 001E | 28.652 |
Barking and Dagenham 016A | 19.837 |
# Stack the English and Welsh IMD scores into one dataframe. A plain
# concat suffices here: only the imd_2019_score column remains in each
# frame (geometry was already dropped), so rebuilding a GeoDataFrame
# added nothing
imd_eng_wales = pd.concat([imd_eng, imd_wales])
# Merge on index (which is the LSOA)
data = data.merge(imd_eng_wales, left_index=True, right_index=True, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | |
---|---|---|---|---|---|---|---|---|---|---|
area | ||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 |
Deprivation#
The Indices of Deprivation (IoD) is calculated from thirty indicators grouped into seven domains. One such domain is Income.
Income domain - proportion of population experiencing deprivation relating to low levels of income
Income Deprivation Affecting Children Index (IDACI) - proportion of children aged 0-15 living in income deprived households - subset of Income Deprivation domain
Income Deprivation Affecting Older People Index (IDAOPI) - proportion of adults aged 60 or older who experience income deprivation - subset of Income Deprivation domain
# Import deprivation data, indexed by LSOA name
deprivation = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.deprivation),
    index_col='LSOA name (2011)')
# Drop unwanted columns (LSOA and local-authority codes)
cols_to_drop = ['LSOA Code (2011)', 'Local Authority District code (2019)']
deprivation.drop(cols_to_drop, axis=1, inplace=True)
# Rename columns to short snake_case names
deprivation = deprivation.rename(columns={
    'Local Authority District name (2019)': 'la_district_name_2019',
    'Income Domain Score': 'income_domain_score',
    'Income Domain Rank (where 1 is most deprived)': 'income_domain_rank',
    'IDACI Score': 'idaci_score',
    'IDACI Rank (where 1 is most deprived)': 'idaci_rank',
    'IDAOPI Score': 'idaopi_score',
    'IDAOPI Rank (where 1 is most deprived)': 'idaopi_rank'})
deprivation.head(2)
la_district_name_2019 | income_domain_score | income_domain_rank | idaci_score | idaci_rank | idaopi_score | idaopi_rank | |
---|---|---|---|---|---|---|---|
LSOA name (2011) | |||||||
City of London 001A | City of London | 0.007 | 34740 | 0.006 | 34715 | 0.012 | 34729 |
City of London 001B | City of London | 0.034 | 31762 | 0.037 | 31529 | 0.030 | 33839 |
# Merge into data (deprivation is also indexed by LSOA name)
data = data.merge(deprivation, left_index=True, right_index=True, how='left')
# Reset index back to LSOA (as lost when merged)
data.set_index(lsoa_index, inplace=True)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | la_district_name_2019 | income_domain_score | income_domain_rank | idaci_score | idaci_rank | idaopi_score | idaopi_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | Welwyn Hatfield | 0.104 | 17062 | 0.125 | 17700 | 0.186 | 11760 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | Welwyn Hatfield | 0.176 | 9058 | 0.270 | 6329 | 0.269 | 6337 |
Ethnicity and health#
This is the number of people in each ethnic or health group in each LSOA.
def clean_ethnic_or_health(path, prefix, suffix, data):
    '''
    Cleans ethnicity or health dataset and adds to main dataframe.

    Inputs:
    - path: string - filename of dataset within the demographic data folder
    - prefix: string - prefix to remove from column names
    - suffix: string - suffix to remove from column names
    - data: dataframe - with combined hospital and demographic results

    Outputs:
    - data: dataframe - with addition of ethnicity or health columns
    '''
    # Remember the incoming LSOA index so it can be restored after the
    # merge (more robust than relying on the module-level lsoa_index)
    lsoa_index = data.index
    # Import data
    df = pd.read_csv(
        os.path.join(paths.data, paths.demographic, path),
        index_col='geography')
    # Only get columns including both the prefix and suffix (e.g. all age)
    cols_to_include = [x for x in df.columns if prefix in x and suffix in x]
    # Copy so renaming columns below does not trigger SettingWithCopyWarning
    # on a view of df
    extract = df[cols_to_include].copy()
    # Strip prefix/suffix and normalise to snake_case column names
    col_names = extract.columns.str.removeprefix(prefix)
    col_names = col_names.str.removesuffix(suffix)
    col_names = col_names.str.lower()
    col_names = col_names.str.replace(' ', '_')
    col_names = col_names.str.replace('/', '_')
    col_names = col_names.str.replace('-', '_')
    col_names = col_names.str.replace(':', '')
    # Rename the columns
    extract.columns = col_names
    # Merge in with data
    data = data.merge(extract, left_index=True, right_index=True, how='left')
    # Restore index to LSOA (as lost on merge)
    data.set_index(lsoa_index, inplace=True)
    return data
# Add ethnicity data (counts per ethnic group, all sexes and ages)
data = clean_ethnic_or_health(
    path=paths.ethnicity,
    prefix='Sex: All persons; Age: All categories: Age; ',
    suffix='; measures: Value',
    data=data)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_asian_asian_british_bangladeshi | ethnic_group_asian_asian_british_chinese | ethnic_group_asian_asian_british_other_asian | ethnic_group_black_african_caribbean_black_british_total | ethnic_group_black_african_caribbean_black_british_african | ethnic_group_black_african_caribbean_black_british_caribbean | ethnic_group_black_african_caribbean_black_british_other_black | ethnic_group_other_ethnic_group_total | ethnic_group_other_ethnic_group_arab | ethnic_group_other_ethnic_group_any_other_ethnic_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 14 | 23 | 35 | 105 | 73 | 30 | 2 | 14 | 6 | 8 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 28 | 37 | 23 | 151 | 127 | 11 | 13 | 25 | 11 | 14 |
2 rows × 41 columns
Health is general health from the 2011 census. Each person in the household was asked to rate their health in general; the possible responses were ‘Very good’, ‘Good’, ‘Fair’, ‘Bad’ and ‘Very bad’. For more information, see ONS report on 2011 general health.
# Add general health data (self-rated health, all sexes/ages/ethnicities)
data = clean_ethnic_or_health(
    path=paths.health,
    prefix='Sex: All persons; Age: All categories: Age; General Health: ',
    suffix='; Ethnic Group: All categories: Ethnic group; measures: Value',
    data=data)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_black_african_caribbean_black_british_african | ethnic_group_black_african_caribbean_black_british_caribbean | ethnic_group_black_african_caribbean_black_british_other_black | ethnic_group_other_ethnic_group_total | ethnic_group_other_ethnic_group_arab | ethnic_group_other_ethnic_group_any_other_ethnic_group | all_categories_general_health | very_good_or_good_health | fair_health | bad_or_very_bad_health | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 73 | 30 | 2 | 14 | 6 | 8 | 1523 | 1284 | 175 | 64 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 127 | 11 | 13 | 25 | 11 | 14 | 1637 | 1269 | 265 | 103 |
2 rows × 45 columns
Long-term health problem or disability is from the 2011 census. People were asked “Are your day-to-day activities limited because of a health problem or disability which has lasted, or is expected to last, at least 12 months, include problems related to old age”. Respondents could choose “not limited”, “limited a little”, or “limited a lot”. For more information, see this Surrey report on general health and long-term health.
# Add long-term health problem or disability counts
data = clean_ethnic_or_health(
    path=paths.long_term_health,
    prefix='Sex: All persons; Age: All categories: Age; Disability: ',
    suffix='; General Health: All categories: General health; measures: Value',
    data=data)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_other_ethnic_group_arab | ethnic_group_other_ethnic_group_any_other_ethnic_group | all_categories_general_health | very_good_or_good_health | fair_health | bad_or_very_bad_health | all_categories_long_term_health_problem_or_disability | day_to_day_activities_limited_a_lot | day_to_day_activities_limited_a_little | day_to_day_activities_not_limited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 6 | 8 | 1523 | 1284 | 175 | 64 | 1523 | 81 | 113 | 1329 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 11 | 14 | 1637 | 1269 | 265 | 103 | 1559 | 118 | 129 | 1312 |
2 rows × 49 columns
Rural v.s. urban#
# Import rural/urban data, indexed by LSOA name
rural_urban = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.rural_urban),
    index_col='LSOA11NM')
# Drop unwanted columns (LSOA code, classification code, feature id)
cols_to_drop = ['LSOA11CD', 'RUC11CD', 'FID']
rural_urban.drop(cols_to_drop, axis=1, inplace=True)
# Rename column
rural_urban = rural_urban.rename(columns={'RUC11': 'rural_urban_2011'})
# Merge on LSOA, then restore the LSOA index (lost on merge)
data = data.merge(rural_urban, left_index=True, right_index=True, how='left')
data.set_index(lsoa_index, inplace=True)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_other_ethnic_group_any_other_ethnic_group | all_categories_general_health | very_good_or_good_health | fair_health | bad_or_very_bad_health | all_categories_long_term_health_problem_or_disability | day_to_day_activities_limited_a_lot | day_to_day_activities_limited_a_little | day_to_day_activities_not_limited | rural_urban_2011 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 8 | 1523 | 1284 | 175 | 64 | 1523 | 81 | 113 | 1329 | Urban city and town |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 14 | 1637 | 1269 | 265 | 103 | 1559 | 118 | 129 | 1312 | Urban city and town |
2 rows × 50 columns
Age#
When using these age columns, be careful to ensure that you remember how they were created - e.g.
age_band_75 is count of ages 75 to 79
age_band_80 is count of ages 80 to 84
age_band_85 is count of ages 85 to 89
age_band_90 is count of everyone aged 90 or above
def age_bands(file_path, data, group, all_col, lsoa_index=None):
    '''
    Finds count of individuals in each five-year age band in each LSOA,
    using data provided (which has all persons, all males or all females).
    Adds these columns to data.

    Input:
    - file_path: string - path to age data (single-year counts, with an
      'All Ages' total and a '90+' top category)
    - data: dataframe - contains hospital and demographic information by LSOA
    - group: string - group of people who the age data is from
    - all_col: string - name of column with all ages
    - lsoa_index: index to reset to after merging; defaults to data's
      current index (the merge would otherwise discard it)

    Output:
    - data: dataframe - as before, but with added age counts
    '''
    # Default to the incoming index rather than capturing a module-level
    # variable at definition time (robust to out-of-order execution)
    if lsoa_index is None:
        lsoa_index = data.index
    # Import age data, setting index as LSOA
    ages = pd.read_csv(file_path, index_col='LSOA Name')
    # Save total person count to data (aligned on the LSOA index)
    data[all_col] = ages['All Ages']
    # Drop 'All Ages' and treat '90+' as age 90 so it joins the top band
    ages = ages.drop('All Ages', axis=1)
    ages = ages.rename({'90+': '90'}, axis='columns')
    # Sum single-year columns into five-year bands
    # (0-4, 5-9, ..., 85-89, and 90+)
    bands = pd.DataFrame()
    for band in range(0, 91, 5):
        # Ages belonging to this band, e.g. band 15 -> '15'...'19';
        # the final band contains only '90' (i.e. everyone 90 or above)
        cols_to_get = [str(age) for age in range(band, min(band + 5, 91))]
        bands[f'age_band_{group}_{band}'] = ages[cols_to_get].sum(axis=1)
    # Merge age band sums with data on LSOA
    data = data.merge(bands, left_index=True, right_index=True, how='left')
    # Restore the LSOA index (lost during the merge)
    data.set_index(lsoa_index, inplace=True)
    return data
# For all people, add count in each age band
# (adds population_all and age_band_all_0 ... age_band_all_90)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_all),
    data, 'all', 'population_all')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | age_band_all_45 | age_band_all_50 | age_band_all_55 | age_band_all_60 | age_band_all_65 | age_band_all_70 | age_band_all_75 | age_band_all_80 | age_band_all_85 | age_band_all_90 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 88 | 115 | 102 | 74 | 72 | 55 | 44 | 22 | 10 | 9 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 117 | 122 | 127 | 87 | 47 | 45 | 39 | 29 | 20 | 23 |
2 rows × 70 columns
# For all females, add count in each age band
# (adds population_females and age_band_females_0 ... age_band_females_90)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_female),
    data, 'females', 'population_females')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | age_band_females_45 | age_band_females_50 | age_band_females_55 | age_band_females_60 | age_band_females_65 | age_band_females_70 | age_band_females_75 | age_band_females_80 | age_band_females_85 | age_band_females_90 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 41 | 49 | 43 | 46 | 39 | 27 | 18 | 8 | 5 | 6 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 55 | 62 | 47 | 34 | 23 | 27 | 18 | 17 | 15 | 19 |
2 rows × 90 columns
# For all males, add count in each age band
# (adds population_males and age_band_males_0 ... age_band_males_90)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_male),
    data, 'males', 'population_males')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | age_band_males_45 | age_band_males_50 | age_band_males_55 | age_band_males_60 | age_band_males_65 | age_band_males_70 | age_band_males_75 | age_band_males_80 | age_band_males_85 | age_band_males_90 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 47 | 66 | 59 | 28 | 33 | 28 | 26 | 14 | 5 | 3 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 62 | 60 | 80 | 53 | 24 | 18 | 21 | 12 | 5 | 4 |
2 rows × 110 columns
Save output#
data.to_csv(os.path.join(paths.data, paths.collated), index_label='LSOA')