Collate demographic data for each LSOA#
This notebook collates various data sources and saves as collated_data.csv
in data folder. For each LSOA, it finds:
Hospital information - yearly stroke admissions, travel time to and thrombolysis rate of nearest IVT, and travel time to nearest IVT (considering transfers or not)
Demographic information - IMD, IoD Income/IDACI/IDAOIP, ethnicity, health, rural/urban, age
For source of demographic data, see data/demographic/demographic_soucres.md
Import libraries and set file paths#
# Import required libraries
from dataclasses import dataclass
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import zipfile
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Singleton object for storing paths to data and database.'''
    # NOTE(review): attributes below are unannotated, so the dataclass has
    # no fields - they behave as shared class-level constants
    # Top-level data folder and hospital sub-folder
    data = './data'
    hospitals = 'hospitals'
    # Hospital / travel-time inputs
    travel_matrix_zip = 'lsoa_travel_time_matrix_calibrated.zip'
    travel_matrix_csv = 'lsoa_travel_time_matrix_calibrated.csv'
    admissions = 'admissions_2017-2019.csv'
    units = 'stroke_hospitals_2022.csv'
    inter_hospital = 'inter_hospital_time_calibrated.csv'
    # Demographic inputs
    demographic = 'demographic'
    welsh_imd = 'WIMD2019_Scores.csv'
    english_imd = 'IMD_2019.shp'
    deprivation = ('File_7a_-_All_IoD2019_Scores__Ranks__Deciles' +
                   '_and_Population_Denominators_3.csv')
    ethnicity = 'ethnicity.csv'
    health = 'general_health.csv'
    long_term_health = 'long_term_health_by_general_health.csv'
    rural_urban = ('Rural_Urban_Classification_2011_of_Lower_Layer_Super' +
                   '_Output_Areas_in_England_and_Wales.csv')
    age_all = 'mid_2020_persons.csv'
    age_female = 'mid_2020_females.csv'
    age_male = 'mid_2020_males.csv'
    # Look-up files and collated output
    look_ups = 'look_ups'
    welsh_lsoa = 'wales_lsoa_code_name.csv'
    collated = 'collated_data.csv'


paths = Paths()
Hospital information#
Import admissions, units and travel times#
Admissions#
This is the average yearly admissions for each LSOA from 2017 to 2019.
# Load average yearly stroke admissions per LSOA, indexed by LSOA name
admissions_path = os.path.join(paths.data, paths.hospitals, paths.admissions)
data = pd.read_csv(admissions_path, index_col='area').rename(
    columns={'Admissions': 'admissions'})
data.head(2)
admissions | |
---|---|
area | |
Welwyn Hatfield 010F | 0.666667 |
Welwyn Hatfield 012A | 4.000000 |
Units#
This has a record of information about every hospital - including whether they provide thrombolysis (IVT) and thrombectomy (MT).
# Load the record of stroke units (including IVT/MT provision flags)
units_path = os.path.join(paths.data, paths.hospitals, paths.units)
units = pd.read_csv(units_path)
display(units.shape)
units.head(2)
(142, 21)
Postcode | Hospital_name | Use_IVT | Use_MT | Use_MSU | Country | Strategic Clinical Network | Health Board / Trust | Stroke Team | SSNAP name | ... | Thrombolysis | ivt_rate | Easting | Northing | long | lat | Neuroscience | 30 England Thrombectomy Example | hospital_city | Notes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RM70AG | RM70AG | 1 | 1 | 1 | England | London SCN | Barking | Havering and Redbridge University Hospitals N... | Queens Hospital Romford HASU | ... | 117.0 | 11.9 | 551118 | 187780 | 0.179031 | 51.568647 | 1 | 0 | Romford | NaN |
1 | E11BB | E11BB | 1 | 1 | 1 | England | London SCN | Barts Health NHS Trust | The Royal London Hospital | Royal London Hospital HASU | ... | 115.0 | 13.4 | 534829 | 181798 | -0.058133 | 51.519018 | 1 | 1 | Royal London | NaN |
2 rows × 21 columns
LSOA to hospital travel times#
This is a matrix with travel times (in minutes) from every LSOA to every hospital.
# Unzip travel matrix folder and save the csv file to data path
travel_path = os.path.join(
    paths.data, paths.hospitals, paths.travel_matrix_zip)
with zipfile.ZipFile(travel_path, 'r') as zip_ref:
    zip_ref.extractall(os.path.join(paths.data, paths.hospitals))
# Import the csv file produced (LSOA rows x hospital-postcode columns)
travel_time = pd.read_csv(
    os.path.join(paths.data, paths.hospitals, paths.travel_matrix_csv),
    index_col='LSOA')
# Remove any leading spaces in column titles (hospital postcodes)
travel_time.columns = travel_time.columns.str.lstrip()
travel_time.head(2)
B152TH | B714HJ | B95SS | BA13NG | BA214AT | BB23HH | BD96RJ | BH77DW | BL97TD | BN112DH | ... | TN240LZ | TQ27AA | TR13LQ | TS198PE | TS43BW | WD180HB | WF14DG | WR51DD | WV100QP | YO318HE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LSOA | |||||||||||||||||||||
Adur 001A | 173.3 | 179.8 | 171.2 | 161.5 | 152.9 | 274.3 | 261.4 | 100.3 | 262.5 | 18.7 | ... | 89.6 | 212.0 | 291.5 | 310.8 | 306.5 | 92.8 | 242.1 | 178.7 | 186.2 | 267.9 |
Adur 001B | 173.3 | 179.8 | 172.3 | 161.5 | 152.9 | 274.3 | 261.4 | 99.2 | 263.6 | 18.7 | ... | 90.6 | 212.0 | 290.4 | 311.9 | 306.5 | 92.8 | 242.1 | 178.7 | 186.2 | 267.9 |
2 rows × 125 columns
Inter-hospital travel times#
This is a matrix with travel times (in minutes) from each hospital to every other hospital.
# Import inter-hospital travel times (hospital rows x hospital columns)
inter_hospital_times = pd.read_csv(
    os.path.join(paths.data, paths.hospitals, paths.inter_hospital),
    index_col='from_postcode')
# Remove any leading spaces in column titles (hospital postcodes)
inter_hospital_times.columns = inter_hospital_times.columns.str.lstrip()
inter_hospital_times.head(2)
B152TH | B714HJ | B95SS | BA13NG | BA214AT | BB23HH | BD96RJ | BH77DW | BL97TD | BN112DH | ... | TN240LZ | TQ27AA | TR13LQ | TS198PE | TS43BW | WD180HB | WF14DG | WR51DD | WV100QP | YO318HE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
from_postcode | |||||||||||||||||||||
B152TH | 0.0 | 18.7 | 18.7 | 107.8 | 155.1 | 117.5 | 135.8 | 173.3 | 106.8 | 176.6 | ... | 184.1 | 176.6 | 256.0 | 191.6 | 187.3 | 114.3 | 122.9 | 36.9 | 29.4 | 148.6 |
B714HJ | 18.7 | 0.0 | 24.1 | 106.8 | 154.0 | 110.0 | 128.2 | 179.8 | 98.2 | 183.0 | ... | 184.1 | 176.6 | 255.0 | 188.4 | 184.1 | 111.1 | 119.6 | 35.9 | 21.9 | 145.4 |
2 rows × 125 columns
Get postcodes of units providing IVT or MT#
Get postcodes of units in use for IVT.
# Postcodes of all units currently providing thrombolysis (IVT)
ivt_units = units.loc[units['Use_IVT'] == 1, 'Postcode'].tolist()
display(len(ivt_units))
ivt_units[:5]
113
['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'BR68ND']
Get postcodes of units providing MT.
# Postcodes of all units currently providing thrombectomy (MT)
mt_units = units.loc[units['Use_MT'] == 1, 'Postcode'].tolist()
display(len(mt_units))
mt_units[:5]
24
['RM70AG', 'E11BB', 'SW66SX', 'SE59RW', 'SW170QT']
Travel time to nearest thrombolysis unit#
Limit to units that use IVT
# Keep only the travel-time columns for units that provide IVT
travel_time_ivt = travel_time[ivt_units]
display(travel_time_ivt.shape)
travel_time_ivt.head(2)
(34752, 113)
RM70AG | E11BB | SW66SX | SE59RW | BR68ND | HA13UJ | SW170QT | NW12BU | DE223NE | NN15BD | ... | LL137TD | LL572PW | CF144XW | CF479DT | CF311RQ | SY231ER | SA148QF | SA312AF | SA612PZ | SA66NL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LSOA | |||||||||||||||||||||
Adur 001A | 85.3 | 87.4 | 76.7 | 74.5 | 67.0 | 87.4 | 69.2 | 86.4 | 191.6 | 131.5 | ... | 248.5 | 321.6 | 192.7 | 217.4 | 206.6 | 297.9 | 238.9 | 255.0 | 287.2 | 230.3 |
Adur 001B | 85.3 | 87.4 | 76.7 | 74.5 | 67.0 | 87.4 | 69.2 | 86.4 | 191.6 | 131.5 | ... | 248.5 | 321.6 | 192.7 | 217.4 | 207.7 | 297.9 | 238.9 | 255.0 | 287.2 | 230.3 |
2 rows × 113 columns
Get closest unit and time for IVT (convert series to dataframes)
# Closest IVT unit (postcode) and its travel time for each LSOA,
# converted from series to single-column dataframes ready for merging
closest_ivt_unit = travel_time_ivt.idxmin(axis=1).to_frame('closest_ivt_unit')
closest_ivt_unit_time = travel_time_ivt.min(axis=1).to_frame('closest_ivt_unit_time')
Merge into admissions
# Add the closest-IVT-unit columns to the admissions dataframe,
# matching on the shared LSOA index
data = data.join(closest_ivt_unit, how='left')
data = data.join(closest_ivt_unit_time, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | |
---|---|---|---|
area | |||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 |
Travel time to nearest thrombectomy unit from home (ignoring transfers)#
Limit to units that use MT.
# Keep only the travel-time columns for units that provide MT
travel_time_mt = travel_time[mt_units]
travel_time_mt.head(2)
RM70AG | E11BB | SW66SX | SE59RW | SW170QT | NW12BU | NG72UH | CB20QQ | B152TH | CV22DX | ... | NE14LP | HU32JZ | LS13EX | S102JF | BN25BE | BS105NB | PL68DH | OX39DU | SO166YD | CF144XW | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LSOA | |||||||||||||||||||||
Adur 001A | 85.3 | 87.4 | 76.7 | 74.5 | 69.2 | 86.4 | 188.4 | 122.9 | 173.3 | 159.4 | ... | 332.3 | 267.9 | 251.7 | 223.8 | 17.6 | 162.6 | 237.8 | 112.1 | 76.7 | 192.7 |
Adur 001B | 85.3 | 87.4 | 76.7 | 74.5 | 69.2 | 86.4 | 188.4 | 122.9 | 173.3 | 159.4 | ... | 333.4 | 267.9 | 251.7 | 224.9 | 18.7 | 162.6 | 237.8 | 112.1 | 76.7 | 192.7 |
2 rows × 24 columns
Get closest unit and time for MT (converting series to dataframes).
# Closest MT unit (postcode) and its travel time for each LSOA,
# converted from series to single-column dataframes ready for merging
closest_mt_unit = travel_time_mt.idxmin(axis=1).to_frame('closest_mt_unit')
closest_mt_unit_time = travel_time_mt.min(axis=1).to_frame('closest_mt_unit_time')
Merge into admissions.
# Add the closest-MT-unit columns to the admissions dataframe,
# matching on the shared LSOA index
data = data.join(closest_mt_unit, how='left')
data = data.join(closest_mt_unit_time, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | |
---|---|---|---|---|---|
area | |||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 |
Travel time from closest IVT unit to its closest MT unit (considers transfers)#
Find MT unit that is closest to each IVT unit, and the travel time between them
# Filter the inter-hospital matrix to rows of IVT units and columns of
# MT units (i.e. candidate transfer journeys)
ivt_rows = inter_hospital_times.index.isin(ivt_units)
inter_ivt = inter_hospital_times.loc[ivt_rows, mt_units]
inter_ivt.head(2)
RM70AG | E11BB | SW66SX | SE59RW | SW170QT | NW12BU | NG72UH | CB20QQ | B152TH | CV22DX | ... | NE14LP | HU32JZ | LS13EX | S102JF | BN25BE | BS105NB | PL68DH | OX39DU | SO166YD | CF144XW | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
from_postcode | |||||||||||||||||||||
B152TH | 143.3 | 140.0 | 128.2 | 141.1 | 141.1 | 130.4 | 69.2 | 122.9 | 0.0 | 42.3 | ... | 213.1 | 147.6 | 131.5 | 104.6 | 178.7 | 92.8 | 203.4 | 81.0 | 149.7 | 116.4 |
B714HJ | 140.0 | 137.9 | 134.7 | 141.1 | 148.6 | 129.3 | 65.9 | 119.6 | 18.7 | 39.1 | ... | 209.9 | 145.4 | 129.3 | 101.4 | 185.2 | 91.7 | 202.3 | 87.4 | 156.2 | 116.4 |
2 rows × 24 columns
# Find nearest MT unit to each IVT unit and the travel time between them
ivt_nearest_mt = pd.DataFrame(
    {'closest_mt_transfer': inter_ivt.idxmin(axis=1),
     'closest_mt_transfer_time': inter_ivt.min(axis=1)})
ivt_nearest_mt.head(2)
closest_mt_transfer | closest_mt_transfer_time | |
---|---|---|
from_postcode | ||
B152TH | B152TH | 0.0 |
B714HJ | B152TH | 18.7 |
Merge into admissions (based on closest IVT unit for each LSOA)
data = pd.merge(data, ivt_nearest_mt, how='left', left_on='closest_ivt_unit', right_index=True)
Add column with total time (for travel from LSOA to closest IVT to then the MT closest to that)
# Total transfer-pathway time: LSOA -> closest IVT unit -> that unit's
# closest MT unit
data['total_mt_time'] = (data['closest_ivt_unit_time'] +
                         data['closest_mt_transfer_time'])
data.head(5)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | |
---|---|---|---|---|---|---|---|---|
area | ||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 |
Welwyn Hatfield 002F | 2.000000 | SG14AB | 18.7 | NW12BU | 38.0 | CB20QQ | 39.1 | 57.8 |
Welwyn Hatfield 002E | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 |
Welwyn Hatfield 010A | 3.333333 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 |
Thrombolysis rate of closest IVT unit#
# Get index from data (which is originally index from admissions file);
# reused below to restore the index after merges that would discard it
lsoa_index = data.index
lsoa_index
Index(['Welwyn Hatfield 010F', 'Welwyn Hatfield 012A', 'Welwyn Hatfield 002F',
'Welwyn Hatfield 002E', 'Welwyn Hatfield 010A', 'Welwyn Hatfield 010B',
'Welwyn Hatfield 012B', 'Welwyn Hatfield 011D', 'Welwyn Hatfield 011A',
'Welwyn Hatfield 012C',
...
'Cardiff 005F', 'Cardiff 049D', 'Cardiff 049E', 'Cardiff 049F',
'Cardiff 005G', 'Cardiff 006F', 'Swansea 025F', 'Swansea 023E',
'Swansea 025G', 'Swansea 025H'],
dtype='object', name='area', length=34752)
# Get postcode and IVT rate of units that offer IVT, keyed by postcode
# so the merge below can match on the right-hand index
mask = units['Use_IVT'] == 1
ivt_rate = units[mask][['Postcode', 'ivt_rate']].set_index('Postcode')
# Merge into admissions (adding rate for closest IVT unit to each LSOA).
# Merging with right_index=True preserves data's LSOA index, so there is
# no duplicate Postcode column to drop and no need to reassign the index
data = data.merge(
    ivt_rate, left_on='closest_ivt_unit', right_index=True, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | |
---|---|---|---|---|---|---|---|---|---|
area | |||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 |
Demographic information#
Index of multiple deprivation (IMD)#
These are imported separately:
English - IMD scores extracted from shapefile
Welsh - csv file with IMD scores has uncommon LSOA names, so the common LSOA names are added (as from ‘wales_lsoa_code_name.csv’ which was created from https://datamap.gov.wales/layers/appdata-ons:lsoa_wales_2011)
They are then joined and added to the main dataframe.
# Import common Welsh LSOA names, keyed by LSOA code
wales_lsoa = pd.read_csv(
    os.path.join(paths.data, paths.look_ups, paths.welsh_lsoa),
    index_col='LSOA11Code')
# Import welsh IMD scores, keyed by the same LSOA code
imd_wales = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.welsh_imd),
    index_col='LSOA_Code')
# Add common LSOA name to the dataframe with Welsh IMD scores
imd_wales = pd.merge(imd_wales, wales_lsoa,
                     left_index=True, right_index=True, how='left')
# Use common LSOA name as index
imd_wales.set_index('lsoa11name', inplace=True)
# Just keep the IMD column, renamed to match the English data below
imd_wales = imd_wales[['WIMD2019_Score']]
imd_wales = imd_wales.rename(columns={'WIMD2019_Score': 'imd_2019_score'})
imd_wales.head()
imd_2019_score | |
---|---|
lsoa11name | |
Isle of Anglesey 007A | 19.2 |
Isle of Anglesey 007B | 12.2 |
Isle of Anglesey 001A | 26.8 |
Isle of Anglesey 001B | 15.5 |
Isle of Anglesey 005A | 16.0 |
os.path.join(paths.data, paths.demographic, paths.english_imd)
'./data/demographic/IMD_2019.shp'
# Import dataframe with English LSOA and IMD from the shapefile
# (EPSG:27700 is the British National Grid)
imd_eng = gpd.read_file(
    os.path.join(paths.data, paths.demographic, paths.english_imd),
    crs='EPSG:27700')
# Set index as LSOA name, then just keep the IMD score column (renamed)
imd_eng.set_index('lsoa11nm', inplace=True)
imd_eng = imd_eng[['IMDScore']]
imd_eng = imd_eng.rename(columns={'IMDScore': 'imd_2019_score'})
imd_eng.head()
imd_2019_score | |
---|---|
lsoa11nm | |
City of London 001A | 6.208 |
City of London 001B | 5.143 |
City of London 001C | 19.402 |
City of London 001E | 28.652 |
Barking and Dagenham 016A | 19.837 |
# Stack the English and Welsh IMD scores into one dataframe. A plain
# concat suffices here: only the imd_2019_score column remains in each
# frame (geometry was already dropped), so rebuilding a GeoDataFrame
# added nothing
imd_eng_wales = pd.concat([imd_eng, imd_wales])
# Merge on index (which is the LSOA)
data = data.merge(imd_eng_wales, left_index=True, right_index=True, how='left')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | |
---|---|---|---|---|---|---|---|---|---|---|
area | ||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 |
Deprivation#
The Indices of Deprivation (IoD) is calculated from thirty indicators grouped into seven domains. One such domain is Income.
Income domain - proportion of population experiencing deprivation relating to low levels of income
Income Deprivation Affecting Children Index (IDACI) - proportion of children aged 0-15 living in income deprived households - subset of Income Deprivation domain
Income Deprivation Affecting Older People Index (IDAOPI) - proportion of adults aged 60 or older who experience income deprivation - subset of Income Deprivation domain
# Import deprivation data, indexed by LSOA name
deprivation = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.deprivation),
    index_col='LSOA name (2011)')
# Drop unwanted columns (LSOA and local-authority codes)
cols_to_drop = ['LSOA Code (2011)', 'Local Authority District code (2019)']
deprivation.drop(cols_to_drop, axis=1, inplace=True)
# Rename columns to short snake_case names
deprivation = deprivation.rename(columns={
    'Local Authority District name (2019)': 'la_district_name_2019',
    'Income Domain Score': 'income_domain_score',
    'Income Domain Rank (where 1 is most deprived)': 'income_domain_rank',
    'IDACI Score': 'idaci_score',
    'IDACI Rank (where 1 is most deprived)': 'idaci_rank',
    'IDAOPI Score': 'idaopi_score',
    'IDAOPI Rank (where 1 is most deprived)': 'idaopi_rank'})
deprivation.head(2)
la_district_name_2019 | income_domain_score | income_domain_rank | idaci_score | idaci_rank | idaopi_score | idaopi_rank | |
---|---|---|---|---|---|---|---|
LSOA name (2011) | |||||||
City of London 001A | City of London | 0.007 | 34740 | 0.006 | 34715 | 0.012 | 34729 |
City of London 001B | City of London | 0.034 | 31762 | 0.037 | 31529 | 0.030 | 33839 |
# Merge into data (deprivation is also indexed by LSOA name)
data = data.merge(deprivation, left_index=True, right_index=True, how='left')
# Reset index back to LSOA (as lost when merged)
data.set_index(lsoa_index, inplace=True)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | la_district_name_2019 | income_domain_score | income_domain_rank | idaci_score | idaci_rank | idaopi_score | idaopi_rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | Welwyn Hatfield | 0.104 | 17062 | 0.125 | 17700 | 0.186 | 11760 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | Welwyn Hatfield | 0.176 | 9058 | 0.270 | 6329 | 0.269 | 6337 |
Ethnicity and health#
This is the number of people in each ethnic or health group in each LSOA.
def clean_ethnic_or_health(path, prefix, suffix, data):
    '''
    Cleans ethnicity or health dataset and adds to main dataframe.

    Inputs:
    - path: string - filename of dataset within the demographic data folder
    - prefix: string - prefix to remove from column names
    - suffix: string - suffix to remove from column names
    - data: dataframe - with combined hospital and demographic results

    Outputs:
    - data: dataframe - with addition of ethnicity or health columns
    '''
    # Remember the incoming LSOA index so it can be restored after the
    # merge (more robust than relying on the module-level lsoa_index)
    lsoa_index = data.index
    # Import data
    df = pd.read_csv(
        os.path.join(paths.data, paths.demographic, path),
        index_col='geography')
    # Only get columns including both the prefix and suffix (e.g. all age)
    cols_to_include = [x for x in df.columns if prefix in x and suffix in x]
    # Copy so renaming columns below does not trigger SettingWithCopyWarning
    # on a view of df
    extract = df[cols_to_include].copy()
    # Strip prefix/suffix and normalise to snake_case column names
    col_names = extract.columns.str.removeprefix(prefix)
    col_names = col_names.str.removesuffix(suffix)
    col_names = col_names.str.lower()
    col_names = col_names.str.replace(' ', '_')
    col_names = col_names.str.replace('/', '_')
    col_names = col_names.str.replace('-', '_')
    col_names = col_names.str.replace(':', '')
    # Rename the columns
    extract.columns = col_names
    # Merge in with data
    data = data.merge(extract, left_index=True, right_index=True, how='left')
    # Restore index to LSOA (as lost on merge)
    data.set_index(lsoa_index, inplace=True)
    return data
# Add ethnicity data (counts per ethnic group, all sexes and ages)
data = clean_ethnic_or_health(
    path=paths.ethnicity,
    prefix='Sex: All persons; Age: All categories: Age; ',
    suffix='; measures: Value',
    data=data)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_asian_asian_british_bangladeshi | ethnic_group_asian_asian_british_chinese | ethnic_group_asian_asian_british_other_asian | ethnic_group_black_african_caribbean_black_british_total | ethnic_group_black_african_caribbean_black_british_african | ethnic_group_black_african_caribbean_black_british_caribbean | ethnic_group_black_african_caribbean_black_british_other_black | ethnic_group_other_ethnic_group_total | ethnic_group_other_ethnic_group_arab | ethnic_group_other_ethnic_group_any_other_ethnic_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 14 | 23 | 35 | 105 | 73 | 30 | 2 | 14 | 6 | 8 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 28 | 37 | 23 | 151 | 127 | 11 | 13 | 25 | 11 | 14 |
2 rows × 41 columns
Health is general health from the 2011 census. Each person in the household was asked to rate their health in general; the possible responses were ‘Very good’, ‘Good’, ‘Fair’, ‘Bad’ and ‘Very bad’. For more information, see ONS report on 2011 general health.
# Add general health data (self-rated health, all sexes/ages/ethnicities)
data = clean_ethnic_or_health(
    path=paths.health,
    prefix='Sex: All persons; Age: All categories: Age; General Health: ',
    suffix='; Ethnic Group: All categories: Ethnic group; measures: Value',
    data=data)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_black_african_caribbean_black_british_african | ethnic_group_black_african_caribbean_black_british_caribbean | ethnic_group_black_african_caribbean_black_british_other_black | ethnic_group_other_ethnic_group_total | ethnic_group_other_ethnic_group_arab | ethnic_group_other_ethnic_group_any_other_ethnic_group | all_categories_general_health | very_good_or_good_health | fair_health | bad_or_very_bad_health | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 73 | 30 | 2 | 14 | 6 | 8 | 1523 | 1284 | 175 | 64 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 127 | 11 | 13 | 25 | 11 | 14 | 1637 | 1269 | 265 | 103 |
2 rows × 45 columns
Long-term health problem or disability is from the 2011 census. People were asked “Are your day-to-day activities limited because of a health problem or disability which has lasted, or is expected to last, at least 12 months, include problems related to old age”. Respondents could choose “not limited”, “limited a little”, or “limited a lot”. For more information, see this Surrey report on general health and long-term health.
# Add long-term health problem or disability counts
data = clean_ethnic_or_health(
    path=paths.long_term_health,
    prefix='Sex: All persons; Age: All categories: Age; Disability: ',
    suffix='; General Health: All categories: General health; measures: Value',
    data=data)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_other_ethnic_group_arab | ethnic_group_other_ethnic_group_any_other_ethnic_group | all_categories_general_health | very_good_or_good_health | fair_health | bad_or_very_bad_health | all_categories_long_term_health_problem_or_disability | day_to_day_activities_limited_a_lot | day_to_day_activities_limited_a_little | day_to_day_activities_not_limited | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 6 | 8 | 1523 | 1284 | 175 | 64 | 1523 | 81 | 113 | 1329 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 11 | 14 | 1637 | 1269 | 265 | 103 | 1559 | 118 | 129 | 1312 |
2 rows × 49 columns
Rural v.s. urban#
# Import rural/urban data, indexed by LSOA name
rural_urban = pd.read_csv(
    os.path.join(paths.data, paths.demographic, paths.rural_urban),
    index_col='LSOA11NM')
# Drop unwanted columns (LSOA code, classification code, feature id)
cols_to_drop = ['LSOA11CD', 'RUC11CD', 'FID']
rural_urban.drop(cols_to_drop, axis=1, inplace=True)
# Rename column
rural_urban = rural_urban.rename(columns={'RUC11': 'rural_urban_2011'})
# Merge on LSOA, then restore the LSOA index (lost on merge)
data = data.merge(rural_urban, left_index=True, right_index=True, how='left')
data.set_index(lsoa_index, inplace=True)
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | ethnic_group_other_ethnic_group_any_other_ethnic_group | all_categories_general_health | very_good_or_good_health | fair_health | bad_or_very_bad_health | all_categories_long_term_health_problem_or_disability | day_to_day_activities_limited_a_lot | day_to_day_activities_limited_a_little | day_to_day_activities_not_limited | rural_urban_2011 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 8 | 1523 | 1284 | 175 | 64 | 1523 | 81 | 113 | 1329 | Urban city and town |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 14 | 1637 | 1269 | 265 | 103 | 1559 | 118 | 129 | 1312 | Urban city and town |
2 rows × 50 columns
Age#
When using these age columns, be careful to ensure that you remember how they were created - e.g.
age_band_75 is count of ages 75 to 79
age_band_80 is count of ages 80 to 84
age_band_85 is count of ages 85 to 89
age_band_90 is count of everyone aged 90 or above
def age_bands(file_path, data, group, all_col, lsoa_index=None):
    '''
    Finds count of individuals in each five-year age band in each LSOA,
    using data provided (which has all persons, all males or all females).
    Adds these columns to data.

    Input:
    - file_path: string - path to age data (single-year counts, with an
      'All Ages' total and a '90+' top category)
    - data: dataframe - contains hospital and demographic information by LSOA
    - group: string - group of people who the age data is from
    - all_col: string - name of column with all ages
    - lsoa_index: index to reset to after merging; defaults to data's
      current index (the merge would otherwise discard it)

    Output:
    - data: dataframe - as before, but with added age counts
    '''
    # Default to the incoming index rather than capturing a module-level
    # variable at definition time (robust to out-of-order execution)
    if lsoa_index is None:
        lsoa_index = data.index
    # Import age data, setting index as LSOA
    ages = pd.read_csv(file_path, index_col='LSOA Name')
    # Save total person count to data (aligned on the LSOA index)
    data[all_col] = ages['All Ages']
    # Drop 'All Ages' and treat '90+' as age 90 so it joins the top band
    ages = ages.drop('All Ages', axis=1)
    ages = ages.rename({'90+': '90'}, axis='columns')
    # Sum single-year columns into five-year bands
    # (0-4, 5-9, ..., 85-89, and 90+)
    bands = pd.DataFrame()
    for band in range(0, 91, 5):
        # Ages belonging to this band, e.g. band 15 -> '15'...'19';
        # the final band contains only '90' (i.e. everyone 90 or above)
        cols_to_get = [str(age) for age in range(band, min(band + 5, 91))]
        bands[f'age_band_{group}_{band}'] = ages[cols_to_get].sum(axis=1)
    # Merge age band sums with data on LSOA
    data = data.merge(bands, left_index=True, right_index=True, how='left')
    # Restore the LSOA index (lost during the merge)
    data.set_index(lsoa_index, inplace=True)
    return data
# For all people, add count in each age band
# (adds population_all and age_band_all_0 ... age_band_all_90)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_all),
    data, 'all', 'population_all')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | age_band_all_45 | age_band_all_50 | age_band_all_55 | age_band_all_60 | age_band_all_65 | age_band_all_70 | age_band_all_75 | age_band_all_80 | age_band_all_85 | age_band_all_90 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 88 | 115 | 102 | 74 | 72 | 55 | 44 | 22 | 10 | 9 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 117 | 122 | 127 | 87 | 47 | 45 | 39 | 29 | 20 | 23 |
2 rows × 70 columns
# For all females, add count in each age band
# (adds population_females and age_band_females_0 ... age_band_females_90)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_female),
    data, 'females', 'population_females')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | age_band_females_45 | age_band_females_50 | age_band_females_55 | age_band_females_60 | age_band_females_65 | age_band_females_70 | age_band_females_75 | age_band_females_80 | age_band_females_85 | age_band_females_90 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 41 | 49 | 43 | 46 | 39 | 27 | 18 | 8 | 5 | 6 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 55 | 62 | 47 | 34 | 23 | 27 | 18 | 17 | 15 | 19 |
2 rows × 90 columns
# For all males, add count in each age band
# (adds population_males and age_band_males_0 ... age_band_males_90)
data = age_bands(
    os.path.join(paths.data, paths.demographic, paths.age_male),
    data, 'males', 'population_males')
data.head(2)
admissions | closest_ivt_unit | closest_ivt_unit_time | closest_mt_unit | closest_mt_unit_time | closest_mt_transfer | closest_mt_transfer_time | total_mt_time | ivt_rate | imd_2019_score | ... | age_band_males_45 | age_band_males_50 | age_band_males_55 | age_band_males_60 | age_band_males_65 | age_band_males_70 | age_band_males_75 | age_band_males_80 | age_band_males_85 | age_band_males_90 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
area | |||||||||||||||||||||
Welwyn Hatfield 010F | 0.666667 | SG14AB | 18.7 | NW12BU | 36.9 | CB20QQ | 39.1 | 57.8 | 6.8 | 15.616 | ... | 47 | 66 | 59 | 28 | 33 | 28 | 26 | 14 | 5 | 3 |
Welwyn Hatfield 012A | 4.000000 | SG14AB | 19.8 | NW12BU | 36.9 | CB20QQ | 39.1 | 58.9 | 6.8 | 33.313 | ... | 62 | 60 | 80 | 53 | 24 | 18 | 21 | 12 | 5 | 4 |
2 rows × 110 columns
Save output#
data.to_csv(os.path.join(paths.data, paths.collated), index_label='LSOA')