Create simplified regions for Streamlit app#
The stroke demographics app shows a map of the demographic data at the LSOA level. It is useful to show which LSOAs can be grouped together, for example into stroke unit catchment areas or into ambulance service areas.
The app uses simplified LSOA shapes to speed up the calculations. This means that the LSOAs in the app have different shapes from those that were used to create the region files available from the Office for National Statistics geoportal and similar services.
In this notebook we will recreate the region boundary shapes from the simplified LSOA shapes.
Notebook setup#
from dataclasses import dataclass
import geopandas as gpd
import os
import pandas as pd
from importlib_resources import files
from shapely.validation import make_valid # for fixing dodgy polygons
# The stroke-maps package from our other stroke work contains data
# linking different region types.
import stroke_maps
# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Immutable container for the file and directory names used here.

    Each attribute carries a type annotation so that it is a real
    dataclass field: it shows up in the repr, takes part in equality
    checks, and can be overridden at construction time, for example
    ``Paths(data='/some/other/dir')``. Instances are frozen, so the
    values cannot be reassigned afterwards.
    '''
    # Inputs:
    # Top-level data directory.
    data: str = './data'
    # CSV file inside `data`, read with the LSOA names as its index.
    collated: str = 'collated_data_amb.csv'
    # Outputs:
    # Subdirectory of `data` that holds the geometry files.
    shapefiles: str = 'shapefiles'
    # GeoJSON file of LSOA shapes inside `shapefiles`.
    lsoa_shp: str = 'LSOA_V3_reduced_simplified.geojson'


paths = Paths()
Load data#
First, load the tabular data that says which region each LSOA belongs to, and keep only the region columns:
# Names of the columns that assign each LSOA to a region of some kind:
cols_regions = [
    'closest_ivt_unit',
    'closest_mt_unit',
    'closest_mt_transfer',
    'la_district_name_2019',
    'rural_urban_2011',
    'ambulance_service',
    'local_authority_district_22',
    'LAD22NM',
    'country',
]
# Read the collated table with the LSOA names as its index and
# throw away every column that is not in the list above:
path_to_collated = os.path.join(paths.data, paths.collated)
df_lsoa = pd.read_csv(path_to_collated, index_col='LSOA').loc[:, cols_regions]
# Transpose the preview so that the long column names are readable:
df_lsoa.head().T
LSOA | Welwyn Hatfield 010F | Welwyn Hatfield 012A | Welwyn Hatfield 002F | Welwyn Hatfield 002E | Welwyn Hatfield 010A |
---|---|---|---|---|---|
closest_ivt_unit | SG14AB | SG14AB | SG14AB | SG14AB | SG14AB |
closest_mt_unit | NW12BU | NW12BU | NW12BU | NW12BU | NW12BU |
closest_mt_transfer | CB20QQ | CB20QQ | CB20QQ | CB20QQ | CB20QQ |
la_district_name_2019 | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield |
rural_urban_2011 | Urban city and town | Urban city and town | Urban city and town | Urban city and town | Urban city and town |
ambulance_service | East of England | East of England | East of England | East of England | East of England |
local_authority_district_22 | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield |
LAD22NM | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield | Welwyn Hatfield |
country | England | England | England | England | England |
Then load the LSOA shape files:
path_to_lsoa_shapes = os.path.join(
    paths.data, paths.shapefiles, paths.lsoa_shp)
gdf_lsoa = gpd.read_file(path_to_lsoa_shapes)
# Repair each geometry in turn. Missing geometries (None) cannot be
# passed to make_valid, so they are kept as None:
gdf_lsoa['geometry'] = [
    None if geom is None else make_valid(geom)
    for geom in gdf_lsoa['geometry'].values
]
gdf_lsoa.head()
LSOA11NM | LSOA11CD | geometry | |
---|---|---|---|
0 | City of London 001A | E01000001 | POLYGON ((-0.09470 51.52060, -0.09730 51.52160... |
1 | City of London 001B | E01000002 | POLYGON ((-0.08810 51.51940, -0.09270 51.52140... |
2 | City of London 001C | E01000003 | POLYGON ((-0.09450 51.52200, -0.09680 51.52330... |
3 | City of London 001E | E01000005 | POLYGON ((-0.07590 51.51590, -0.07730 51.51740... |
4 | Barking and Dagenham 016A | E01000006 | POLYGON ((0.09330 51.53790, 0.08650 51.54180, ... |
And a bonus file that links LSOA to more region types:
# This file is shipped inside the installed `stroke-maps` package:
path_to_file = files('stroke_maps.data') / 'regions_lsoa_ew.csv'
# Index by LSOA name and call the index 'LSOA' so it matches df_lsoa:
df_lsoa_regions = (
    pd.read_csv(path_to_file, index_col='lsoa')
    .rename_axis('LSOA')
)
df_lsoa_regions.head()
lsoa_code | region | region_code | region_type | |
---|---|---|---|---|
LSOA | ||||
Halton 007A | E01012367 | NHS Cheshire and Merseyside ICB - 01F | E38000068 | SICBL |
Halton 003A | E01012368 | NHS Cheshire and Merseyside ICB - 01F | E38000068 | SICBL |
Halton 005A | E01012369 | NHS Cheshire and Merseyside ICB - 01F | E38000068 | SICBL |
Halton 007B | E01012370 | NHS Cheshire and Merseyside ICB - 01F | E38000068 | SICBL |
Halton 016A | E01012371 | NHS Cheshire and Merseyside ICB - 01F | E38000068 | SICBL |
And a second bonus file that links those region types to yet more region types:
(These two files are stored separately to save disk space)
# A second `stroke-maps` data file: one row per region, with further
# region types given in extra columns.
path_to_file = files('stroke_maps.data') / 'regions_ew.csv'
df_regions = pd.read_csv(path_to_file)
df_regions.head()
region | region_code | region_type | short_code | country | icb | icb_code | isdn | |
---|---|---|---|---|---|---|---|---|
0 | Aneurin Bevan University Health Board | W11000028 | LHB | AB | Wales | NaN | NaN | NaN |
1 | NHS Bath and North East Somerset, Swindon and ... | E38000231 | SICBL | BA | England | NHS Bath and North East Somerset, Swindon and ... | E54000040 | Gloucester, BSW, BNSSG and Somerset |
2 | Betsi Cadwaladr University Health Board | W11000023 | LHB | BC | Wales | NaN | NaN | NaN |
3 | NHS Bedfordshire, Luton and Milton Keynes ICB ... | E38000249 | SICBL | BD | England | NHS Bedfordshire, Luton and Milton Keynes Inte... | E54000024 | East of England (South) |
4 | NHS Black Country ICB - D2P2L | E38000259 | SICBL | BL | England | NHS Black Country Integrated Care Board | E54000062 | North Midlands |
Finally a file to link LSOA to MSOA:
path_to_msoa = os.path.join(paths.data, 'lsoa_2021', 'lsoa_to_msoa.csv')
# Index by LSOA name and call the index 'LSOA' so it matches df_lsoa:
df_msoa = (
    pd.read_csv(path_to_msoa, index_col='lsoa11nm')
    .rename_axis('LSOA')
)
df_msoa.head()
lsoa11cd | msoa11cd | ladcd | msoa11nm | country | |
---|---|---|---|---|---|
LSOA | |||||
City of London 001A | E01000001 | E02000001 | E09000001 | City of London 001 | E |
City of London 001B | E01000002 | E02000001 | E09000001 | City of London 001 | E |
City of London 001C | E01000003 | E02000001 | E09000001 | City of London 001 | E |
City of London 001E | E01000005 | E02000001 | E09000001 | City of London 001 | E |
Barking and Dagenham 016A | E01000006 | E02000017 | E09000002 | Barking and Dagenham 016 | E |
Link region data#
Link MSOA codes to LSOA:
# Left merge on the shared LSOA index: every row of df_lsoa is kept
# and gains the 'msoa11cd' column where a match exists.
df_lsoa = df_lsoa.merge(
    df_msoa['msoa11cd'],
    how='left',
    left_index=True,
    right_index=True,
)
Make one new column that contains only the LHB (Welsh regions), and a second new column that contains the LHB for Welsh regions or the ICB for English regions.
# Rows whose region type is 'LHB':
mask_wales = df_regions['region_type'].eq('LHB')

# 'lhb': start from the region name, then blank out the non-LHB rows.
df_regions['lhb'] = df_regions['region'].copy()
df_regions.loc[~mask_wales, 'lhb'] = pd.NA

# 'icb_lhb': start from the ICB name, then fill in the region name
# on the LHB rows.
df_regions['icb_lhb'] = df_regions['icb'].copy()
df_regions.loc[mask_wales, 'icb_lhb'] = df_regions.loc[mask_wales, 'region']
Combine the two bonus files:
# These columns already exist in df_lsoa_regions or df_lsoa, so remove
# them here to avoid duplicates after merging:
df_regions = df_regions.drop(columns=['region', 'region_type', 'country'])

# Merge on the region code. The LSOA index is moved into a column for
# the duration of the merge so that it is not lost, then restored.
df_lsoa_regions = (
    df_lsoa_regions
    .reset_index()
    .merge(df_regions, on='region_code', how='left')
    .set_index('LSOA')
)
Combine bonus files with collated data:
# Left merge on the shared LSOA index: every row of df_lsoa is kept
# and gains the columns of df_lsoa_regions where a match exists.
df_lsoa = df_lsoa.merge(
    df_lsoa_regions,
    how='left',
    left_index=True,
    right_index=True,
)
Combine collated data with geometry:
# Right merge: keep every row of df_lsoa, matching its LSOA-name index
# against the 'LSOA11NM' column of the shapes table.
gdf_lsoa = gdf_lsoa.merge(
    df_lsoa,
    how='right',
    left_on='LSOA11NM',
    right_index=True,
)
# Replace the index with a plain 0..n-1 count:
gdf_lsoa = gdf_lsoa.reset_index(drop=True)
gdf_lsoa.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
LSOA11NM | Adur 001A | Adur 001B | Adur 001C | Adur 001D | Adur 001E |
LSOA11CD | E01031349 | E01031350 | E01031351 | E01031352 | E01031370 |
geometry | POLYGON ((-0.2318 50.8398, -0.2319 50.8399, -0... | POLYGON ((-0.2259 50.8429, -0.2302 50.8476, -0... | POLYGON ((-0.2377 50.8407, -0.2337 50.8438, -0... | POLYGON ((-0.2319 50.8399, -0.2318 50.8398, -0... | POLYGON ((-0.2484 50.8416, -0.2554 50.8418, -0... |
closest_ivt_unit | BN25BE | BN25BE | BN112DH | BN112DH | BN112DH |
closest_mt_unit | BN25BE | BN25BE | BN25BE | BN25BE | BN25BE |
closest_mt_transfer | BN25BE | BN25BE | BN25BE | BN25BE | BN25BE |
la_district_name_2019 | Adur | Adur | Adur | Adur | Adur |
rural_urban_2011 | Urban city and town | Urban city and town | Urban city and town | Urban city and town | Urban city and town |
ambulance_service | South East Coast | South East Coast | South East Coast | South East Coast | South East Coast |
local_authority_district_22 | Adur | Adur | Adur | Adur | Adur |
LAD22NM | Adur | Adur | Adur | Adur | Adur |
country | England | England | England | England | England |
msoa11cd | E02006534 | E02006534 | E02006534 | E02006534 | E02006534 |
lsoa_code | E01031349 | E01031350 | E01031351 | E01031352 | E01031370 |
region | NHS Sussex ICB - 70F | NHS Sussex ICB - 70F | NHS Sussex ICB - 70F | NHS Sussex ICB - 70F | NHS Sussex ICB - 70F |
region_code | E38000248 | E38000248 | E38000248 | E38000248 | E38000248 |
region_type | SICBL | SICBL | SICBL | SICBL | SICBL |
short_code | SX2 | SX2 | SX2 | SX2 | SX2 |
icb | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board |
icb_code | E54000053 | E54000053 | E54000053 | E54000053 | E54000053 |
isdn | Sussex | Sussex | Sussex | Sussex | Sussex |
lhb | <NA> | <NA> | <NA> | <NA> | <NA> |
icb_lhb | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board | NHS Sussex Integrated Care Board |
# Check that the merged table is still a GeoDataFrame:
type(gdf_lsoa)
geopandas.geodataframe.GeoDataFrame
# Drop the geometry and save a copy of the tabular region data.
# The path is built from `paths.data`, like the other files in this
# notebook, so that changing the data directory in one place also
# moves this output:
gdf_lsoa.drop('geometry', axis='columns').to_csv(
    os.path.join(paths.data, 'lsoa_2021', 'lsoa_regions.csv'))
Make a fake column for merging all LSOAs into one big shape:
# Every row gets the same value, so grouping by this column puts
# all of the LSOAs into a single group:
gdf_lsoa['england_wales'] = 0
Merge region shapes#
For each column of region data, merge the shapes of all LSOAs that belong to each region.
Save a copy of the merged shapes.
def merge_lsoas_into_region(gdf, col):
    """
    Dissolve LSOA shapes into larger shapes using one region column.

    Inputs
    ------
    gdf - GeoDataFrame. Must contain a 'geometry' column and the
          column named by `col`. It is not modified.
    col - str. Name of the column to dissolve by.

    Returns
    -------
    GeoDataFrame. The dissolved shapes, with `col` returned as an
    ordinary column rather than as the index.
    """
    # Selecting columns with a list builds a new table, so the
    # caller's data is left untouched. All other columns are dropped
    # before dissolving.
    region_shapes = gdf[['geometry', col]].dissolve(by=col)
    # dissolve puts `col` in the index; move it back to a column.
    return region_shapes.reset_index()
# List the available columns before choosing which ones to merge by:
gdf_lsoa.columns
Index(['LSOA11NM', 'LSOA11CD', 'geometry', 'closest_ivt_unit',
'closest_mt_unit', 'closest_mt_transfer', 'la_district_name_2019',
'rural_urban_2011', 'ambulance_service', 'local_authority_district_22',
'LAD22NM', 'country', 'msoa11cd', 'lsoa_code', 'region', 'region_code',
'region_type', 'short_code', 'icb', 'icb_code', 'isdn', 'lhb',
'icb_lhb', 'england_wales'],
dtype='object')
# Region columns to build outlines for. One output file is written
# per entry.
cols_to_merge = [
    'closest_ivt_unit',
    'closest_mt_unit',
    'closest_mt_transfer',
    'rural_urban_2011',
    'ambulance_service',
    'LAD22NM',
    'country',
    'region_code',
    'icb_code',
    'isdn',
    'lhb',
    'icb_lhb',
    'msoa11cd',
    'england_wales',
]

# All of the outlines go into the same directory:
dir_outlines = os.path.join(paths.data, paths.shapefiles)
for col in cols_to_merge:
    path_to_outline = os.path.join(dir_outlines, f'outline_{col}.geojson')
    gdf_merged = merge_lsoas_into_region(gdf_lsoa, col)
    gdf_merged.to_file(path_to_outline)