Create simplified regions for Streamlit app

# Create simplified regions for Streamlit app

The stroke demographics app shows a map of the demographic data at the LSOA level. It is useful to show which LSOAs can be grouped together, for example into stroke unit catchment areas or into ambulance service areas.

The app uses simplified LSOA shapes to speed up the calculations. This means that the LSOAs in the app have different shapes from those that were used to create the region files available from the Office for National Statistics geoportal and similar services.

In this notebook we will recreate the region boundary shapes from the simplified LSOA shapes.

## Notebook setup

from dataclasses import dataclass
import geopandas as gpd
import os
import pandas as pd
from importlib_resources import files
from shapely.validation import make_valid  # for fixing dodgy polygons

# The stroke-maps package from our other stroke work contains data
# linking different region types.
import stroke_maps

# Define file paths
@dataclass(frozen=True)
class Paths:
    '''Immutable collection of the paths to the data files.

    Every attribute is an annotated dataclass field, so the values
    appear in the repr and can be overridden when creating an
    instance, for example ``Paths(data='/other/data')``.
    '''

    # Inputs:
    # Top-level data directory.
    data: str = './data'
    # Collated LSOA-level table, read from the data directory.
    collated: str = 'collated_data_amb.csv'

    # Outputs:
    # Directory for shape files, inside the data directory. The merged
    # region outlines are written here.
    shapefiles: str = 'shapefiles'
    # Simplified LSOA shapes, read from the shape files directory.
    lsoa_shp: str = 'LSOA_V3_reduced_simplified.geojson'


paths = Paths()

## Load data

First, load the tabular data that says which LSOA belongs to which region, and keep only the region columns:

# Region columns to keep from the collated data:
cols_regions = [
    'closest_ivt_unit',
    'closest_mt_unit',
    'closest_mt_transfer',
    'la_district_name_2019',
    'rural_urban_2011',
    'ambulance_service',
    'local_authority_district_22',
    'LAD22NM',
    'country',
]

# Read the collated table, indexed by its 'LSOA' column, then discard
# every column that is not in the list above:
path_to_collated = os.path.join(paths.data, paths.collated)
df_lsoa = pd.read_csv(path_to_collated, index_col='LSOA')
df_lsoa = df_lsoa.loc[:, cols_regions]

df_lsoa.head().T

LSOA	Welwyn Hatfield 010F	Welwyn Hatfield 012A	Welwyn Hatfield 002F	Welwyn Hatfield 002E	Welwyn Hatfield 010A
closest_ivt_unit	SG14AB	SG14AB	SG14AB	SG14AB	SG14AB
closest_mt_unit	NW12BU	NW12BU	NW12BU	NW12BU	NW12BU
closest_mt_transfer	CB20QQ	CB20QQ	CB20QQ	CB20QQ	CB20QQ
la_district_name_2019	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield
rural_urban_2011	Urban city and town	Urban city and town	Urban city and town	Urban city and town	Urban city and town
ambulance_service	East of England	East of England	East of England	East of England	East of England
local_authority_district_22	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield
LAD22NM	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield	Welwyn Hatfield
country	England	England	England	England	England

Then load the LSOA shape files:

path_to_lsoa_shapes = os.path.join(
    paths.data, paths.shapefiles, paths.lsoa_shp)
gdf_lsoa = gpd.read_file(path_to_lsoa_shapes)

# Repair any invalid shapes. Missing geometries (None) are left as
# they are:
gdf_lsoa['geometry'] = [
    None if shape is None else make_valid(shape)
    for shape in gdf_lsoa['geometry'].values
]

gdf_lsoa.head()

	LSOA11NM	LSOA11CD	geometry
0	City of London 001A	E01000001	POLYGON ((-0.09470 51.52060, -0.09730 51.52160...
1	City of London 001B	E01000002	POLYGON ((-0.08810 51.51940, -0.09270 51.52140...
2	City of London 001C	E01000003	POLYGON ((-0.09450 51.52200, -0.09680 51.52330...
3	City of London 001E	E01000005	POLYGON ((-0.07590 51.51590, -0.07730 51.51740...
4	Barking and Dagenham 016A	E01000006	POLYGON ((0.09330 51.53790, 0.08650 51.54180, ...

And a bonus file that links LSOA to more region types:

# Locate the data file that is bundled with the `stroke-maps` package:
path_to_file = files('stroke_maps.data').joinpath('regions_lsoa_ew.csv')

# Index by the 'lsoa' column, and call the index 'LSOA' so that it
# matches df_lsoa:
df_lsoa_regions = (
    pd.read_csv(path_to_file, index_col='lsoa')
    .rename_axis('LSOA')
)

df_lsoa_regions.head()

	lsoa_code	region	region_code	region_type
LSOA
Halton 007A	E01012367	NHS Cheshire and Merseyside ICB - 01F	E38000068	SICBL
Halton 003A	E01012368	NHS Cheshire and Merseyside ICB - 01F	E38000068	SICBL
Halton 005A	E01012369	NHS Cheshire and Merseyside ICB - 01F	E38000068	SICBL
Halton 007B	E01012370	NHS Cheshire and Merseyside ICB - 01F	E38000068	SICBL
Halton 016A	E01012371	NHS Cheshire and Merseyside ICB - 01F	E38000068	SICBL

And a second bonus file that links those region types to yet more region types:

(These two files are stored separately to save disk space)

# Load further region data linking SICBL to other regions.
# The file is bundled with the `stroke-maps` package. It is read with
# the default index, so 'region_code' stays an ordinary column:
path_to_file = files('stroke_maps.data').joinpath('regions_ew.csv')
df_regions = pd.read_csv(path_to_file)

df_regions.head()

	region	region_code	region_type	short_code	country	icb	icb_code	isdn
0	Aneurin Bevan University Health Board	W11000028	LHB	AB	Wales	NaN	NaN	NaN
1	NHS Bath and North East Somerset, Swindon and ...	E38000231	SICBL	BA	England	NHS Bath and North East Somerset, Swindon and ...	E54000040	Gloucester, BSW, BNSSG and Somerset
2	Betsi Cadwaladr University Health Board	W11000023	LHB	BC	Wales	NaN	NaN	NaN
3	NHS Bedfordshire, Luton and Milton Keynes ICB ...	E38000249	SICBL	BD	England	NHS Bedfordshire, Luton and Milton Keynes Inte...	E54000024	East of England (South)
4	NHS Black Country ICB - D2P2L	E38000259	SICBL	BL	England	NHS Black Country Integrated Care Board	E54000062	North Midlands

Finally a file to link LSOA to MSOA:

path_to_msoa = os.path.join(paths.data, 'lsoa_2021', 'lsoa_to_msoa.csv')

# Index by the LSOA name column, and call the index 'LSOA' so that it
# matches df_lsoa:
df_msoa = pd.read_csv(path_to_msoa, index_col='lsoa11nm').rename_axis('LSOA')

df_msoa.head()

	lsoa11cd	msoa11cd	ladcd	msoa11nm	country
LSOA
City of London 001A	E01000001	E02000001	E09000001	City of London 001	E
City of London 001B	E01000002	E02000001	E09000001	City of London 001	E
City of London 001C	E01000003	E02000001	E09000001	City of London 001	E
City of London 001E	E01000005	E02000001	E09000001	City of London 001	E
Barking and Dagenham 016A	E01000006	E02000017	E09000002	Barking and Dagenham 016	E

## Link region data

Link MSOA codes to LSOA:

# Attach the MSOA code to each LSOA by matching rows on the shared
# LSOA index. A left join keeps every row of df_lsoa:
df_lsoa = df_lsoa.merge(
    df_msoa['msoa11cd'],
    how='left',
    left_index=True,
    right_index=True,
)

Make a new column for separate LHB (Wales only) and a new column for either LHB (Wales only) or ICB (England only).

# Rows for Welsh regions are identified by the region type 'LHB':
mask_wales = df_regions['region_type'] == 'LHB'

# 'lhb': the region name for LHB rows, missing for every other row.
df_regions['lhb'] = df_regions['region'].copy()
df_regions.loc[~mask_wales, 'lhb'] = pd.NA

# 'icb_lhb': the ICB name, except in LHB rows where the region name
# (the LHB itself) is used instead.
df_regions['icb_lhb'] = df_regions['icb'].copy()
df_regions.loc[mask_wales, 'icb_lhb'] = df_regions.loc[mask_wales, 'region']

Combine the two bonus files:

# These columns already exist in df_lsoa_regions or df_lsoa, so remove
# them from df_regions before merging:
df_regions = df_regions.drop(columns=['region', 'region_type', 'country'])

# Match rows on the shared region code. The LSOA index is moved into a
# column for the merge and restored afterwards, because merging on a
# column does not keep the original index:
df_lsoa_regions = (
    df_lsoa_regions
    .reset_index()
    .merge(df_regions, on='region_code', how='left')
    .set_index('LSOA')
)

Combine bonus files with collated data:

# Add the extra region columns to the collated data by matching rows
# on the shared LSOA index. A left join keeps every row of df_lsoa:
df_lsoa = df_lsoa.merge(
    df_lsoa_regions,
    how='left',
    left_index=True,
    right_index=True,
)

Combine collated data with geometry:

# Match the LSOA name column of the shapes to the LSOA index of the
# region data. A right join keeps every row of df_lsoa:
gdf_lsoa = pd.merge(
    gdf_lsoa, df_lsoa,
    how='right', left_on='LSOA11NM', right_index=True,
)

# Replace the index left over from the merge with a plain 0..n-1 count:
gdf_lsoa = gdf_lsoa.reset_index(drop=True)

gdf_lsoa.head().T

	0	1	2	3	4
LSOA11NM	Adur 001A	Adur 001B	Adur 001C	Adur 001D	Adur 001E
LSOA11CD	E01031349	E01031350	E01031351	E01031352	E01031370
geometry	POLYGON ((-0.2318 50.8398, -0.2319 50.8399, -0...	POLYGON ((-0.2259 50.8429, -0.2302 50.8476, -0...	POLYGON ((-0.2377 50.8407, -0.2337 50.8438, -0...	POLYGON ((-0.2319 50.8399, -0.2318 50.8398, -0...	POLYGON ((-0.2484 50.8416, -0.2554 50.8418, -0...
closest_ivt_unit	BN25BE	BN25BE	BN112DH	BN112DH	BN112DH
closest_mt_unit	BN25BE	BN25BE	BN25BE	BN25BE	BN25BE
closest_mt_transfer	BN25BE	BN25BE	BN25BE	BN25BE	BN25BE
la_district_name_2019	Adur	Adur	Adur	Adur	Adur
rural_urban_2011	Urban city and town	Urban city and town	Urban city and town	Urban city and town	Urban city and town
ambulance_service	South East Coast	South East Coast	South East Coast	South East Coast	South East Coast
local_authority_district_22	Adur	Adur	Adur	Adur	Adur
LAD22NM	Adur	Adur	Adur	Adur	Adur
country	England	England	England	England	England
msoa11cd	E02006534	E02006534	E02006534	E02006534	E02006534
lsoa_code	E01031349	E01031350	E01031351	E01031352	E01031370
region	NHS Sussex ICB - 70F	NHS Sussex ICB - 70F	NHS Sussex ICB - 70F	NHS Sussex ICB - 70F	NHS Sussex ICB - 70F
region_code	E38000248	E38000248	E38000248	E38000248	E38000248
region_type	SICBL	SICBL	SICBL	SICBL	SICBL
short_code	SX2	SX2	SX2	SX2	SX2
icb	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board
icb_code	E54000053	E54000053	E54000053	E54000053	E54000053
isdn	Sussex	Sussex	Sussex	Sussex	Sussex
lhb	<NA>	<NA>	<NA>	<NA>	<NA>
icb_lhb	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board	NHS Sussex Integrated Care Board

# Check that the merged table is still a GeoDataFrame:
type(gdf_lsoa)

geopandas.geodataframe.GeoDataFrame

# Drop the geometry and save a copy of the LSOA-to-region table.
# The path is built from `paths.data`, like the other data files, so
# the output follows the data directory if that setting changes:
gdf_lsoa.drop('geometry', axis='columns').to_csv(
    os.path.join(paths.data, 'lsoa_2021', 'lsoa_regions.csv'))

Make a fake column for merging all LSOAs into one big shape:

# Every LSOA gets the same value, so grouping by this column puts all
# of the LSOAs into a single group:
gdf_lsoa['england_wales'] = 0

## Merge region shapes

For each column of region data, merge the shapes of all LSOAs that belong to each region.

Save a copy of the merged shapes.

def merge_lsoas_into_region(gdf, col):
    '''Merge LSOA shapes into one shape per value of a region column.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        LSOA shapes. Must contain a 'geometry' column and the
        column named by `col`. It is not modified.
    col : str
        Name of the column whose values define the regions.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per region, holding the region value in column `col`
        and the merged shape in 'geometry'.

    Notes
    -----
    The dissolve uses the geopandas defaults. NOTE(review): in recent
    geopandas versions this means rows with a missing value in `col`
    are left out of the result - confirm for the installed version.
    '''
    # Selecting the two columns already gives a new object, so the
    # caller's table is untouched without copying every column first:
    gdf = gdf[['geometry', col]]
    # Dissolve by value, then move the region values out of the index
    # and back into an ordinary column:
    return gdf.dissolve(by=col).reset_index()

# List the columns that are available for merging:
gdf_lsoa.columns

Index(['LSOA11NM', 'LSOA11CD', 'geometry', 'closest_ivt_unit',
       'closest_mt_unit', 'closest_mt_transfer', 'la_district_name_2019',
       'rural_urban_2011', 'ambulance_service', 'local_authority_district_22',
       'LAD22NM', 'country', 'msoa11cd', 'lsoa_code', 'region', 'region_code',
       'region_type', 'short_code', 'icb', 'icb_code', 'isdn', 'lhb',
       'icb_lhb', 'england_wales'],
      dtype='object')

# Region columns to turn into outline files, one file per column.
# 'england_wales' holds the same value for every LSOA, so it gives a
# single outline around all of the LSOAs:
cols_to_merge = [
    'closest_ivt_unit',
    'closest_mt_unit',
    'closest_mt_transfer',
    'rural_urban_2011',
    'ambulance_service',
    'LAD22NM',
    'country',
    'region_code',
    'icb_code',
    'isdn',
    'lhb',
    'icb_lhb',
    'msoa11cd',
    'england_wales'
]

# Every outline file is written to the shape files directory:
dir_outlines = os.path.join(paths.data, paths.shapefiles)

for col in cols_to_merge:
    # One GeoJSON file per region column, named after the column:
    path_to_outline = os.path.join(dir_outlines, f'outline_{col}.geojson')
    gdf_merged = merge_lsoas_into_region(gdf_lsoa, col)
    gdf_merged.to_file(path_to_outline)