{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Create region-averaged data\n", "\n", "> This is a copy of notebook 3 (\"Create maps\") with the mapping elements removed.\n", "\n", "Average the data for all LSOAs in a given region to find new values for e.g. population weighted mean IMD score per IVT catchment, or mean transfer time by ambulance trust.\n", "\n", "Most of this notebook involves:\n", "+ Load the data for all LSOAs\n", "+ Pick out only the LSOAs in a given region\n", "+ Average or sum the data for those LSOAs\n", "\n", "Notes:\n", "* Change from Notebook 3: ethnicity now looks at \"white British\" vs. \"other than white British\" groups rather than \"white\" vs \"other than white\"." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1 Set up" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.1 Import libraries and define file paths" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from dataclasses import dataclass\n", "import geopandas as gpd\n", "import numpy as np\n", "import os\n", "import pandas as pd\n", "from pandas.api.types import is_numeric_dtype" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Define file paths\n", "@dataclass(frozen=True)\n", "class Paths:\n", " '''Singleton object for storing paths to data and database.'''\n", "\n", " data = './data'\n", " collated = 'collated_data_amb.csv'\n", "\n", " hospitals = 'hospitals'\n", " stroke_hospitals = 'stroke_hospitals_2022.csv'\n", "\n", " shapefiles = 'shapefiles'\n", " lsoa_shp = ('Lower_layer_super_output_areas_(E+W)_2011_Boundaries_' +\n", " '(Generalised_Clipped)_V2.zip')\n", "\n", "\n", "paths = Paths()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### 1.2 Load data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**LSOA information** - load `collated_data.csv` produced by `01_combine_demographic_data.ipynb` which has information on each LSOA." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LSOAadmissionsclosest_ivt_unitclosest_ivt_unit_timeclosest_mt_unitclosest_mt_unit_timeclosest_mt_transferclosest_mt_transfer_timetotal_mt_timeivt_rate...lsoa_coderegionregion_coderegion_typeshort_codeicbicb_codeisdnlhbicb_lhb
0Welwyn Hatfield 010F0.666667SG14AB18.7NW12BU36.9CB20QQ39.157.86.8...E01033311NHS Hertfordshire and West Essex ICB - 06KE38000049SICBLHE1NHS Hertfordshire and West Essex Integrated Ca...E54000025East of England (South)NaNNHS Hertfordshire and West Essex Integrated Ca...
1Welwyn Hatfield 012A4.000000SG14AB19.8NW12BU36.9CB20QQ39.158.96.8...E01023920NHS Hertfordshire and West Essex ICB - 06KE38000049SICBLHE1NHS Hertfordshire and West Essex Integrated Ca...E54000025East of England (South)NaNNHS Hertfordshire and West Essex Integrated Ca...
\n", "

2 rows × 128 columns

\n", "
" ], "text/plain": [ " LSOA admissions closest_ivt_unit closest_ivt_unit_time \\\n", "0 Welwyn Hatfield 010F 0.666667 SG14AB 18.7 \n", "1 Welwyn Hatfield 012A 4.000000 SG14AB 19.8 \n", "\n", " closest_mt_unit closest_mt_unit_time closest_mt_transfer \\\n", "0 NW12BU 36.9 CB20QQ \n", "1 NW12BU 36.9 CB20QQ \n", "\n", " closest_mt_transfer_time total_mt_time ivt_rate ... lsoa_code \\\n", "0 39.1 57.8 6.8 ... E01033311 \n", "1 39.1 58.9 6.8 ... E01023920 \n", "\n", " region region_code region_type \\\n", "0 NHS Hertfordshire and West Essex ICB - 06K E38000049 SICBL \n", "1 NHS Hertfordshire and West Essex ICB - 06K E38000049 SICBL \n", "\n", " short_code icb icb_code \\\n", "0 HE1 NHS Hertfordshire and West Essex Integrated Ca... E54000025 \n", "1 HE1 NHS Hertfordshire and West Essex Integrated Ca... E54000025 \n", "\n", " isdn lhb \\\n", "0 East of England (South) NaN \n", "1 East of England (South) NaN \n", "\n", " icb_lhb \n", "0 NHS Hertfordshire and West Essex Integrated Ca... \n", "1 NHS Hertfordshire and West Essex Integrated Ca... \n", "\n", "[2 rows x 128 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_lsoa = pd.read_csv(os.path.join(paths.data, paths.collated))\n", "\n", "# Load in extra region information:\n", "df_lsoa_regions = pd.read_csv('data/lsoa_2021/lsoa_regions.csv', index_col=0)\n", "# Remove any repeated columns:\n", "cols_to_keep = [c for c in df_lsoa_regions if c not in df_lsoa.columns]\n", "# Merge with region information:\n", "df_lsoa = pd.merge(df_lsoa, df_lsoa_regions[cols_to_keep],\n", " left_on='LSOA', right_on='LSOA11NM', how='left')\n", "\n", "df_lsoa.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check the contents of this dataframe:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LSOA\n", "admissions\n", "closest_ivt_unit\n", "closest_ivt_unit_time\n", "closest_mt_unit\n", "closest_mt_unit_time\n", "closest_mt_transfer\n", "closest_mt_transfer_time\n", "total_mt_time\n", "ivt_rate\n", "imd_2019_score\n", "la_district_name_2019\n", "income_domain_score\n", "income_domain_rank\n", "idaci_score\n", "idaci_rank\n", "idaopi_score\n", "idaopi_rank\n", "ethnic_group_all_categories_ethnic_group\n", "ethnic_group_white_total\n", "ethnic_group_white_english_welsh_scottish_northern_irish_british\n", "ethnic_group_white_irish\n", "ethnic_group_white_gypsy_or_irish_traveller\n", "ethnic_group_white_other_white\n", "ethnic_group_mixed_multiple_ethnic_group_total\n", "ethnic_group_mixed_multiple_ethnic_group_white_and_black_caribbean\n", "ethnic_group_mixed_multiple_ethnic_group_white_and_black_african\n", "ethnic_group_mixed_multiple_ethnic_group_white_and_asian\n", "ethnic_group_mixed_multiple_ethnic_group_other_mixed\n", "ethnic_group_asian_asian_british_total\n", "ethnic_group_asian_asian_british_indian\n", "ethnic_group_asian_asian_british_pakistani\n", "ethnic_group_asian_asian_british_bangladeshi\n", "ethnic_group_asian_asian_british_chinese\n", "ethnic_group_asian_asian_british_other_asian\n", "ethnic_group_black_african_caribbean_black_british_total\n", "ethnic_group_black_african_caribbean_black_british_african\n", "ethnic_group_black_african_caribbean_black_british_caribbean\n", "ethnic_group_black_african_caribbean_black_british_other_black\n", "ethnic_group_other_ethnic_group_total\n", "ethnic_group_other_ethnic_group_arab\n", "ethnic_group_other_ethnic_group_any_other_ethnic_group\n", "all_categories_general_health\n", "very_good_or_good_health\n", "fair_health\n", "bad_or_very_bad_health\n", "all_categories_long_term_health_problem_or_disability\n", "day_to_day_activities_limited_a_lot\n", "day_to_day_activities_limited_a_little\n", "day_to_day_activities_not_limited\n", "rural_urban_2011\n", "population_all\n", "age_band_all_0\n", "age_band_all_5\n", "age_band_all_10\n", "age_band_all_15\n", "age_band_all_20\n", "age_band_all_25\n", "age_band_all_30\n", "age_band_all_35\n", "age_band_all_40\n", "age_band_all_45\n", "age_band_all_50\n", "age_band_all_55\n", "age_band_all_60\n", "age_band_all_65\n", "age_band_all_70\n", "age_band_all_75\n", "age_band_all_80\n", "age_band_all_85\n", "age_band_all_90\n", "population_females\n", "age_band_females_0\n", "age_band_females_5\n", "age_band_females_10\n", "age_band_females_15\n", "age_band_females_20\n", "age_band_females_25\n", "age_band_females_30\n", "age_band_females_35\n", "age_band_females_40\n", "age_band_females_45\n", "age_band_females_50\n", "age_band_females_55\n", "age_band_females_60\n", "age_band_females_65\n", "age_band_females_70\n", "age_band_females_75\n", "age_band_females_80\n", "age_band_females_85\n", "age_band_females_90\n", "population_males\n", "age_band_males_0\n", "age_band_males_5\n", "age_band_males_10\n", "age_band_males_15\n", "age_band_males_20\n", "age_band_males_25\n", "age_band_males_30\n", "age_band_males_35\n", "age_band_males_40\n", "age_band_males_45\n", "age_band_males_50\n", "age_band_males_55\n", "age_band_males_60\n", "age_band_males_65\n", "age_band_males_70\n", "age_band_males_75\n", "age_band_males_80\n", "age_band_males_85\n", "age_band_males_90\n", "ambulance_service\n", "local_authority_district_22\n", "LAD22NM\n", "country\n", "LSOA11NM\n", "LSOA11CD\n", "msoa11cd\n", "lsoa_code\n", "region\n", "region_code\n", "region_type\n", "short_code\n", "icb\n", "icb_code\n", "isdn\n", "lhb\n", "icb_lhb\n" ] } ], "source": [ "for c in df_lsoa.columns:\n", " print(c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Hospital data** - load data on each hospital, with aim of getting locations of hospitals that provide IVT." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PostcodeHospital_nameUse_IVTUse_MTUse_MSUCountryStrategic Clinical NetworkHealth Board / TrustStroke TeamSSNAP name...ivt_rateEastingNorthinglonglatNeuroscience30 England Thrombectomy Examplehospital_cityNotesgeometry
0RM70AGRM70AG111EnglandLondon SCNBarkingHavering and Redbridge University Hospitals N...Queens Hospital Romford HASU...11.95511181877800.17903064066193451.568646552150410RomfordPOINT (19932.617 6722504.230)
1E11BBE11BB111EnglandLondon SCNBarts Health NHS TrustThe Royal London HospitalRoyal London Hospital HASU...13.4534829181798-0.058132991604737251.519017836129511Royal LondonPOINT (-6468.377 6713620.832)
\n", "

2 rows × 22 columns

\n", "
" ], "text/plain": [ " Postcode Hospital_name Use_IVT Use_MT Use_MSU Country \\\n", "0 RM70AG RM70AG 1 1 1 England \n", "1 E11BB E11BB 1 1 1 England \n", "\n", " Strategic Clinical Network Health Board / Trust \\\n", "0 London SCN Barking \n", "1 London SCN Barts Health NHS Trust \n", "\n", " Stroke Team \\\n", "0 Havering and Redbridge University Hospitals N... \n", "1 The Royal London Hospital \n", "\n", " SSNAP name ... ivt_rate Easting Northing \\\n", "0 Queens Hospital Romford HASU ... 11.9 551118 187780 \n", "1 Royal London Hospital HASU ... 13.4 534829 181798 \n", "\n", " long lat Neuroscience \\\n", "0 0.179030640661934 51.5686465521504 1 \n", "1 -0.0581329916047372 51.5190178361295 1 \n", "\n", " 30 England Thrombectomy Example hospital_city Notes \\\n", "0 0 Romford \n", "1 1 Royal London \n", "\n", " geometry \n", "0 POINT (19932.617 6722504.230) \n", "1 POINT (-6468.377 6713620.832) \n", "\n", "[2 rows x 22 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read in data on each hospital\n", "gdf_units = gpd.read_file(\n", " os.path.join(paths.data, paths.hospitals, paths.stroke_hospitals))\n", "\n", "# Combine get geometry from Easting and Northing columns\n", "gdf_units[\"geometry\"] = gpd.points_from_xy(\n", " gdf_units.Easting, gdf_units.Northing)\n", "gdf_units = gdf_units.set_crs(epsg=27700)\n", "\n", "# Restrict to the units delivering thrombolysis\n", "mask = gdf_units['Use_IVT'] == '1'\n", "gdf_units = gdf_units[mask]\n", "\n", "# Convert crs to epsg 3857\n", "gdf_units = gdf_units.to_crs(epsg=3857)\n", "\n", "# Preview dataframe\n", "gdf_units.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "__Area data__ - Import geojson of all LSOA to calculate area of each region.\n", "\n", "(use the proper Office for National Statistics file, not the simplified one):" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LSOA11CDLSOALSOA11NMWgeometrypolygon_area_km2closest_ivt_unitclosest_mt_unitclosest_mt_transferla_district_name_2019rural_urban_2011...lsoa_coderegionregion_coderegion_typeshort_codeicbicb_codeisdnlhbicb_lhb
0E01000001City of London 001ACity of London 001APOLYGON ((532105.092 182011.230, 532162.491 18...0.343907E11BBE11BBE11BBCity of LondonUrban major conurbation...E01000001NHS North East London ICB - A3A8RE38000255SICBLNELNHS North East London Integrated Care BoardE54000029LondonNaNNHS North East London Integrated Care Board
1E01000002City of London 001BCity of London 001BPOLYGON ((532746.813 181786.891, 532671.688 18...0.583474E11BBE11BBE11BBCity of LondonUrban major conurbation...E01000002NHS North East London ICB - A3A8RE38000255SICBLNELNHS North East London Integrated Care BoardE54000029LondonNaNNHS North East London Integrated Care Board
2E01000003City of London 001CCity of London 001CPOLYGON ((532135.145 182198.119, 532158.250 18...0.147840E11BBE11BBE11BBCity of LondonUrban major conurbation...E01000003NHS North East London ICB - A3A8RE38000255SICBLNELNHS North East London Integrated Care BoardE54000029LondonNaNNHS North East London Integrated Care Board
3E01000005City of London 001ECity of London 001EPOLYGON ((533807.946 180767.770, 533649.063 18...0.491918E11BBE11BBE11BBCity of LondonUrban major conurbation...E01000005NHS North East London ICB - A3A8RE38000255SICBLNELNHS North East London Integrated Care BoardE54000029LondonNaNNHS North East London Integrated Care Board
4E01000006Barking and Dagenham 016ABarking and Dagenham 016APOLYGON ((545122.049 184314.931, 545271.917 18...0.372257RM70AGRM70AGRM70AGBarking and DagenhamUrban major conurbation...E01000006NHS North East London ICB - A3A8RE38000255SICBLNELNHS North East London Integrated Care BoardE54000029LondonNaNNHS North East London Integrated Care Board
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " LSOA11CD LSOA LSOA11NMW \\\n", "0 E01000001 City of London 001A City of London 001A \n", "1 E01000002 City of London 001B City of London 001B \n", "2 E01000003 City of London 001C City of London 001C \n", "3 E01000005 City of London 001E City of London 001E \n", "4 E01000006 Barking and Dagenham 016A Barking and Dagenham 016A \n", "\n", " geometry polygon_area_km2 \\\n", "0 POLYGON ((532105.092 182011.230, 532162.491 18... 0.343907 \n", "1 POLYGON ((532746.813 181786.891, 532671.688 18... 0.583474 \n", "2 POLYGON ((532135.145 182198.119, 532158.250 18... 0.147840 \n", "3 POLYGON ((533807.946 180767.770, 533649.063 18... 0.491918 \n", "4 POLYGON ((545122.049 184314.931, 545271.917 18... 0.372257 \n", "\n", " closest_ivt_unit closest_mt_unit closest_mt_transfer la_district_name_2019 \\\n", "0 E11BB E11BB E11BB City of London \n", "1 E11BB E11BB E11BB City of London \n", "2 E11BB E11BB E11BB City of London \n", "3 E11BB E11BB E11BB City of London \n", "4 RM70AG RM70AG RM70AG Barking and Dagenham \n", "\n", " rural_urban_2011 ... lsoa_code region \\\n", "0 Urban major conurbation ... E01000001 NHS North East London ICB - A3A8R \n", "1 Urban major conurbation ... E01000002 NHS North East London ICB - A3A8R \n", "2 Urban major conurbation ... E01000003 NHS North East London ICB - A3A8R \n", "3 Urban major conurbation ... E01000005 NHS North East London ICB - A3A8R \n", "4 Urban major conurbation ... E01000006 NHS North East London ICB - A3A8R \n", "\n", " region_code region_type short_code \\\n", "0 E38000255 SICBL NEL \n", "1 E38000255 SICBL NEL \n", "2 E38000255 SICBL NEL \n", "3 E38000255 SICBL NEL \n", "4 E38000255 SICBL NEL \n", "\n", " icb icb_code isdn lhb \\\n", "0 NHS North East London Integrated Care Board E54000029 London NaN \n", "1 NHS North East London Integrated Care Board E54000029 London NaN \n", "2 NHS North East London Integrated Care Board E54000029 London NaN \n", "3 NHS North East London Integrated Care Board E54000029 London NaN \n", "4 NHS North East London Integrated Care Board E54000029 London NaN \n", "\n", " icb_lhb \n", "0 NHS North East London Integrated Care Board \n", "1 NHS North East London Integrated Care Board \n", "2 NHS North East London Integrated Care Board \n", "3 NHS North East London Integrated Care Board \n", "4 NHS North East London Integrated Care Board \n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gdf_lsoa = gpd.read_file(os.path.join(\n", " paths.data, paths.shapefiles, paths.lsoa_shp),\n", " crs='EPSG:27700')\n", "\n", "# Change the projection to a Cartesian system (EPSG:3857) and get area in\n", "# square kilometers (through /10**6)\n", "gdf_lsoa['polygon_area_km2'] = (gdf_lsoa.to_crs('EPSG:3857').area / 10**6)\n", "\n", "# Merge with region information:\n", "df_lsoa_regions = pd.read_csv('data/lsoa_2021/lsoa_regions.csv', index_col=0)\n", "\n", "gdf_lsoa = pd.merge(gdf_lsoa, df_lsoa_regions.drop('LSOA11NM', axis='columns'),\n", " on='LSOA11CD', how='left')\n", "\n", "# Rename LSOA column to match df_lsoa:\n", "gdf_lsoa = gdf_lsoa.rename(columns={'LSOA11NM': 'LSOA'})\n", "\n", "gdf_lsoa.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1.3 Define functions for calculating weighted mean and proportions\n", "\n", "Group data by specified geography (e.g. closest IVT unit) and calculate the weighted average of specified columns - [see stackoverflow tutorial here](https://stackoverflow.com/questions/31521027/groupby-weighted-average-and-sum-in-pandas-dataframe)." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def weighted_av(df, group_by, col, new_col, weight='population_all'):\n", " '''\n", " Groupby specified column and then find weighted average of another column\n", " Returns normal mean and weighted mean\n", " Inputs:\n", " - df - dataframe, contains data and columns to do calculation\n", " - group_by - string, column to group by\n", " - col - string, column to find average of\n", " - new_col - string, name of column which will contain weighted mean\n", " - weight - string, name of column to weight by, default 'population_all'\n", " '''\n", " res = (df\n", " .groupby(group_by)\n", " .apply(lambda x: pd.Series([np.mean(x[col]),\n", " np.average(x[col], weights=x[weight])],\n", " index=['mean', new_col])))\n", " return (res)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Group data by specific geography (e.g. ambulance trust) and calculate proportion." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def find_proportion(subgroup_col, overall_col, proportion_col, group_col, df):\n", " '''\n", " Groupby a particular geography, and find a proportion from two provided\n", " columns\n", " Inputs:\n", " subgroup_col - string, numerator, has population counts for a subgroup\n", " overall_col - string, denominator, has population count overall\n", " proportion_col - string, name for the column produced\n", " group_col - string, column to groupby when calculate proportions\n", " df - dataframe, with columns used above\n", " '''\n", " # Sum the columns for each group\n", " res = df.groupby(group_col).agg({\n", " subgroup_col: 'sum',\n", " overall_col: 'sum'\n", " })\n", "\n", " # Find the proportions for each group\n", " res[proportion_col] = res[subgroup_col] / res[overall_col]\n", "\n", " return res" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find counts of people in each category, then calculate proportion." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def find_count_and_proportion(df_lsoa, area, col, count, prop):\n", " '''\n", " Finds count of people in each category for each area (based on LSOA info\n", " and counts), and then calculates proportion in particular category.\n", " Requires col to contain 'True' 'False' where 'True' is the proportion you\n", " want to find (so calculates 'True' / 'True'+'False')\n", " Inputs:\n", " - df_lsoa - dataframe with information on each LSOA\n", " - area - string, column to group areas by\n", " - col - string, column with data of interest\n", " - count - string, column with population count\n", " - prop - string, name of new column with proportion\n", " '''\n", " # Find count for each area of people in each category\n", " counts = (df_lsoa.groupby([area, col])[count].sum())\n", "\n", " # Reformat dataframe\n", " counts = counts.reset_index(name='n')\n", " counts = counts.pivot(\n", " index=area,\n", " columns=col,\n", " values='n').reset_index().rename_axis(None, axis=1)\n", "\n", " # Set NaN to 0\n", " counts = counts.replace(np.nan, 0)\n", "\n", " # Find proportion\n", " counts[prop] = counts['True'] / (counts['True'] + counts['False'])\n", "\n", " # Set index to the area unit\n", " counts = counts.set_index(area)\n", "\n", " # Rename 'True' and 'False' columns to make their meanings clearer\n", " # when the data is merged into another dataframe:\n", " s = '_'.join(prop.split('_')[1:])\n", " counts = counts.rename(columns={'True': f'{s}_True', 'False': f'{s}_False'})\n", "\n", " return (counts)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Calculate new LSOA-level data\n", "\n", "Must create the following columns in df_lsoa:\n", "+ `age_65_plus_count`\n", "+ `ethnic_group_other_than_white_british`\n", "+ `long_term_health_count`\n", "+ `rural`\n", "+ `ivt_within_30`\n", "+ `closest_hospital_is_mt`" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Add count of \"other than white British\"\n", "df_lsoa['ethnic_group_other_than_white_british'] = (\n", " df_lsoa['ethnic_group_all_categories_ethnic_group'] -\n", " df_lsoa['ethnic_group_white_english_welsh_scottish_northern_irish_british']\n", ")\n", "\n", "# Add column with count of long-term health problem (limited a little or alot)\n", "df_lsoa['long_term_health_count'] = (\n", " df_lsoa['all_categories_long_term_health_problem_or_disability'] -\n", " df_lsoa['day_to_day_activities_not_limited']\n", ")\n", "\n", "# Extract if is rural or if is urban (each classification starts with that)\n", "df_lsoa['ruc_overall'] = df_lsoa['rural_urban_2011'].str[:5]\n", "# Add column where True if rural\n", "df_lsoa['rural'] = (df_lsoa['ruc_overall'] == 'Rural').map({\n", " True: 'True', False: 'False'})\n", "\n", "# Find count of people age 65 plus\n", "df_lsoa['age_65_plus_count'] = df_lsoa[[\n", " 'age_band_all_65', 'age_band_all_70', 'age_band_all_75',\n", " 'age_band_all_80', 'age_band_all_85', 'age_band_all_90']].sum(axis=1)\n", "\n", "# Create column indicating if IVT unit is within 30 minutes\n", "df_lsoa['ivt_within_30'] = (df_lsoa['closest_ivt_unit_time'] < 30).map(\n", " {True: 'True', False: 'False'})\n", "\n", "# Add column with whether closest hospital offers thrombectomy\n", "df_lsoa['closest_hospital_is_mt'] = (\n", " df_lsoa['closest_mt_transfer_time'] == 0).map({True: 'True',\n", " False: 'False'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Big function for region-level data creation\n", "\n", "Create one big function that does all of the calculations needed for averaging data across any given region." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def big_data_creation(df_lsoa, gdf_lsoa, group_by):\n", " df_lsoa = df_lsoa.copy() # just in case\n", " gdf_lsoa = gdf_lsoa.copy() # just in case\n", " # Find all of the possible options for this region type:\n", " index_values = list(set(df_lsoa[group_by]))\n", " try:\n", " index_values = sorted(index_values)\n", " except TypeError:\n", " pass\n", " \n", " # Empty dataframe to place the results into:\n", " df_results = pd.DataFrame(index=index_values)\n", "\n", " # Find population of each region:\n", " pop = df_lsoa[[group_by, 'population_all']].groupby(group_by)['population_all'].sum()\n", " # Find area of each region:\n", " gdf_lsoa = gdf_lsoa[[group_by, 'polygon_area_km2']].groupby(group_by).sum()\n", " gdf_lsoa = pd.merge(gdf_lsoa, pop, left_index=True, right_index=True)\n", " # Divide population by area to get population density\n", " gdf_lsoa['population_density'] = (\n", " gdf_lsoa['population_all'] / gdf_lsoa['polygon_area_km2'])\n", " \n", " df_results = pd.merge(df_results, gdf_lsoa.drop('population_all', axis='columns'),\n", " left_index=True, right_index=True, how='left')\n", " \n", " # --- Weighted averages ---\n", " # Pre-requisites:\n", " # Must have already calculated the following columns in df_lsoa:\n", " # 'age_65_plus_count'\n", " \n", " cols_for_weighted_av = [\n", " # Population-weighted mean income domain score:\n", " dict(col='income_domain_score',\n", " new_col='income_domain_weighted_mean'),\n", " # Population-weighted mean IMD score:\n", " dict(col='imd_2019_score', new_col='imd_weighted_mean'),\n", " # Average travel time to IVT unit weighted by number of\n", " # over 65 year olds in catchment:\n", " dict(col='closest_ivt_unit_time', new_col='weighted_ivt_time',\n", " weight='age_65_plus_count'),\n", " # Population-weighted mean time to MT unit:\n", " dict(col='total_mt_time', new_col='mt_time_weighted_mean'),\n", " # Population-weighted mean time to IVT unit:\n", " dict(col='closest_ivt_unit_time',\n", " new_col='ivt_time_weighted_mean'),\n", " # Population-weighted mean transfer time between IVT and MT unit:\n", " dict(col='closest_mt_transfer_time',\n", " new_col='mt_transfer_time_weighted_mean'), \n", " ]\n", " \n", " for kwargs in cols_for_weighted_av:\n", " # Calculate results.\n", " # 'weighted_av()' function returns a df with index named group_by and\n", " # columns named 'mean' and new_col.\n", " df_w = weighted_av(\n", " df_lsoa,\n", " group_by=group_by,\n", " **kwargs\n", " )\n", " # Store in big results df.\n", " # Discard 'mean', keep new_col.\n", " df_results = pd.merge(df_results, df_w[kwargs['new_col']],\n", " left_index=True, right_index=True, how='left')\n", " \n", " # --- Find proportions ---\n", " # Pre-requisites:\n", " # Must have already calculated the following columns in df_lsoa:\n", " # 'ethnic_group_other_than_white_british'\n", " # 'long_term_health_count'\n", " # 'age_65_plus_count'\n", " cols_for_proportion = [\n", " # Bad or very bad general health\n", " dict(subgroup_col='ethnic_group_other_than_white_british',\n", " overall_col='ethnic_group_all_categories_ethnic_group',\n", " proportion_col='ethnic_minority_proportion',\n", " df=df_lsoa),\n", " # General health - proportion with bad/very bad health\n", " dict(subgroup_col='bad_or_very_bad_health',\n", " overall_col='all_categories_general_health',\n", " proportion_col='bad_health_proportion',\n", " df=df_lsoa),\n", " # Long-term health problem or disability\n", " dict(subgroup_col='long_term_health_count',\n", " overall_col='all_categories_long_term_health_problem_or_disability',\n", " proportion_col='long_term_health_proportion',\n", " df=df_lsoa),\n", " # Age - proportion aged 65+\n", " dict(subgroup_col='age_65_plus_count',\n", " overall_col='population_all',\n", " proportion_col='age_65_plus_proportion',\n", " df=df_lsoa),\n", " ]\n", " # 'find_proportion()' function.\n", " # Notebook 3 stored the proportion_col column, discarded the rest.\n", " # Here, keep all resulting columns. First is number of True,\n", " # second is total number (True + False), third is proportion.\n", " for kwargs in cols_for_proportion:\n", " # Calculate results.\n", " df_p = find_proportion(group_col=group_by, **kwargs)\n", " \n", " # Store in big results df.\n", " df_results = pd.merge(df_results, df_p,\n", " left_index=True, right_index=True, how='left')\n", " \n", " # --- Find count and proportions ---\n", " # Pre-requisites:\n", " # Must have already calculated the following columns in df_lsoa:\n", " # 'rural'\n", " # 'ivt_within_30'\n", " # 'closest_hospital_is_mt'\n", " cols_for_count_and_proportion = [\n", " # Rural/urban - proportion of people living in rural areas\n", " dict(col='rural', count='population_all', prop='proportion_rural'),\n", " # Proportion of 65+ year olds living within 30 minutes of their closest IVT unit\n", " dict(col='ivt_within_30', count='age_65_plus_count',\n", " prop='proportion_over_65_within_30'),\n", " # Proportion of patients whose nearest hospital is an MT unit\n", " dict(col='closest_hospital_is_mt', count='population_all',\n", " prop='proportion_closest_is_mt'),\n", " ]\n", " # 'find_count_and_proportion()' function.\n", " for kwargs in cols_for_count_and_proportion:\n", " # Calculate results.\n", " df_c = find_count_and_proportion(df_lsoa=df_lsoa, area=group_by, **kwargs)\n", " \n", " # Store in big results df.\n", " df_results = pd.merge(df_results, df_c,\n", " left_index=True, right_index=True, how='left')\n", " return df_results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run the big data collection function for each different region type:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "region_types = [\n", " 'LSOA',\n", " 'closest_ivt_unit',\n", " 'closest_mt_unit',\n", " 'closest_mt_transfer',\n", " 'ambulance_service',\n", " # 'LAD22NM',\n", " 'icb_code',\n", " 'isdn',\n", " 'lhb',\n", " 'icb_lhb'\n", "]\n", "\n", "for region_type in region_types:\n", " df_results = big_data_creation(df_lsoa, gdf_lsoa, region_type)\n", "\n", " if region_type == 'closest_ivt_unit':\n", " # Do bonus calculations.\n", " # Extract rate for each unit, with unit as index\n", " ivt_rate = (df_lsoa[['closest_ivt_unit', 'ivt_rate']]\n", " .drop_duplicates()\n", " .set_index('closest_ivt_unit')['ivt_rate'])\n", " df_results = pd.merge(df_results, ivt_rate,\n", " left_index=True, right_index=True, how='left')\n", "\n", " # Get admissions to each unit from 2021/22\n", " admissions_2122 = (\n", " gdf_units[['Postcode', 'Admissions 21/22']]\n", " .set_index('Postcode')\n", " .rename(columns={'Admissions 21/22': 'admissions_2122'})\n", " .astype(int)\n", " )\n", " df_results = pd.merge(df_results, admissions_2122,\n", " left_index=True, right_index=True, how='left')\n", " else:\n", " # Don't do anything extra.\n", " pass\n", "\n", " # Make sure data is not overly-precise before saving:\n", " cols_to_round = df_results.select_dtypes(include=['float']).columns\n", " df_results[cols_to_round] = df_results[cols_to_round].round(4)\n", "\n", " # Set index name explicitly to make sure something gets saved:\n", " df_results.index.name = region_type\n", " \n", " # Save to file:\n", " df_results.to_csv(f'data/collated_data_by_region/collated_data_regional_{region_type}.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 4 }