# vulnerability/code/05_resample_and_join.py

"""
Convert all metrics to 2021 LSOAs and join into one file.

MSOA data is simply applied to all the LSOAs that it contains (we are always working with ratios by
this point, not absolute totals).
"""

import pandas as pd

import config


# Build the MSOA code -> LSOA code/name lookup, restricted to our area of interest
# (South Gloucestershire)
_oa_lookup_source = config.get_data_source('oa_lsoa_msoa_lookup')
_oa_lookup_path = config.raw_data_dir / config.get_download_filename(_oa_lookup_source)
msoa_to_lsoa_lookup = pd.read_csv(
    _oa_lookup_path,
    usecols=['msoa21cd', 'lsoa21cd', 'lsoa21nm'],
    index_col='msoa21cd',
    dtype='string',
    encoding='latin_1',
)
# Keep only the rows whose LSOA name starts with 'South Glouc', then collapse repeated
# (LSOA code, LSOA name) rows so that each pair is listed once (drop_duplicates compares the
# columns only, not the MSOA index)
row_subset = msoa_to_lsoa_lookup['lsoa21nm'].str.startswith('South Glouc')
msoa_to_lsoa_lookup = msoa_to_lsoa_lookup.loc[row_subset].drop_duplicates()

def upsample_msoa_to_lsoa(
    df: pd.DataFrame,
    lookup: 'pd.DataFrame | None' = None,
) -> pd.DataFrame:
    """
    Take a DataFrame containing MSOA-level data and convert it to LSOA-level data by duplicating
    every MSOA row for each of the LSOAs that it contains, replacing the MSOA code and name with the
    LSOA code and name.

    Parameters:
        df: MSOA-level data with 'msoa21cd' and 'msoa21nm' columns. It is not modified.
        lookup: Optional table indexed by MSOA code whose columns (normally 'lsoa21cd' and
            'lsoa21nm') are attached to every matching row. When omitted, the module-level
            `msoa_to_lsoa_lookup` is used.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, holding the remaining columns of `df` followed
        by the columns of the lookup.

    Raises:
        KeyError: If `df` has no 'msoa21cd' or 'msoa21nm' column.

    The join is a left join, so an MSOA that has no entry in the lookup still produces one row, with
    missing values in the lookup's columns.
    """
    if lookup is None:
        lookup = msoa_to_lsoa_lookup

    return (
        df.set_index('msoa21cd')
        .drop(columns='msoa21nm')
        # One output row per (MSOA row, contained LSOA) pair, in the order of `df`
        .join(lookup)
        # The MSOA code has served its purpose as the join key, so discard it
        .reset_index(drop=True)
    )

# Build the 2011 -> 2021 LSOA code lookup for our area of interest (South Gloucestershire). Only
# unchanged and split LSOAs are handled this time, as those are the only kinds of change that
# occurred within the South Gloucestershire area
_lsoa_change_source = config.get_data_source('lsoa_2011_to_2021_lookup')
_lsoa_change_path = config.raw_data_dir / config.get_download_filename(_lsoa_change_source)
lsoa_2011_to_2021_lookup = pd.read_csv(
    _lsoa_change_path,
    usecols=['F_LSOA11CD', 'LSOA21CD', 'LSOA21NM', 'CHGIND'],
    index_col='F_LSOA11CD',
    dtype='string',
)
row_subset = (
    lsoa_2011_to_2021_lookup['LSOA21NM'].str.startswith('South Glouc')
    & lsoa_2011_to_2021_lookup['CHGIND'].isin(['U', 'S'])  # U = unchanged, S = split
)
# Lower-case the column names so that they match the naming used by the indicator tables
lsoa_2011_to_2021_lookup = lsoa_2011_to_2021_lookup.loc[row_subset].rename(columns=str.lower)

def convert_lsoa_from_2011_to_2021(
    df: pd.DataFrame,
    lookup: 'pd.DataFrame | None' = None,
) -> pd.DataFrame:
    """
    Take a DataFrame containing 2011 LSOA-level data and convert it to 2021 LSOA-level data by
    duplicating every split LSOA row for each new LSOA.

    Parameters:
        df: 2011 LSOA-level data with 'lsoa11cd' and 'lsoa11nm' columns. It is not modified.
        lookup: Optional table indexed by 2011 LSOA code with 'lsoa21cd', 'lsoa21nm' and 'chgind'
            columns. When omitted, the module-level `lsoa_2011_to_2021_lookup` is used.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, holding the remaining columns of `df` followed
        by 'lsoa21cd' and 'lsoa21nm'.

    Raises:
        KeyError: If `df` has no 'lsoa11cd' or 'lsoa11nm' column, or the lookup has no 'chgind'
            column.

    The join is a left join, so a 2011 LSOA that has no entry in the lookup still produces one row,
    with missing values for the 2021 code and name.
    """
    if lookup is None:
        lookup = lsoa_2011_to_2021_lookup

    return (
        df.set_index('lsoa11cd')
        # One output row per (2011 row, matching 2021 LSOA) pair, in the order of `df`
        .join(lookup)
        # The 2011 name and the change indicator are not wanted in the 2021-level output
        .drop(columns=['lsoa11nm', 'chgind'])
        # The 2011 code has served its purpose as the join key, so discard it
        .reset_index(drop=True)
    )


if __name__ == '__main__':
    # Make sure the output directory is in place before anything is written to it
    config.resampled_data_dir.mkdir(parents=True, exist_ok=True)

    # Load the indicator tables, one per source geography
    indicators_dir = config.indicators_data_dir
    lsoa_indicators = pd.read_csv(indicators_dir / 'lsoa_indicators.csv')
    msoa_indicators = pd.read_csv(indicators_dir / 'msoa_indicators.csv')
    lsoa_2011_indicators = pd.read_csv(indicators_dir / 'lsoa_2011_indicators.csv')

    # Bring the MSOA and 2011 LSOA tables onto the 2021 LSOA geography
    msoa_indicators = upsample_msoa_to_lsoa(msoa_indicators)
    lsoa_2011_indicators = convert_lsoa_from_2011_to_2021(lsoa_2011_indicators)

    # Every table is now keyed by 2021 LSOA code and name, so line them up side by side on that key
    # and write the combined table out
    lsoa_key = ['lsoa21cd', 'lsoa21nm']
    to_combine = [
        table.set_index(lsoa_key)
        for table in (lsoa_indicators, msoa_indicators, lsoa_2011_indicators)
    ]
    data = pd.concat(to_combine, axis=1).reset_index()

    data.to_csv(config.resampled_data_dir / 'indicators.csv', index=False)