This repository has been archived by the owner on May 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05_resample_and_join.py
86 lines (72 loc) · 3.4 KB
/
05_resample_and_join.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Convert all metrics to 2021 LSOAs and join into one file.
MSOA data is simply applied to all the LSOAs that it contains (by this point we are always working
with ratios, not absolute totals).
"""
import pandas as pd
import config
# MSOA code -> LSOA code/name lookup, restricted to our area of interest (South Gloucestershire).
# Everything is read as strings and the table is indexed by the 2021 MSOA code so that MSOA-level
# data can be joined onto it directly.
msoa_to_lsoa_lookup = pd.read_csv(
    config.raw_data_dir
    / config.get_download_filename(config.get_data_source('oa_lsoa_msoa_lookup')),
    usecols=['msoa21cd', 'lsoa21cd', 'lsoa21nm'],
    index_col='msoa21cd',
    dtype='string',
    encoding='latin_1',
)

# Keep only the LSOAs whose name marks them as South Gloucestershire.
row_subset = msoa_to_lsoa_lookup['lsoa21nm'].str.startswith('South Glouc')

# drop_duplicates() compares the remaining columns (LSOA code and name), leaving one row per LSOA.
# NOTE(review): the source file presumably has one row per output area, hence the repeats - confirm.
msoa_to_lsoa_lookup = msoa_to_lsoa_lookup.loc[row_subset].drop_duplicates()
def upsample_msoa_to_lsoa(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand MSOA-level data to LSOA level.

    Each MSOA row in `df` is repeated once for every LSOA listed against that MSOA in the
    module-level `msoa_to_lsoa_lookup`. The MSOA code and name are discarded and the LSOA code and
    name columns ('lsoa21cd', 'lsoa21nm') take their place.

    `df` must contain the columns 'msoa21cd' and 'msoa21nm'. The result has a fresh default index.
    """
    # Index by MSOA code so the join lines up with the lookup's index; the MSOA name is not needed.
    keyed_by_msoa = df.set_index('msoa21cd').drop(columns='msoa21nm')

    # Joining on the index yields one output row per (MSOA row, contained LSOA) pair.
    expanded = keyed_by_msoa.join(msoa_to_lsoa_lookup)

    # The MSOA code only lives in the index by now, so dropping the index removes it.
    return expanded.reset_index(drop=True)
# 2011 LSOA code -> 2021 LSOA code/name lookup, restricted to our area of interest (South
# Gloucestershire). Only unchanged and split LSOAs are handled here, since those are the only kinds
# of change that occurred within the South Gloucestershire area.
lsoa_2011_to_2021_lookup = pd.read_csv(
    config.raw_data_dir
    / config.get_download_filename(config.get_data_source('lsoa_2011_to_2021_lookup')),
    usecols=['F_LSOA11CD', 'LSOA21CD', 'LSOA21NM', 'CHGIND'],
    index_col='F_LSOA11CD',
    dtype='string',
)

# Keep South Gloucestershire rows whose change indicator is U (unchanged) or S (split).
row_subset = (
    lsoa_2011_to_2021_lookup['LSOA21NM'].str.startswith('South Glouc')
    & lsoa_2011_to_2021_lookup['CHGIND'].isin(['U', 'S'])
)

# Lower-case the column names so they match the naming used by the indicator files.
lsoa_2011_to_2021_lookup = lsoa_2011_to_2021_lookup.loc[row_subset].rename(columns=str.lower)
def convert_lsoa_from_2011_to_2021(df: pd.DataFrame) -> pd.DataFrame:
    """
    Re-key 2011 LSOA-level data onto 2021 LSOAs.

    Each row of `df` is matched on its 2011 LSOA code against the module-level
    `lsoa_2011_to_2021_lookup`. A 2011 LSOA that was split therefore appears once for each of the
    2021 LSOAs it became, while an unchanged LSOA appears once.

    `df` must contain the columns 'lsoa11cd' and 'lsoa11nm'. The result carries 'lsoa21cd' and
    'lsoa21nm' instead and has a fresh default index.
    """
    # Index by the 2011 code so the join lines up with the lookup's index.
    keyed_by_lsoa_2011 = df.set_index('lsoa11cd')
    with_2021_codes = keyed_by_lsoa_2011.join(lsoa_2011_to_2021_lookup)

    # The 2011 name and the change indicator are no longer needed once the 2021 columns are
    # attached; the 2011 code goes away with the index.
    trimmed = with_2021_codes.drop(columns=['lsoa11nm', 'chgind'])
    return trimmed.reset_index(drop=True)
if __name__ == '__main__':
    # Make sure the output directory exists before doing any work
    config.resampled_data_dir.mkdir(parents=True, exist_ok=True)

    # Load the three indicator files, one per source geography
    lsoa_indicators = pd.read_csv(config.indicators_data_dir / 'lsoa_indicators.csv')
    msoa_indicators = pd.read_csv(config.indicators_data_dir / 'msoa_indicators.csv')
    lsoa_2011_indicators = pd.read_csv(config.indicators_data_dir / 'lsoa_2011_indicators.csv')

    # Bring everything onto 2021 LSOAs using the conversions defined above
    all_at_lsoa_2021 = [
        lsoa_indicators,
        upsample_msoa_to_lsoa(msoa_indicators),
        convert_lsoa_from_2011_to_2021(lsoa_2011_indicators),
    ]

    # Align the frames side by side on the shared (code, name) key, then write a single file
    lsoa_key = ['lsoa21cd', 'lsoa21nm']
    data = pd.concat([frame.set_index(lsoa_key) for frame in all_at_lsoa_2021], axis=1)
    data.reset_index().to_csv(config.resampled_data_dir / 'indicators.csv', index=False)