Skip to content

Commit

Permalink
synth data
Browse files Browse the repository at this point in the history
  • Loading branch information
grahamstark committed Aug 21, 2024
1 parent 8bd177c commit d49a17c
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 80 deletions.
10 changes: 10 additions & 0 deletions docs/mostly-ai-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,14 @@ is_hrp - one per household

add `uhid` as a true unique primary key - was hid which can be duplicated over data_years.

## Relationships Messed Up

fixup_synth_data

file synthetic_data/skiplist.tab has list of errors

The `relationship_` records are badly messed up.

## Take 2 - break adults and children into separate files


97 changes: 67 additions & 30 deletions scripts/fixup_synth_data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ using .Definitions
using .Intermediate
using .ModelHousehold
using .FRSHouseholdGetter
using .SingleHouseholdCalculations: do_one_calc
using .STBParameters

using DataFrames,CSV,StatsBase
using OrderedCollections
using Revise
Expand All @@ -35,45 +38,67 @@ checked that each person is allocated to a household via `hid`.
FIXME move to `tests/`
FIXME check the `relationship_x` records
"""
function do_pers_idiot_checks( pers :: AbstractDataFrame )
function do_pers_idiot_checks( pers :: AbstractDataFrame, skiplist :: DataFrame )
hh_pers = groupby( pers, [:hid])
nps = size(hh_pers)[1]
for hid in 1:nps
hp = hh_pers[hid]
hbus = groupby( hp, :default_benefit_unit )
nbusps = 0
first = hp[1,:]
for bu in hbus
nbusps += size( bu )[1]
numheads = sum( bu[:,:is_bu_head])
@assert numheads == 1 "1 head for each bu hh.hid=$(first.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
if not_in_skiplist(hp[1,:],skiplist)
hbus = groupby( hp, :default_benefit_unit )
nbusps = 0
first = hp[1,:]
for bu in hbus
nbusps += size( bu )[1]
numheads = sum( bu[:,:is_bu_head])
@assert numheads == 1 "1 head for each bu hh.hid=$(first.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
end
@assert nbusps == size(hp)[1] "size mismatch for hh.hid=$(first.hid)"
@assert sum( hp[:,:is_hrp]) == 1 "1 head for each hh hh.hid=$(first.hid) was $(sum( hp[:,:is_hrp]) )"
end
@assert nbusps == size(hp)[1] "size mismatch for hh.hid=$(hp.hid)"
@assert sum( hp[:,:is_hrp]) == 1 "1 head for each hh hh.hid=$(hp.hid) was $(sum( hp[:,:is_hrp]) )"
end
end


function create_skips()
function add_skips_from_model!( skips :: DataFrame )
settings = Settings()
settings.dataset_type = synthetic_data
settings.do_legal_aid = false
settings.run_name="run-$(settings.dataset_type)-$(date_string())"
settings.skiplist = "skiplist"

settings.run_name="run-$(settings.dataset_type)-$(date_string())"

sys = [
get_default_system_for_fin_year(2024; scotland=true),
get_default_system_for_fin_year( 2024; scotland=true )]
tot = 0

settings.num_households, settings.num_people, nhh2 =
FRSHouseholdGetter.initialise( settings; reset=reset )


settings.num_households,
settings.num_people,
nhh2 =
FRSHouseholdGetter.initialise( settings; reset=true )
for hno in 1:settings.num_households
println( "on hh $hno num_households=$(settings.num_households)")
mhh = FRSHouseholdGetter.get_household( hno )
try
intermed = make_intermediate(
Float64,
settings,
mhh,
sys[1].lmt.hours_limits,
sys[1].age_limits,
sys[1].child_limits )
for sysno in 1:2
res = do_one_calc( mhh, sys[sysno], settings )
end
catch e
println( "caught exception $(e) hh.hid=$(mhh.hid) hh.data_year=$(mhh.data_year)")
push!( skips, (; hid=mhh.hid, data_year=mhh.data_year, reason="$(e)"))
end
end
end

function delete_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )
kills = DataFrame( hid=zeros(BigInt,0), data_year=zeros(0), reason=fill("",0))
function select_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )::DataFrame
kills = DataFrame( hid=zeros(BigInt,0), data_year=zeros(Int,0), reason=fill("",0))
for h in eachrow( hh )
p = pers[pers.hid .== h.hid,:]
n = size(p)[1]
Expand All @@ -88,26 +113,26 @@ function delete_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )
nbusps += size( bu )[1]
numheads = sum( bu[:,:is_bu_head])
if numheads != 1
println("kill: != 1 head for each bu hh.hid=$(h.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])")
push!( kills, h.hid )
msg = "!= 1 head for each bu hh.hid=$(h.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
push!( kills, (; hid=h.hid, data_year=h.data_year, reason=msg))
end
end
if sum( p[:,:is_hrp]) != 1
println("kill !=1 head for each hh hh.hid=$(p.hid) was $(sum( p[:,:is_hrp]) )")
push!( kills, h.hid )
msg = "!=1 head for each hh hh.hid=$(p.hid) was $(sum( p[:,:is_hrp]) )"
push!( kills, (; hid=h.hid, data_year=h.data_year, reason=msg) )
end
# fixable, but hey..
age_oldest_child = maximum(p[p.from_child_record.==1,:age];init=-99)
println( "age_oldest_child=$age_oldest_child")
if age_oldest_child >= 20
println( "age_oldest_child=$age_oldest_child for $(h.hid)")
push!( kills, h.hid )
msg = "age_oldest_child=$age_oldest_child for $(h.hid)"
push!( kills, (; hid=h.hid, data_year=h.data_year, reason=msg))
end

end
println( "killing $(kills)")
deleteat!(hh, hh.hid .∈ (kills,))
deleteat!(pers, pers.hid .∈ (kills,))
# println( "killing $(kills)")
return kills;
# deleteat!(hh, hh.hid .∈ (kills,))
# deleteat!(pers, pers.hid .∈ (kills,))
end

"""
Expand Down Expand Up @@ -494,7 +519,6 @@ function do_main_fixes!(hh::DataFrame,pers::DataFrame)
end
end
end # hh loop
delete_irredemably_bad_hhs( hh, pers )
end

"""
Expand All @@ -503,10 +527,12 @@ end
function fixall!( hh::DataFrame, pers::DataFrame)
settings = Settings()
settings.dataset_type = synthetic_data
settings.skiplist = "skiplist"
do_initial_fixes!( hh, pers )
do_main_fixes!( hh, pers )
skiplist = select_irredemably_bad_hhs( hh, pers )
# Last minute checks - these are actually just a repeat of the hrp and bu checks in the main loop above.
do_pers_idiot_checks( pers )
do_pers_idiot_checks( pers, skiplist )
# Delete working columns with the mostly.ai string primary keys - we've replaced them
# with BigInts as in the actual data.
select!( hh, Not(:uhidstr) )
Expand All @@ -516,9 +542,20 @@ function fixall!( hh::DataFrame, pers::DataFrame)
ds = main_datasets( settings )
CSV.write( ds.hhlds, hh; delim='\t' )
CSV.write( ds.people, pers; delim='\t' )
CSV.write( ds.skiplist, skiplist; delim='\t')
# 2nd try - just let the model fail
add_skips_from_model!( skiplist )
CSV.write( ds.skiplist, skiplist; delim='\t')
end
#
# Open the unpacked mostly.ai synthetic data files and build the `hh`/`pers`
# frames that `fixall!` expects.
#
#= original version: households and people synthesised as two files
hh = CSV.File("tmp/model_households_scotland-2015-2021/model_households_scotland-2015-2021.csv") |> DataFrame
pers = CSV.File( "tmp/model_people_scotland-2015-2021/model_people_scotland-2015-2021.csv" ) |> DataFrame
=#
# Take 2 ("v3"): adults and children synthesised as separate files (see
# docs/mostly-ai-notes.md); stack them back into a single person-level frame.
hh = CSV.File("tmp/v3/model_households_scotland-2015-2021/model_households_scotland-2015-2021.csv")|>DataFrame
child = CSV.File("tmp/v3/model_children_scotland-2015-2021/model_children_scotland-2015-2021.csv")|>DataFrame
adult = CSV.File("tmp/v3/model_adults_scotland-2015-2021/model_adults_scotland-2015-2021.csv")|>DataFrame
pers = vcat( adult, child )
99 changes: 66 additions & 33 deletions src/FRSHouseholdGetter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module FRSHouseholdGetter
#

using CSV
using DataFrames: DataFrame
using DataFrames: DataFrame, DataFrameRow, AbstractDataFrame
using StatsBase

using ScottishTaxBenefitModel
Expand Down Expand Up @@ -41,6 +41,7 @@ module FRSHouseholdGetter
get_data_years,
get_household,
num_households,
not_in_skiplist,
get_household_of_person,
get_interview_years,
get_regression_dataset,
Expand Down Expand Up @@ -90,7 +91,14 @@ module FRSHouseholdGetter
data :: DataFrame
end


"""
    get_skiplist( settings :: Settings )::DataFrame

Load the skiplist (households to exclude, keyed on `hid`/`data_year`, with a
`reason` string) named in `settings.skiplist`. Returns an empty, correctly
typed frame when no skiplist is configured (`settings.skiplist == ""`).
"""
function get_skiplist( settings :: Settings )::DataFrame
    if settings.skiplist == ""
        # no skiplist configured: empty frame with the expected schema
        return DataFrame( hid=zeros(BigInt,0), data_year=zeros(Int,0), reason=fill("",0))
    end
    # path is resolved alongside the main household/people datasets
    return CSV.File( main_datasets( settings ).skiplist ) |> DataFrame
end

"""
Insert into data a pair of basic deciles in the hh data based on actual pre-model income and eq scale
Expand Down Expand Up @@ -140,6 +148,18 @@ module FRSHouseholdGetter

# fixme I don't see how to make this a constant
REG_DATA :: DataFrame = DataFrame()

"""
This hh not in skiplist
"""
function not_in_skiplist( hr :: DataFrameRow, skiplist :: DataFrame )::Bool
if size(skiplist)[1] == 0
return true
end
sk = skiplist[ (skiplist.data_year .== hr.data_year ) .&
(skiplist.hid .== hr.hid ),:]
return size(sk)[1] == 0 # not in skiplist
end

"""
Initialise the dataset. If this has already been done, do nothing unless
Expand Down Expand Up @@ -169,46 +189,57 @@ module FRSHouseholdGetter
if settings.do_legal_aid
LegalAidData.init( settings; reset = reset )
end
hh_dataset = CSV.File( joinpath(data_dir( settings ),settings.household_name*".tab" )) |> DataFrame
people_dataset = CSV.File( joinpath( data_dir( settings ), settings.people_name*".tab")) |> DataFrame
npeople = size( people_dataset)[1]
skiplist = get_skiplist( settings )
ds = main_datasets( settings )
hh_dataset = CSV.File( ds.hhlds ) |> DataFrame
people_dataset = CSV.File( ds.people ) |> DataFrame
npeople = 0; # size( people_dataset)[1]
nhhlds = size( hh_dataset )[1]
resize!( MODEL_HOUSEHOLDS.hhlds, nhhlds )
resize!( MODEL_HOUSEHOLDS.weight, nhhlds )
MODEL_HOUSEHOLDS.weight .= 0

pseq = 0
for hseq in 1:nhhlds
hh = load_hhld_from_frame( hseq, hh_dataset[hseq,:], people_dataset, FRS, settings )
MODEL_HOUSEHOLDS.hhlds[hseq] = hh
if settings.wealth_method == matching
WealthData.find_wealth_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
end
uprate!( hh, settings )
if( settings.indirect_method == matching ) && (settings.do_indirect_tax_calculations)
ConsumptionData.find_consumption_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
if settings.impute_fields_from_consumption
ConsumptionData.impute_stuff_from_consumption!(hh,settings)
hseq = 0
dseq = 0
for hdata in eachrow( hh_dataset )
if not_in_skiplist( hdata, skiplist )
hseq += 1
hh = load_hhld_from_frame( dseq, hdata, people_dataset, FRS, settings )
npeople += num_people(hh)
MODEL_HOUSEHOLDS.hhlds[hseq] = hh
if settings.wealth_method == matching
WealthData.find_wealth_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
end
uprate!( hh, settings )
if( settings.indirect_method == matching ) && (settings.do_indirect_tax_calculations)
ConsumptionData.find_consumption_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
if settings.impute_fields_from_consumption
ConsumptionData.impute_stuff_from_consumption!(hh,settings)
end
end
end

pseqs = []
for pid in keys(hh.people)
pseq += 1
push!( pseqs, pseq )
MODEL_HOUSEHOLDS.pers_map[OneIndex( pid, hh.data_year )] = OnePos(hseq,pseq)
end
MODEL_HOUSEHOLDS.hh_map[OneIndex( hh.hid, hh.data_year )] = HHPeople( hseq, pseqs)
if ! (hh.data_year in MODEL_HOUSEHOLDS.data_years )
push!( MODEL_HOUSEHOLDS.data_years, hh.data_year )
end
if settings.do_legal_aid
LegalAidData.add_la_probs!( hh )
end
if ! (hh.interview_year in MODEL_HOUSEHOLDS.interview_years )
push!( MODEL_HOUSEHOLDS.interview_years, hh.interview_year )
end
pseqs = []
for pid in keys(hh.people)
pseq += 1
push!( pseqs, pseq )
MODEL_HOUSEHOLDS.pers_map[OneIndex( pid, hh.data_year )] = OnePos(hseq,pseq)
end
MODEL_HOUSEHOLDS.hh_map[OneIndex( hh.hid, hh.data_year )] = HHPeople( hseq, pseqs)
if ! (hh.data_year in MODEL_HOUSEHOLDS.data_years )
push!( MODEL_HOUSEHOLDS.data_years, hh.data_year )
end
if settings.do_legal_aid
LegalAidData.add_la_probs!( hh )
end
if ! (hh.interview_year in MODEL_HOUSEHOLDS.interview_years )
push!( MODEL_HOUSEHOLDS.interview_years, hh.interview_year )
end
end # don't skip
end
resize!( MODEL_HOUSEHOLDS.hhlds, hseq )
resize!( MODEL_HOUSEHOLDS.weight, hseq )
nhhlds = size( MODEL_HOUSEHOLDS.hhlds )[1]
# default weighting using current Scotland settings; otherwise do manually
if settings.auto_weight && settings.target_nation == N_Scotland
@time weight = generate_weights(
Expand All @@ -224,6 +255,8 @@ module FRSHouseholdGetter
MODEL_HOUSEHOLDS.weight[hseq] = MODEL_HOUSEHOLDS.hhlds[hseq].weight
end
end
# in case we have skipped some

MODEL_HOUSEHOLDS.dimensions.=
size(MODEL_HOUSEHOLDS.hhlds)[1],
npeople,
Expand Down
3 changes: 3 additions & 0 deletions src/RunSettings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ module RunSettings
actual_data,
synthetic_data,
data_dir,
get_skiplist,

get_all_uk_settings_2023

Expand Down Expand Up @@ -138,6 +139,7 @@ module RunSettings
export_full_results = false
do_dodgy_takeup_corrections = false
dataset_type = actual_data
skiplist = ""
end

function data_dir( settings :: Settings ) :: String
Expand All @@ -159,6 +161,7 @@ module RunSettings
return (
hhlds = joinpath( dd, settings.household_name*".tab" ),
people = joinpath( dd, settings.people_name*".tab" ),
skiplist = joinpath( dd, settings.skiplist*".tab" )
)
end

Expand Down
8 changes: 0 additions & 8 deletions src/Runner.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,19 +96,11 @@ module Runner
observer[] =Progress( settings.uuid, "starting",0, 0, 0, settings.num_households )
@time @threads for thread in 1:num_threads
for hno in start[thread]:stop[thread]

hh = FRSHouseholdGetter.get_household( hno )
#=
if hno < 20
println( "getting hh $hno hid=$(hh.hid) datayear=$(hh.data_year)")
end
=#
nation = nation_from_region( hh.region )
# println( "nation = $nation; included nations=$(settings.included_nations)")
if nation in settings.included_nations
if hno % 100 == 0
observer[] =Progress( settings.uuid, "run",thread, hno, 100, settings.num_households )
# println( "on household hno $hno hid=$(hh.hid) year=$(hh.interview_year) thread $thread")
end
for sysno in 1:num_systems
res = do_one_calc( hh, params[sysno], settings )
Expand Down
1 change: 1 addition & 0 deletions src/Weighting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ function generate_weights(
nhhlds,
initialise_target_dataframe,
make_target_row! )
# println( data )
nrows = size( data )[1]
ncols = size( data )[2]
## FIXME parameterise this
Expand Down
Loading

0 comments on commit d49a17c

Please sign in to comment.