Skip to content

Commit

Permalink
synth data
Browse files Browse the repository at this point in the history
  • Loading branch information
grahamstark committed Aug 21, 2024
1 parent 8bd177c commit d49a17c
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 80 deletions.
10 changes: 10 additions & 0 deletions docs/mostly-ai-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,14 @@ is_hrp - one per household

add `uhid` as a true unique primary key - was hid which can be duplicated over data_years.

## Relationships Messed Up

fixup_synth_data

file synthetic_data/skiplist.tab has list of errors

The `relationship_` records are badly messed up.

## Take 2 - break adults and children into separate files


97 changes: 67 additions & 30 deletions scripts/fixup_synth_data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ using .Definitions
using .Intermediate
using .ModelHousehold
using .FRSHouseholdGetter
using .SingleHouseholdCalculations: do_one_calc
using .STBParameters

using DataFrames,CSV,StatsBase
using OrderedCollections
using Revise
Expand All @@ -35,45 +38,67 @@ checked that each person is allocated to a household via `hid`.
FIXME move to `tests/`
FIXME check the `relationship_x` records
"""
function do_pers_idiot_checks( pers :: AbstractDataFrame )
function do_pers_idiot_checks( pers :: AbstractDataFrame, skiplist :: DataFrame )
hh_pers = groupby( pers, [:hid])
nps = size(hh_pers)[1]
for hid in 1:nps
hp = hh_pers[hid]
hbus = groupby( hp, :default_benefit_unit )
nbusps = 0
first = hp[1,:]
for bu in hbus
nbusps += size( bu )[1]
numheads = sum( bu[:,:is_bu_head])
@assert numheads == 1 "1 head for each bu hh.hid=$(first.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
if not_in_skiplist(hp[1,:],skiplist)
hbus = groupby( hp, :default_benefit_unit )
nbusps = 0
first = hp[1,:]
for bu in hbus
nbusps += size( bu )[1]
numheads = sum( bu[:,:is_bu_head])
@assert numheads == 1 "1 head for each bu hh.hid=$(first.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
end
@assert nbusps == size(hp)[1] "size mismatch for hh.hid=$(first.hid)"
@assert sum( hp[:,:is_hrp]) == 1 "1 head for each hh hh.hid=$(first.hid) was $(sum( hp[:,:is_hrp]) )"
end
@assert nbusps == size(hp)[1] "size mismatch for hh.hid=$(hp.hid)"
@assert sum( hp[:,:is_hrp]) == 1 "1 head for each hh hh.hid=$(hp.hid) was $(sum( hp[:,:is_hrp]) )"
end
end


function create_skips()
function add_skips_from_model!( skips :: DataFrame )
settings = Settings()
settings.dataset_type = synthetic_data
settings.do_legal_aid = false
settings.run_name="run-$(settings.dataset_type)-$(date_string())"
settings.skiplist = "skiplist"

settings.run_name="run-$(settings.dataset_type)-$(date_string())"

sys = [
get_default_system_for_fin_year(2024; scotland=true),
get_default_system_for_fin_year( 2024; scotland=true )]
tot = 0

settings.num_households, settings.num_people, nhh2 =
FRSHouseholdGetter.initialise( settings; reset=reset )


settings.num_households,
settings.num_people,
nhh2 =
FRSHouseholdGetter.initialise( settings; reset=true )
for hno in 1:settings.num_households
println( "on hh $hno num_households=$(settings.num_households)")
mhh = FRSHouseholdGetter.get_household( hno )
try
intermed = make_intermediate(
Float64,
settings,
mhh,
sys[1].lmt.hours_limits,
sys[1].age_limits,
sys[1].child_limits )
for sysno in 1:2
res = do_one_calc( mhh, sys[sysno], settings )
end
catch e
println( "caught exception $(e) hh.hid=$(mhh.hid) hh.data_year=$(mhh.data_year)")
push!( skips, (; hid=mhh.hid, data_year=mhh.data_year, reason="$(e)"))
end
end
end

function delete_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )
kills = DataFrame( hid=zeros(BigInt,0), data_year=zeros(0), reason=fill("",0))
function select_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )::DataFrame
kills = DataFrame( hid=zeros(BigInt,0), data_year=zeros(Int,0), reason=fill("",0))
for h in eachrow( hh )
p = pers[pers.hid .== h.hid,:]
n = size(p)[1]
Expand All @@ -88,26 +113,26 @@ function delete_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )
nbusps += size( bu )[1]
numheads = sum( bu[:,:is_bu_head])
if numheads != 1
println("kill: != 1 head for each bu hh.hid=$(h.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])")
push!( kills, h.hid )
msg = "!= 1 head for each bu hh.hid=$(h.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
push!( kills, (; hid=h.hid, data_year=h.data_year, reason=msg))
end
end
if sum( p[:,:is_hrp]) != 1
println("kill !=1 head for each hh hh.hid=$(p.hid) was $(sum( p[:,:is_hrp]) )")
push!( kills, h.hid )
msg = "!=1 head for each hh hh.hid=$(p.hid) was $(sum( p[:,:is_hrp]) )"
push!( kills, (; hid=h.hid, data_year=h.data_year, reason=msg) )
end
# fixable, but hey..
age_oldest_child = maximum(p[p.from_child_record.==1,:age];init=-99)
println( "age_oldest_child=$age_oldest_child")
if age_oldest_child >= 20
println( "age_oldest_child=$age_oldest_child for $(h.hid)")
push!( kills, h.hid )
msg = "age_oldest_child=$age_oldest_child for $(h.hid)"
push!( kills, (; hid=h.hid, data_year=h.data_year, reason=msg))
end

end
println( "killing $(kills)")
deleteat!(hh, hh.hid .∈ (kills,))
deleteat!(pers, pers.hid .∈ (kills,))
# println( "killing $(kills)")
return kills;
# deleteat!(hh, hh.hid .∈ (kills,))
# deleteat!(pers, pers.hid .∈ (kills,))
end

"""
Expand Down Expand Up @@ -494,7 +519,6 @@ function do_main_fixes!(hh::DataFrame,pers::DataFrame)
end
end
end # hh loop
delete_irredemably_bad_hhs( hh, pers )
end

"""
Expand All @@ -503,10 +527,12 @@ end
function fixall!( hh::DataFrame, pers::DataFrame)
settings = Settings()
settings.dataset_type = synthetic_data
settings.skiplist = "skiplist"
do_initial_fixes!( hh, pers )
do_main_fixes!( hh, pers )
skiplist = select_irredemably_bad_hhs( hh, pers )
# Last minute checks - these are actually just a repeat of the hrp and bu checks in the main loop above.
do_pers_idiot_checks( pers )
do_pers_idiot_checks( pers, skiplist )
# Delete working columns with the mostly.ai string primary keys - we've replaced them
# with BigInts as in the actual data.
select!( hh, Not(:uhidstr) )
Expand All @@ -516,9 +542,20 @@ function fixall!( hh::DataFrame, pers::DataFrame)
ds = main_datasets( settings )
CSV.write( ds.hhlds, hh; delim='\t' )
CSV.write( ds.people, pers; delim='\t' )
CSV.write( ds.skiplist, skiplist; delim='\t')
# 2nd try - just let the model fail
add_skips_from_model!( skiplist )
CSV.write( ds.skiplist, skiplist; delim='\t')
end
#
# Open the unpacked mostly.ai synthetic data files and build the `hh`/`pers`
# frames that `fixall!` expects.
#
#= original version: households and people synthesised as two files
hh = CSV.File("tmp/model_households_scotland-2015-2021/model_households_scotland-2015-2021.csv") |> DataFrame
pers = CSV.File( "tmp/model_people_scotland-2015-2021/model_people_scotland-2015-2021.csv" ) |> DataFrame
=#
# Take 2 ("v3"): adults and children synthesised as separate files (see
# docs/mostly-ai-notes.md); stack them back into a single person-level frame.
hh = CSV.File("tmp/v3/model_households_scotland-2015-2021/model_households_scotland-2015-2021.csv")|>DataFrame
child = CSV.File("tmp/v3/model_children_scotland-2015-2021/model_children_scotland-2015-2021.csv")|>DataFrame
adult = CSV.File("tmp/v3/model_adults_scotland-2015-2021/model_adults_scotland-2015-2021.csv")|>DataFrame
pers = vcat( adult, child )
99 changes: 66 additions & 33 deletions src/FRSHouseholdGetter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module FRSHouseholdGetter
#

using CSV
using DataFrames: DataFrame
using DataFrames: DataFrame, DataFrameRow, AbstractDataFrame
using StatsBase

using ScottishTaxBenefitModel
Expand Down Expand Up @@ -41,6 +41,7 @@ module FRSHouseholdGetter
get_data_years,
get_household,
num_households,
not_in_skiplist,
get_household_of_person,
get_interview_years,
get_regression_dataset,
Expand Down Expand Up @@ -90,7 +91,14 @@ module FRSHouseholdGetter
data :: DataFrame
end


"""
    get_skiplist( settings :: Settings )::DataFrame

Load the skiplist (households to exclude, keyed on `hid`/`data_year`, with a
`reason` string) named in `settings.skiplist`. Returns an empty, correctly
typed frame when no skiplist is configured (`settings.skiplist == ""`).
"""
function get_skiplist( settings :: Settings )::DataFrame
    if settings.skiplist == ""
        # no skiplist configured: empty frame with the expected schema
        return DataFrame( hid=zeros(BigInt,0), data_year=zeros(Int,0), reason=fill("",0))
    end
    # path is resolved alongside the main household/people datasets
    return CSV.File( main_datasets( settings ).skiplist ) |> DataFrame
end

"""
Insert into data a pair of basic deciles in the hh data based on actual pre-model income and eq scale
Expand Down Expand Up @@ -140,6 +148,18 @@ module FRSHouseholdGetter

# fixme I don't see how to make this a constant
REG_DATA :: DataFrame = DataFrame()

"""
This hh not in skiplist
"""
function not_in_skiplist( hr :: DataFrameRow, skiplist :: DataFrame )::Bool
if size(skiplist)[1] == 0
return true
end
sk = skiplist[ (skiplist.data_year .== hr.data_year ) .&
(skiplist.hid .== hr.hid ),:]
return size(sk)[1] == 0 # not in skiplist
end

"""
Initialise the dataset. If this has already been done, do nothing unless
Expand Down Expand Up @@ -169,46 +189,57 @@ module FRSHouseholdGetter
if settings.do_legal_aid
LegalAidData.init( settings; reset = reset )
end
hh_dataset = CSV.File( joinpath(data_dir( settings ),settings.household_name*".tab" )) |> DataFrame
people_dataset = CSV.File( joinpath( data_dir( settings ), settings.people_name*".tab")) |> DataFrame
npeople = size( people_dataset)[1]
skiplist = get_skiplist( settings )
ds = main_datasets( settings )
hh_dataset = CSV.File( ds.hhlds ) |> DataFrame
people_dataset = CSV.File( ds.people ) |> DataFrame
npeople = 0; # size( people_dataset)[1]
nhhlds = size( hh_dataset )[1]
resize!( MODEL_HOUSEHOLDS.hhlds, nhhlds )
resize!( MODEL_HOUSEHOLDS.weight, nhhlds )
MODEL_HOUSEHOLDS.weight .= 0

pseq = 0
for hseq in 1:nhhlds
hh = load_hhld_from_frame( hseq, hh_dataset[hseq,:], people_dataset, FRS, settings )
MODEL_HOUSEHOLDS.hhlds[hseq] = hh
if settings.wealth_method == matching
WealthData.find_wealth_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
end
uprate!( hh, settings )
if( settings.indirect_method == matching ) && (settings.do_indirect_tax_calculations)
ConsumptionData.find_consumption_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
if settings.impute_fields_from_consumption
ConsumptionData.impute_stuff_from_consumption!(hh,settings)
hseq = 0
dseq = 0
for hdata in eachrow( hh_dataset )
if not_in_skiplist( hdata, skiplist )
hseq += 1
hh = load_hhld_from_frame( dseq, hdata, people_dataset, FRS, settings )
npeople += num_people(hh)
MODEL_HOUSEHOLDS.hhlds[hseq] = hh
if settings.wealth_method == matching
WealthData.find_wealth_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
end
uprate!( hh, settings )
if( settings.indirect_method == matching ) && (settings.do_indirect_tax_calculations)
ConsumptionData.find_consumption_for_hh!( hh, settings, 1 ) # fixme allow 1 to vary somehow Lee Chung..
if settings.impute_fields_from_consumption
ConsumptionData.impute_stuff_from_consumption!(hh,settings)
end
end
end

pseqs = []
for pid in keys(hh.people)
pseq += 1
push!( pseqs, pseq )
MODEL_HOUSEHOLDS.pers_map[OneIndex( pid, hh.data_year )] = OnePos(hseq,pseq)
end
MODEL_HOUSEHOLDS.hh_map[OneIndex( hh.hid, hh.data_year )] = HHPeople( hseq, pseqs)
if ! (hh.data_year in MODEL_HOUSEHOLDS.data_years )
push!( MODEL_HOUSEHOLDS.data_years, hh.data_year )
end
if settings.do_legal_aid
LegalAidData.add_la_probs!( hh )
end
if ! (hh.interview_year in MODEL_HOUSEHOLDS.interview_years )
push!( MODEL_HOUSEHOLDS.interview_years, hh.interview_year )
end
pseqs = []
for pid in keys(hh.people)
pseq += 1
push!( pseqs, pseq )
MODEL_HOUSEHOLDS.pers_map[OneIndex( pid, hh.data_year )] = OnePos(hseq,pseq)
end
MODEL_HOUSEHOLDS.hh_map[OneIndex( hh.hid, hh.data_year )] = HHPeople( hseq, pseqs)
if ! (hh.data_year in MODEL_HOUSEHOLDS.data_years )
push!( MODEL_HOUSEHOLDS.data_years, hh.data_year )
end
if settings.do_legal_aid
LegalAidData.add_la_probs!( hh )
end
if ! (hh.interview_year in MODEL_HOUSEHOLDS.interview_years )
push!( MODEL_HOUSEHOLDS.interview_years, hh.interview_year )
end
end # don't skip
end
resize!( MODEL_HOUSEHOLDS.hhlds, hseq )
resize!( MODEL_HOUSEHOLDS.weight, hseq )
nhhlds = size( MODEL_HOUSEHOLDS.hhlds )[1]
# default weighting using current Scotland settings; otherwise do manually
if settings.auto_weight && settings.target_nation == N_Scotland
@time weight = generate_weights(
Expand All @@ -224,6 +255,8 @@ module FRSHouseholdGetter
MODEL_HOUSEHOLDS.weight[hseq] = MODEL_HOUSEHOLDS.hhlds[hseq].weight
end
end
# in case we have skipped some

MODEL_HOUSEHOLDS.dimensions.=
size(MODEL_HOUSEHOLDS.hhlds)[1],
npeople,
Expand Down
3 changes: 3 additions & 0 deletions src/RunSettings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ module RunSettings
actual_data,
synthetic_data,
data_dir,
get_skiplist,

get_all_uk_settings_2023

Expand Down Expand Up @@ -138,6 +139,7 @@ module RunSettings
export_full_results = false
do_dodgy_takeup_corrections = false
dataset_type = actual_data
skiplist = ""
end

function data_dir( settings :: Settings ) :: String
Expand All @@ -159,6 +161,7 @@ module RunSettings
return (
hhlds = joinpath( dd, settings.household_name*".tab" ),
people = joinpath( dd, settings.people_name*".tab" ),
skiplist = joinpath( dd, settings.skiplist*".tab" )
)
end

Expand Down
8 changes: 0 additions & 8 deletions src/Runner.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,19 +96,11 @@ module Runner
observer[] =Progress( settings.uuid, "starting",0, 0, 0, settings.num_households )
@time @threads for thread in 1:num_threads
for hno in start[thread]:stop[thread]

hh = FRSHouseholdGetter.get_household( hno )
#=
if hno < 20
println( "getting hh $hno hid=$(hh.hid) datayear=$(hh.data_year)")
end
=#
nation = nation_from_region( hh.region )
# println( "nation = $nation; included nations=$(settings.included_nations)")
if nation in settings.included_nations
if hno % 100 == 0
observer[] =Progress( settings.uuid, "run",thread, hno, 100, settings.num_households )
# println( "on household hno $hno hid=$(hh.hid) year=$(hh.interview_year) thread $thread")
end
for sysno in 1:num_systems
res = do_one_calc( hh, params[sysno], settings )
Expand Down
1 change: 1 addition & 0 deletions src/Weighting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ function generate_weights(
nhhlds,
initialise_target_dataframe,
make_target_row! )
# println( data )
nrows = size( data )[1]
ncols = size( data )[2]
## FIXME parameterise this
Expand Down
Loading

0 comments on commit d49a17c

Please sign in to comment.