Skip to content

Commit f17b5f7

Browse files
committed
EQ OECD scale now matches HBAI.
1 parent 9487729 commit f17b5f7

File tree

4 files changed

+58
-75
lines changed

4 files changed

+58
-75
lines changed

scripts/hbai-scotben-compares.jl

Lines changed: 48 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,9 @@ function make_pov( df :: DataFrame, incf::Symbol, growth=0.02 )::Tuple
3535
povstats, povline
3636
end
3737

38-
# Raw FRS
39-
hhold = CSV.File( "/mnt/data/frs/2022/tab/househol.tab"; missingstring=[" ", ""])|>DataFrame
40-
rename!( hhold, lowercase.(names(hhold)))
41-
hhold_scot = @view hhold[hhold.gvtregn .== 299999999,:]
42-
4338
# one run of scotben 24 sys
44-
sys = STBParameters.get_default_system_for_fin_year( 2025 )
45-
settings = Settings()
4639
tot = 0
47-
obs = Observable( Progress(settings.uuid,"",0,0,0,0))
40+
obs = Observable( Progress(Base.UUID("c2ae9c83-d24a-431c-b04f-74662d2ba07e"),"",0,0,0,0))
4841
Observable(Progress(Base.UUID("c2ae9c83-d24a-431c-b04f-74662d2ba07e"), "", 0, 0, 0, 0))
4942
of = on(obs) do p
5043
global tot
@@ -54,10 +47,13 @@ of = on(obs) do p
5447
end
5548

5649
function onerun( ;
50+
settings::Settings,
5751
weighting_relative_to_ons_weights :: Bool,
5852
to_y::Int,
5953
to_q :: Int )
60-
settings.included_data_years = [2019,2021,2022, 2023] # same as 3 year HBAI
54+
global tot
55+
tot = 0
56+
sys = STBParameters.get_default_system_for_fin_year( 2025 )
6157
settings.requested_threads = 4
6258
settings.to_y=to_y #match hbai, kinda sorta
6359
settings.to_q=to_q
@@ -76,21 +72,36 @@ function onerun( ;
7672
return res, results_hhs, results_indiv
7773
end
7874

79-
function load_model_data()
75+
function load_model_data(settings::Settings)::Tuple
8076
# overwrite raw data with uprated/matched versions
8177
dataset_artifact = get_data_artifact( settings )
8278
model_hhs = HouseholdFromFrame.read_hh(
8379
joinpath( dataset_artifact, "households.tab")) # CSV.File( ds.hhlds ) |> DataFrame
8480
model_people = HouseholdFromFrame.read_pers(
8581
joinpath( dataset_artifact, "people.tab"))
82+
@show settings.included_data_years
8683
model_hhs = model_hhs[ model_hhs.data_year .∈ ( settings.included_data_years, ) , :]
8784
model_people = model_people[ model_people.data_year .∈ ( settings.included_data_years, ) , :]
85+
settings.num_households = size( model_hhs )[1]
86+
settings.num_people = size( model_people )[1]
87+
@show settings
8888
DataSummariser.overwrite_raw!( model_hhs, model_people, settings.num_households )
89-
jhhs = leftjoin(results_hhs, model_hhs, on=[:hid,:data_year], makeunique=true )
90-
return jhhs, model_people, model_hhs
89+
# jhhs = leftjoin(results_hhs, model_hhs, on=[:hid,:data_year], makeunique=true )
90+
return model_people, model_hhs
91+
end
92+
93+
function make_compare(results_hhs::DataFrame , hbai_s::DataFrame)
94+
sbsub = results_hhs[results_hhs.data_year.==2021,[:hid,:data_year,:grossing_factor,:bhc_net_income,:eq_scale_bhc ]]
95+
hbsub = hbai_s[hbai_s.data_year.==2021,[:sernum,:data_year, :grossing_factor,:bhc_net_income,:before_hc_eqscale,:ahcpubdef,:ahcyrdef]]
96+
hbsub.grossing_factor ./= 3
97+
compset = innerjoin( sbsub, hbsub, on=[:hid=>:sernum, :data_year], makeunique=true)
98+
compset.eqdif = .! (compset.eq_scale_bhc .≈ compset.before_hc_eqscale )
99+
return compset
91100
end
92101

93-
function get_hbai()
102+
103+
104+
function get_hbai(settings::Settings)
94105
hbai = CSV.File( "/mnt/data/hbai/2024-ed/UKDA-5828-tab/main/20224.csv"; delim=',', missingstring=["","-9","A"]) |> DataFrame
95106
rename!(lowercase, hbai)
96107
hbai = hbai[( .! ismissing.( hbai.s_oe_bhc .+ hbai.s_oe_ahc .+ hbai.eahchh)), :]
@@ -120,10 +131,14 @@ function get_hbai()
120131
hbai, hbai_s, hb23_s, hb23_heads
121132
end
122133

134+
settings = Settings()
135+
settings.included_data_years = [2019,2021,2022, 2023] # same as 3 year HBAI
136+
hbai, hbai_s, hb23_s, hb23_heads = get_hbai(settings)
137+
settings.num_households, settings.num_people, nhhs2 =
138+
FRSHouseholdGetter.initialise( settings; reset=true )
139+
model_people, model_hhs = load_model_data(settings)
123140

124-
hbai, hbai_s, hb23_s, hb23_heads = get_hbai()
125-
jhhs, model_people, model_hhs = load_model_data()
126-
n=16*4
141+
n=64
127142
df = DataFrame(
128143
uprated = fill("",n),
129144
gross_type_relative_to = fill("",n),
@@ -132,7 +147,8 @@ df = DataFrame(
132147
inc_measure = fill("",n),
133148
scotben_hh = zeros(n), # [sb_h_mean_grossed, sb_h_mean_ungrossed,sb_h_median_grossed, sb_h_median_ungrossed ],
134149
scotben_indiv = zeros(n), #[sb_i_mean_grossed, sb_i_mean_ungrossed,sb_i_median_grossed, sb_i_median_ungrossed ],
135-
hbai = zeros(n)) #[hbai_mean_grossed, hbai_mean_ungrossed, hbai_median_grossed, hbai_median_ungrossed])
150+
hbai_21_23 = zeros(n),
151+
hbai_23 = zeros(n)) #[hbai_mean_grossed, hbai_mean_ungrossed, hbai_median_grossed, hbai_median_ungrossed])
136152

137153
r = 0
138154

@@ -144,6 +160,7 @@ for uprate in ["current", "y2024"]
144160
end
145161
for weighting_relative_to_ons_weights in [false,true]
146162
results, results_hhs, results_indiv = onerun(
163+
settings = settings,
147164
weighting_relative_to_ons_weights = weighting_relative_to_ons_weights,
148165
to_y = to_y,
149166
to_q = to_q)
@@ -162,66 +179,32 @@ for uprate in ["current", "y2024"]
162179
row.grossed = grossed ? "Grossed" : "Ungrossed"
163180
row.inc_measure = pretty(string(inc))
164181
row.stat = string(f)
165-
hhs_weights, indiv_weights, hbai_weights = if grossed
182+
hhs_weights, indiv_weights, hbai_weights, hb23_weights = if grossed
166183
results_hhs.grossing_factor,
167184
results_indiv.grossing_factor,
168-
hbai_s.grossing_factor
185+
hbai_s.grossing_factor,
186+
hb23_s.grossing_factor
169187
else
170188
Weights( results_hhs.num_people ),
171189
Weights( ones( size( results_indiv)[1])),
172-
Weights( ones( size( hbai_s)[1]))
190+
Weights( ones( size( hbai_s)[1])),
191+
Weights( ones( size( hb23_s)[1]))
173192
end
174193
row.scotben_hh = f(results_hhs[!,inc], hhs_weights )
175194
row.scotben_indiv = f( results_indiv[!,inc], indiv_weights )
176-
row.hbai = f( hbai_s[!,inc], hbai_weights )
195+
row.hbai_21_23 = f( hbai_s[!,inc], hbai_weights )
196+
row.hbai_23 = f( hb23_s[!,inc], hb23_weights )
177197
end # func
178198
end # gross
179199
end # incs
180200
end
181201
end # uprating
182202

203+
CSV.write( "hbai-scotben-compares.tab", df; delim='\t')
183204

184-
sbmedian_frs_weights = median( jhhs.bhc_net_income, Weights( jhhs.weight_1 ./ 3) )
185-
# select summary hbai
186-
hbai_s[!,[:sernum,:grossing_factor,:ahc_net_income,:before_hc_eqscale,:data_year,:ahcpubdef,:ahcyrdef]]
187-
188-
summarystats( results_hhs.bhc_net_income )
189-
summarystats( hbai_s.bhc_net_income )
190-
191-
#1. is it my weights?
192-
# Problem: my mean income is >100 higher than SPI mean income.
193-
#
194-
# join hbai and my hh data
195-
# read CSV version??
196-
# uprate mine to HBAI target
197-
# use HBAI weights/my weights
198-
#
199-
200-
201-
median(hbai.eq_ahc_net_income,Weights(hbai.grossing_factor))
202-
median(hb23.eq_ahc_net_income,Weights(hb23.grossing_factor))
203-
# should match ... these:
204-
unique(hbai.mdoeahc)
205-
# should match ... these:
206-
unique(hbai.mdoebhc)
207-
208-
# test of weighting relative to exis
209-
210-
household_total,
211-
targets, # no institutional,
212-
initialise_target_dataframe,
213-
make_target_row! = Weighting.get_targets( settings )
214-
popsum = sum( jhhs.weight )
215-
wscale = household_total/popsum
216-
initial_weights = jhhs.weight .* wscale
217-
218-
@time weightsp, data = generate_weights(
219-
settings.num_households;
220-
weight_type = settings.weight_type,
221-
lower_multiple = settings.lower_multiple,
222-
upper_multiple = settings.upper_multiple,
223-
household_total = household_total,
224-
targets = targets, # no institutional,
225-
initialise_target_dataframe = initialise_target_dataframe,
226-
make_target_row! = make_target_row!,
227-
initial_weights=initial_weights )
205+
results, results_hhs, results_indiv = onerun(
206+
settings = settings,
207+
weighting_relative_to_ons_weights = true,
208+
to_y = 2024,
209+
to_q = 2)
210+
compdata = make_compare( results_hhs, hbai_s )

src/EquivalenceScales.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ module EquivalenceScales
6565
eq += 1
6666
println( "only 1 person; non head rel=$rel $(p.hid)")
6767
else
68-
if get_age(p) <= 14
68+
if get_age(p) < 14
6969
add = if scale == oxford
7070
0.5
7171
elseif scale == oecd

src/ModelHousehold.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -273,11 +273,11 @@ function eq_rel_to_hoh( p :: Person ) :: EQ_P_Type
273273
return eq_head
274274
elseif p.is_standard_child
275275
return eq_dependent_child
276-
elseif p.relationship_to_hoh in [Spouse,Cohabitee]
276+
elseif (p.relationship_to_hoh in [Spouse,Cohabitee]) || (p.default_benefit_unit == 1)
277277
return eq_spouse_of_head
278278
# hack for 2nd bu adults always being heads
279-
# elseif (p.default_benefit_unit > 1)
280-
# return eq_head
279+
# elseif (p.default_benefit_unit > 1) && ( ! p.is_standard_child )
280+
# return eq_head
281281
else
282282
return eq_other_adult
283283
end

src/STBOutput.jl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -874,20 +874,20 @@ function incomes_to_hist(
874874
maxr=1500.0,
875875
bandwidth=10 )::NamedTuple
876876
incs = deepcopy(hh[:,income_measure])
877+
# constrain the graph as in HBAI
878+
incs = max.( incs, minr)
879+
incs = min.( incs, maxr)
877880
maxinc = maximum(incs)
878881
mininc = minimum(incs)
879882
medinc = median( incs, Weights(hh.weighted_people))
880883
meaninc = mean( incs, Weights(hh.weighted_people))
881884
@show medinc meaninc
882-
# constrain the graph as in HBAI
883-
incs = max.( incs, minr)
884-
incs = min.( incs, maxr)
885885
ranges = collect( minr:bandwidth:maxr )
886886
push!( ranges,Inf)
887887
hist = fit( Histogram, incs, Weights( hh.weighted_people ), ranges, closed=:left )
888888
# check I've understood fit(Hist correctly ..
889-
# @assert hist.weights[1] ≈ sum( hh.weighted_people[ incs .<= minr ]) "$(hist.weights[1]) ≈ $(sum( hh.weighted_people[ incs .<= minr ])) $hist"
890-
# @assert hist.weights[end] ≈ sum( hh.weighted_people[ incs .>= maxr ]) "$(hist.weights[end]) ≈ $(sum( hh.weighted_people[ incs .>= maxr ])) $hist"
889+
@assert hist.weights[1] ≈ sum( hh.weighted_people[ incs .< hist.edges[1][2] ]) "$(hist.weights[1]) ≈ $(sum( hh.weighted_people[ incs .<= minr ])) $hist"
890+
@assert hist.weights[end] ≈ sum( hh.weighted_people[ incs .>= maxr ]) "$(hist.weights[end]) ≈ $(sum( hh.weighted_people[ incs .>= maxr ])) $hist"
891891
return ( max=maxinc, min=mininc, median=medinc, mean=meaninc, hist=hist )
892892
end
893893

@@ -897,7 +897,7 @@ Dump out histogram, means, etc. as 2-col delimited data.
897897
"""
898898
function write_hist( filename::String, incs::NamedTuple; delim='\t')
899899
d = DataFrame(
900-
edges_lower_limit=incs.hist.edges[1][1:end-1],
900+
edges_upper_limit=incs.hist.edges[1][2:end],
901901
population=incs.hist.weights )
902902
# add stats at bottom
903903
push!(d, ["mean", incs.mean]; promote=true ) # since col1 is float only otherwise

0 commit comments

Comments
 (0)