Skip to content

Commit 534cd5d

Browse files
committed
pics
1 parent 43555e7 commit 534cd5d

File tree

2 files changed

+182
-82
lines changed

2 files changed

+182
-82
lines changed

scripts/fixup_synth_data.jl

Lines changed: 137 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,28 @@ function do_pers_idiot_checks( pers :: AbstractDataFrame )
4343
for bu in hbus
4444
nbusps += size( bu )[1]
4545
numheads = sum( bu[:,:is_bu_head])
46-
@assert numheads == 1 "1 head for each bu hh.hid=$(first.hid)"
46+
@assert numheads == 1 "1 head for each bu hh.hid=$(first.hid) numheads=$numheads bu = $(bu[1,:default_benefit_unit])"
4747
end
4848
@assert nbusps == size(hp)[1] "size mismatch for hh.hid=$(hp.hid)"
49-
@assert sum( hp[:,:is_hrp]) == 1 "1 head for each hh hh.hid=$(hp.hid)"
49+
@assert sum( hp[:,:is_hrp]) == 1 "1 head for each hh hh.hid=$(hp.hid) was $(sum( hp[:,:is_hrp]) )"
5050
end
5151
end
5252

53+
"""
    delete_irredemably_bad_hhs( hh, pers )

Delete households that consist only of children from `hh` and `pers`, in place.

A household is removed when the maximum `age` among its `pers` rows is below 16
and the sum of their `from_child_record` values equals the number of those rows.
Each matching `hid` is printed as it is found and the full list is printed
before the rows are deleted. Returns `pers`.
"""
function delete_irredemably_bad_hhs( hh :: DataFrame, pers :: DataFrame )
    kills = []
    for h in eachrow( hh )
        members = pers[pers.hid .== h.hid, :]
        nmembers = size( members, 1 )
        # all children - nothing sensible can be rebuilt from these
        all_children = (maximum( members[!,:age] ) < 16) &&
            (sum( members[!,:from_child_record] ) == nmembers)
        all_children || continue
        println( "want to kill $(h.hid)")
        push!( kills, h.hid )
    end
    println( "killing $(kills)")
    # wrapping `kills` in a 1-tuple stops it being broadcast over
    deleteat!( hh, hh.hid .∈ (kills,) )
    return deleteat!( pers, pers.hid .∈ (kills,) )
end
5368

5469
"""
5570
reassign over 21s not in education to non child
@@ -86,12 +101,30 @@ function fixup_child_status!( pers :: DataFrameRow )::Int
86101
return oc == pers.from_child_record ? 0 : 1 # count of changes made
87102
end
88103

104+
105+
"""
    zeropos( p )

Return the first `i` in 1..15 for which `p`'s `relationship_<i>` value is zero.

Returns 9999 if a `missing` value is reached before any zero, or if none of the
15 fields is zero.
"""
function zeropos( p :: DataFrameRow )::Int
    notfound = 9999
    for i in 1:15
        v = p[Symbol( "relationship_$(i)")]
        ismissing( v ) && return notfound
        v == 0 && return i
    end
    return notfound
end
116+
117+
89118
"""
90119
91120
"""
92121
function fixup_relationships!( hp :: AbstractDataFrame )::Int
122+
# sort the hh people in order of the zero (itself) relationship field
123+
# (presumably so that row j is the person that relationship_j refers to - TODO confirm)
124+
hp.zpos .= zeropos.( eachrow(hp) )
125+
sort!( hp, :zpos )
93126
num_people = size(hp)[1] #
94-
println( "num people $num_people")
127+
# println( "num people $num_people")
95128
@assert size( hp[hp.is_hrp.==1,:])[1] == 1 # exactly 1 hrp
96129
nfixes = 0
97130
for p in eachrow(hp) # for each person .. (only need 1st 1/2?)
@@ -107,67 +140,29 @@ function fixup_relationships!( hp :: AbstractDataFrame )::Int
107140
# change the other person's relationship to match this one, if needed.
108141
if j != p.pno
109142
k = Symbol( "relationship_$(j)")
110-
relationship = Relationship(p[k]) # relationship of this person to person j
143+
relationship = if ismissing( p[k] ) # relationship of this person to person j
144+
Missing_Relationship
145+
else
146+
Relationship(p[k])
147+
end
148+
if relationship == This_Person # can't be this person if pno != j
149+
relationship = Missing_Relationship
150+
end
111151
oper = hp[j,:] # look up the other person
112152
recip_relationship = Relationship(oper[ok])
113-
println("hh $(p.hid): checking $(p.pno)=>$(oper.pno) relationships $(relationship)=>$(recip_relationship)")
114-
if is_partner( relationship )
115-
if ! is_partner( recip_relationship )
116-
nfixes += 1
117-
oper[ok] = Int( relationship )
118-
end
119-
elseif is_dependent_child( relationship )
120-
if ! is_parent( recip_relationship )
121-
nfixes += 1
122-
r = if relationship == Son_or_daughter_incl_adopted
123-
Parent
124-
elseif relationship == Foster_child
125-
Foster_parent
126-
elseif relationship == Step_son_or_daughter
127-
Step_parent
128-
end
129-
oper[ok] = Int( r )
130-
end
131-
elseif is_parent( relationship )
132-
if ! is_dependent_child( recip_relationship )
133-
nfixes += 1
134-
r = if relationship == Parent
135-
Son_or_daughter_incl_adopted
136-
elseif relationship == Foster_parent
137-
Foster_child
138-
elseif relationship == Step_parent
139-
Step_son_or_daughter
140-
end
141-
oper[ok] = Int( r )
142-
end
143-
elseif is_sibling( relationship )
144-
if ! is_sibling( recip_relationship )
145-
nfixes += 1
146-
oper[ok] = Int( relationship )
147-
end
148-
elseif is_other_relative( relationship )
149-
if ! is_other_relative( recip_relationship )
150-
nfixes += 1
151-
r = if relationship == Parent_in_law
152-
Son_in_law_or_daughter_in_law
153-
elseif relationship == Son_in_law_or_daughter_in_law
154-
Parent_in_law
155-
elseif relationship == Grand_child
156-
Grand_parent
157-
elseif relationship == Grand_parent
158-
Grand_child
159-
elseif relationship == Other_relative
160-
Other_relative
161-
end
162-
oper[ok] = Int( r )
163-
end
164-
elseif is_non_relative( relationship )
165-
if ! is_non_relative( recip_relationship )
166-
nfixes += 1
167-
oper[ok] = Int( Other_non_relative )
168-
end
169-
end # check end
170-
println("final relationships: $(relationship)=>$(Relationship(oper[ok]))")
153+
if (relationship == Missing_Relationship) # lookup other way around if missing
154+
relationship = reciprocal_relationship( recip_relationship )
155+
end
156+
shouldbe_rel = reciprocal_relationship( relationship )
157+
if recip_relationship != shouldbe_rel
158+
# println("hh $(p.hid): changing for $(p.pno)=>$(oper.pno) relationships $(relationship)=>$(recip_relationship)")
159+
nfixes += 1
160+
oper[ok] = Int(shouldbe_rel)
161+
# println("final relationships: $(relationship)=>$(Relationship(oper[ok]))")
162+
end
163+
if relationship == This_Person # can't be this person if pno != j
164+
oper[ok] = Int(Other_non_relative)
165+
end
171166
end # other people
172167
end # each relationship of this person
173168
# clear out the rest
@@ -179,6 +174,8 @@ function fixup_relationships!( hp :: AbstractDataFrame )::Int
179174
end
180175
end # clearout unneeded relationships
181176
end # each person
177+
# clear out zero sort marker
178+
# select!( pers, Not( :zpos ))
182179
return nfixes
183180
end # function
184181

@@ -201,6 +198,7 @@ function assign_hrp!( hp :: AbstractDataFrame; target::Symbol )
201198
else # .. or oldest if no income
202199
hrpp = findmax(hp.age)[2]
203200
end
201+
println( "setting $hrpp $target to 1")
204202
hp[hrpp,target] = 1;
205203
end
206204

@@ -211,28 +209,60 @@ if bus numbers are 1,3,9 replace with 1,2,3
211209
function fixup_bus!( hp :: AbstractDataFrame; target :: Symbol )
    # Distinct values currently in the `target` column, ascending. A value's
    # position in this list is its replacement, so the column ends up
    # numbered 1..n with no gaps.
    buos = sort!( unique( hp[:,target] ))
    for p in eachrow(hp)
        # every value is present in `buos`, so this is its exact position
        p[target] = searchsortedfirst( buos, p[target] )
    end
    return nothing
end
220+
221+
"""
    add_lonely_bus!( hp )

Move adults who are not immediate family of their benefit unit (bu) head into
new single-person benefit units, modifying `hp` in place.

For each current bu head, every other person aged 16 or over with the same
`default_benefit_unit` whose `relationship_<head pno>` value satisfies
`is_not_immediate_family` (say, a `Grand_parent`) is given the next unused
unit number and flagged as the head of that unit. A `missing` relationship
value is treated as `Missing_Relationship`.

FIXME won't work for couples: each qualifying person gets a unit of their own.
"""
function add_lonely_bus!( hp :: AbstractDataFrame )
    nbus = maximum( hp[:,:default_benefit_unit])
    # Copy of the head rows taken before anyone is reallocated, so people
    # promoted to head below are not iterated over as heads in this pass.
    buheads = hp[ hp.is_bu_head .== 1, : ]
    for b in eachrow(buheads)
        k = Symbol( "relationship_$(b.pno)")
        for p in eachrow(hp)
            p.pno == b.pno && continue
            p.default_benefit_unit == b.default_benefit_unit || continue # not nominally in this bu
            rel = ismissing( p[k] ) ? Missing_Relationship : Relationship( p[k] )
            if is_not_immediate_family( rel ) && (p.age >= 16)
                nbus += 1 # bump first so the message shows the unit actually assigned
                println( "adding $nbus for hh $(p.hid) age $(p.age) pno $(p.pno)")
                p.default_benefit_unit = nbus
                p.is_bu_head = 1
            end
        end
    end
    return nothing
end
222244

223-
"""
    get_relationships( hp )

Build a 15x15 `Relationship` matrix from the `relationship_<n>` columns of `hp`.

Entry `[j,i]` is row `j`'s `relationship_i` value converted to a `Relationship`.
Cells beyond the number of rows in `hp`, and cells whose source value is
`missing`, are left as `Missing_Relationship`.
"""
function get_relationships( hp :: AbstractDataFrame ) :: Matrix{Relationship}
    num_people = size(hp)[1]
    v = fill(Missing_Relationship,15,15)
    for i in 1:num_people
        k = Symbol("relationship_$i")
        for j in 1:num_people
            r = hp[j,k]
            # `Relationship(missing)` would throw; keep the fill value instead
            if ! ismissing( r )
                v[j,i] = Relationship( r )
            end
        end
    end
    return v
end
234256

235257

258+
"""
    print_relationships( m )

Pretty-print the populated top-left corner of a relationship matrix.

The corner size `n` is the number of entries in the first row before the first
`Missing_Relationship`; if the first row has none, the smaller dimension of `m`
is used. A final column holding 1..n is appended before printing.
"""
function print_relationships( m::Matrix{Relationship} )
    firstgap = findfirst( isequal( Missing_Relationship ), @view m[1,:] )
    # `findfirst` returns `nothing` when the first row has no missing entry
    n = isnothing( firstgap ) ? minimum( size( m )) : firstgap - 1
    hc = hcat(m[1:n,1:n ],collect(1:n))
    pretty_table( hc )
end
263+
264+
265+
236266
# Placeholder: intentionally does nothing yet; `hp` and `rels` are ignored.
function put_relationships!( hp :: AbstractDataFrame, rels :: Matrix{Int})
    return nothing
end
@@ -303,6 +333,14 @@ function do_initial_fixes!(hh::DataFrame, pers::DataFrame )
303333
if ! ismissing( p.highest_qualification ) && (p.highest_qualification == 0) # missing is -1 here, not zero
304334
p.highest_qualification = -1
305335
end
336+
if(p.age < 16) || ((p.from_child_record==1)&&(p.age < 20))
337+
p.is_hrp = 0
338+
if (! ismissing(p.is_bu_head)) && (p.is_bu_head == 1)
339+
println( "removing bu head for $(p.pno) aged $(p.age) hid=$(p.hid)")
340+
p.is_bu_head = 0
341+
p.default_benefit_unit = 1 # FIXME wild guess
342+
end
343+
end
306344
p.is_hrp = coalesce( p.is_hrp, 0 )
307345
# FIXME fixup all the relationships
308346
if p.is_hrp == 1
@@ -312,8 +350,8 @@ function do_initial_fixes!(hh::DataFrame, pers::DataFrame )
312350
#
313351
# Data in order - just makes inspection easier.
314352
#
315-
sort!( hh, [:data_year,:hid] )
316-
sort!( pers, [:data_year,:hid,:pno,:default_benefit_unit,:age])
353+
sort!( hh, [:hid] )
354+
sort!( pers, [:hid,:pno,:default_benefit_unit,:age])
317355
#
318356
# Kill a few annoying missings.
319357
#
@@ -370,7 +408,7 @@ function do_main_fixes!(hh::DataFrame,pers::DataFrame)
370408
@assert sum( hp[:,:is_hrp]) == 1 "!=1 hrp for $(thishh.hid)"
371409
# Fixup non-contiguous default BU allocations.
372410
if length(bus) !== maximum(bus)
373-
println( "non contig $(bus) $(thishh.hid)" )
411+
# println( "non contig $(bus) $(thishh.hid)" )
374412
fixup_bus!( hp, target=:default_benefit_unit )
375413
end
376414
# For each of these now nicely numbered bus, ensure 1 bu head.
@@ -380,23 +418,43 @@ function do_main_fixes!(hh::DataFrame,pers::DataFrame)
380418
nbusps += size( bu )[1]
381419
numheads = sum( bu[:,:is_bu_head])
382420
if numheads !== 1
383-
println( "numheads $numheads")
421+
# println( "numheads $numheads")
384422
assign_hrp!( bu; target=:is_bu_head )
385423
end
386424
end
387425
# this is very unfinished
388-
n_relationships_changed += fixup_relationships!(hp)
389426
@assert nbusps == size(hp)[1] "size mismatch for $(hp.hid)"
390-
end
427+
n_relationships_changed += fixup_relationships!(hp)
428+
for p in eachrow( hp )
429+
if(p.age < 16) || ((p.from_child_record==1)&&(p.age < 20))
430+
p.is_hrp = 0
431+
if (! ismissing(p.is_bu_head)) && (p.is_bu_head == 1)
432+
println( "#2 removing bu head for $(p.pno) aged $(p.age) hid=$(p.hid)")
433+
p.is_bu_head = 0
434+
p.default_benefit_unit = 1 # FIXME wild guess
435+
end
436+
end
437+
end
438+
add_lonely_bus!( hp )
439+
# endlessly repeated FIXME
440+
nbusps = 0
441+
# regroup bus
442+
hbus = groupby( hp, :default_benefit_unit )
443+
for bu in hbus
444+
nbusps += size( bu )[1]
445+
numheads = sum( bu[:,:is_bu_head])
446+
if numheads !== 1
447+
println( "numheads wrong for $numheads")
448+
assign_hrp!( bu; target=:is_bu_head )
449+
end
450+
end
451+
end # hh loop
452+
delete_irredemably_bad_hhs( hh, pers )
391453
end
392454

455+
"""
393456
394-
## TODO FIXUP relationship_x fields
395-
396-
#
397-
# Load synthetic datasets using default settings.
398-
#
399-
457+
"""
400458
function fixall!( hh::DataFrame, pers::DataFrame)
401459
settings = Settings()
402460
settings.dataset_type = synthetic_data
@@ -406,11 +464,9 @@ function fixall!( hh::DataFrame, pers::DataFrame)
406464
do_pers_idiot_checks( pers )
407465
# Delete working columns with the mostly.ai string primary keys - we've replaced them
408466
# with BigInts as in the actual data.
409-
#=
410467
select!( hh, Not(:uhidstr) )
411468
select!( pers, Not( :pidstr ))
412469
select!( pers, Not( :uhidstr ))
413-
=#
414470
# write synth files to default locations.
415471
ds = main_datasets( settings )
416472
CSV.write( ds.hhlds, hh; delim='\t' )

src/Definitions.jl

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1445,7 +1445,9 @@ export is_partner,
14451445
is_parent,
14461446
is_sibling,
14471447
is_other_relative,
1448-
is_non_relative
1448+
is_non_relative,
1449+
is_not_immediate_family,
1450+
reciprocal_relationship
14491451

14501452
is_partner( r :: Relationship ) = r in [
14511453
Spouse,
@@ -1466,10 +1468,52 @@ is_sibling( r :: Relationship ) = r in [
14661468
# True when `r` is one of the wider-family relationships listed below.
function is_other_relative( r :: Relationship )
    return r in (
        Parent_in_law,
        Son_in_law_or_daughter_in_law,
        Brother_or_sister_in_law,
        Grand_child,
        Grand_parent,
        Other_relative )
end
14721475
# True only for `Other_non_relative`.
function is_non_relative( r :: Relationship )
    return r == Other_non_relative
end
1476+
# True when `r` is an "other relative", a non-relative, or `Missing_Relationship`.
function is_not_immediate_family( r :: Relationship )
    if is_other_relative( r )
        return true
    elseif is_non_relative( r )
        return true
    end
    return r == Missing_Relationship
end
1477+
1478+
"""
    reciprocal_relationship( relationship )

Return the relationship seen from the other side: if A's relationship to B is
`relationship`, the result is what B's relationship to A should be.

Symmetric cases (`This_Person`, partners, siblings, `Other_relative`,
`Other_non_relative`, `Brother_or_sister_in_law`, `Missing_Relationship`) map to
themselves. Child/parent, foster, step, in-law and grandparent/grandchild
relationships map to their counterpart.

Throws an `ErrorException` for a value that has no mapping.
"""
function reciprocal_relationship( relationship :: Relationship ) :: Relationship
    if relationship == This_Person || is_partner( relationship )
        return relationship
    elseif relationship == Son_or_daughter_incl_adopted
        return Parent
    elseif relationship == Foster_child
        return Foster_parent
    elseif relationship == Step_son_or_daughter
        return Step_parent
    elseif relationship == Parent
        return Son_or_daughter_incl_adopted
    elseif relationship == Foster_parent
        return Foster_child
    elseif relationship == Step_parent
        return Step_son_or_daughter
    elseif is_sibling( relationship )
        return relationship
    elseif relationship == Parent_in_law
        return Son_in_law_or_daughter_in_law
    elseif relationship == Son_in_law_or_daughter_in_law
        return Parent_in_law
    elseif relationship == Grand_child
        return Grand_parent
    elseif relationship == Grand_parent
        return Grand_child
    elseif relationship in (
            Other_relative,
            Other_non_relative,
            Brother_or_sister_in_law,
            Missing_Relationship )
        return relationship
    end
    # Raised unconditionally rather than with `@assert false`: assertions may be
    # disabled, and falling through would then return `nothing`.
    error( "unmapped $relationship" )
end
14731517

14741518
export Employment_Type # mapped from etype
14751519
export An_Employee,

0 commit comments

Comments
 (0)