Skip to content

Commit

Permalink
[#21] Rewrite a skipped matching aggregate features test
Browse files Browse the repository at this point in the history
This required some heavy refactoring. I think that this test was really old.
The new checks are very similar to the checks in the old test.
  • Loading branch information
riley-harper committed Jun 18, 2024
1 parent 94783ac commit d273ec3
Show file tree
Hide file tree
Showing 6 changed files with 167 additions and 126 deletions.
62 changes: 31 additions & 31 deletions hlink/tests/input_data/potential_matches_agg.csv
Original file line number Diff line number Diff line change
@@ -1,31 +1,31 @@
namelast_clean_a,namelast_clean_b,histid_a,histid_b,bpl_a,bpl_b,namefrst_unstd_a,namefrst_unstd_b,sex_a,sex_b,namefrst_jw,namelast_jw,regionf,state_distance,exact,exact_all
cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00
cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00
cridlebaugh,cridlebaugh,0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,3100,3100,gerald,gerald,1,1,1.0,1.0,4,0,1.00,1.00
symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00
symonds,symonds,1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,3600,horace,horace,1,1,1.0,1.0,2,0,1.00,1.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
abrahams,abrahams,095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,60094,isiah,isniah,1,1,0.9555555555555556,1.0,99,0,0.00,0.00
eilbatt,eilbott,6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,4700,4700,reginald,reginald,1,1,1.0,0.9428571428571428,6,0,1.00,0.00
knopke,knopke,EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,2100,andrew,andrew,1,1,1.0,1.0,6,0,1.00,1.00
caldwell,caldwell,AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,15010,15010,daisy,daisy,2,2,1.0,1.0,99,0,1.00,1.00
sonnenschein,sonnenschein,8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1700,1700,max,max,1,1,1.0,1.0,3,0,1.00,1.00
gibson,gebson,F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,5500,5500,dwight,dwight,1,1,1.0,0.9,3,0,1.00,0.00
hegewald,hegewald,D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,5600,karl,karl,1,1,1.0,1.0,8,0,1.00,1.00
king,king,CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,3800,virgel,virgil,1,1,0.9333333333333333,1.0,4,0,0.00,0.00
looney,looney,4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,4700,4700,sadie,sadye,2,2,0.9066666666666667,1.0,6,0,0.00,0.00
rydstrom,rydstrom,CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,1700,hubert,hubert,1,1,1.0,1.0,3,0,1.00,1.00
mugrdickian,mugrdichian,2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,3600,3600,misak,misak,1,1,1.0,0.977961432506887,2,0,1.00,0.00
brightman,brightman,195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,3900,austin,anstin,1,1,0.9,1.0,3,0,0.00,0.00
harman,harman,74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,5400,5400,eston,estan,1,1,0.9066666666666667,1.0,5,0,0.00,0.00
oglesby,oglesby,F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,4000,stephen,stephen,1,1,1.0,1.0,7,0,1.00,1.00
kassik,kassek,6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,5600,5600,james,james,1,1,1.0,0.9333333333333333,8,0,1.00,0.00
wood,wood,EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,1700,dudley,dudley,1,1,1.0,1.0,3,0,1.00,1.00
foulkrod,foulkrod,47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,4200,s,s,1,1,1.0,1.0,2,0,1.00,1.00
huges,hughes,7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,100,keneth,kenneth,1,1,0.9666666666666667,0.9611111111111111,6,0,0.00,0.00
caldwell,caldwell,A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,5000,nathan,nathan,1,1,1.0,1.0,1,0,1.00,1.00
platta,platts,E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1200,1200,norman,norman,1,1,1.0,0.9444444444444444,5,0,1.00,0.00
lipscomb,lipscomb,671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,1300,roy,roy,1,1,1.0,1.0,5,0,1.00,1.00
woodburne,woodburn,81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,200,walter,walter,1,1,1.0,0.9925925925925926,9,0,1.00,0.00
histid_a,histid_b,namefrst_jw,namelast_jw,regionf,state_distance
0202928A-AC3E-48BB-8568-3372067F35C7,002B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0
0202928A-AC3E-48BB-8568-3372067F35C7,003B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0
0202928A-AC3E-48BB-8568-3372067F35C7,001B8A74-3795-4997-BC5B-2A07257668F9,1.0,1.0,4,0
1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00427A22-FF1E-400A-9A8A-1752A60BE7CF,1.0,1.0,2,0
1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,00327A22-FF1E-400A-9A8A-1752A60BE7CF,1.0,1.0,2,0
095AD921-9B08-468E-817A-44879FBCADDE,01620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
095AD921-9B08-468E-817A-44879FBCADDE,02620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
095AD921-9B08-468E-817A-44879FBCADDE,03620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
095AD921-9B08-468E-817A-44879FBCADDE,00620FE0-E907-47F4-9368-5B14EBF69BE4,0.9555555555555556,1.0,99,0
6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,00669345-C937-4405-A0F0-1FCA5204DF64,1.0,0.9428571428571428,6,0
EAD03D68-F21D-4A74-8C16-F9123F5288D7,007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,1.0,1.0,6,0
AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,00849961-E52F-42F2-9B70-052606223052,1.0,1.0,99,0
8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,00C4291F-7064-4A81-8589-5854C367EEC4,1.0,1.0,3,0
F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,010F244F-94D0-4295-82DB-0E172724358A,1.0,0.9,3,0
D30C40B9-2E7C-4933-84CE-CEAAB37E3209,01230024-F3C6-4D4A-86DF-F9EF29F77292,1.0,1.0,8,0
CCBA170F-93D0-42C3-A57B-CCABBF2772FB,0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,0.9333333333333333,1.0,4,0
4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,016EF43B-E70F-440E-882E-E447663F682F,0.9066666666666667,1.0,6,0
CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,018C7B94-4387-4149-9B2D-CA7BB18AA559,1.0,1.0,3,0
2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,019D26A0-0335-48B5-A6D6-1D499424BE84,1.0,0.977961432506887,2,0
195EA695-D047-4045-8757-E7A22F12E148,0269E114-0EDD-4767-AE9F-B6557CD880EE,0.9,1.0,3,0
74941094-9737-40F0-BF3C-0C2380B08040,0282109F-581C-4B8E-A99D-135CF0077C2E,0.9066666666666667,1.0,5,0
F0F34E2F-49CC-4F06-8CC4-691CF3150244,02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,1.0,1.0,7,0
6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,033FD0FA-C523-42B5-976A-751E830F7021,1.0,0.9333333333333333,8,0
EE22ED8E-9544-4C77-A689-75895376E3EB,0350987D-D6B3-4519-A7C0-96C5B45111D6,1.0,1.0,3,0
47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,03522784-6097-4A7A-A54E-C6AA8E22BF20,1.0,1.0,2,0
7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,0.9666666666666667,0.9611111111111111,6,0
A859D9BC-6106-43A2-8A47-B12D9D2C49C8,039AE50E-84E7-49A7-B720-48D2C765C5D5,1.0,1.0,1,0
E19E5381-C68D-4E03-A688-597DF13311CE,03B89FD5-872A-4504-9758-F5AA1607BA01,1.0,0.9444444444444444,5,0
671DE512-479B-4EEB-85B4-93A848E6BDD7,03DD4EB7-0FA7-4AA4-A510-79448E316A43,1.0,1.0,5,0
81E992C0-3796-4BE7-B02E-9CAD0289C6EC,03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,1.0,0.9925925925925926,9,0
25 changes: 25 additions & 0 deletions hlink/tests/input_data/prepped_df_a_agg.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
histid,bpl,namelast_clean,namefrst_unstd,sex
0202928A-AC3E-48BB-8568-3372067F35C7,3100,cridlebaugh,gerald,1
1E5D9C44-3D8E-40F8-A843-8E7619CF7B94,3600,symonds,horace,1
095AD921-9B08-468E-817A-44879FBCADDE,60094,abrahams,isiah,1
6F6D3D9A-A2C4-471C-A19A-5EFE90AAA5C7,4700,eilbatt,reginald,1
EAD03D68-F21D-4A74-8C16-F9123F5288D7,2100,knopke,andrew,1
AF3C7686-98EF-46F5-B5DF-DE8CC50A93DC,15010,caldwell,daisy,2
8A50FA06-BAF8-4EC5-9726-2EB3551CD6D1,1700,sonnenschein,max,1
F2798AB4-3217-4D0A-A6A0-6B390A3C4B7A,5500,gibson,dwight,1
D30C40B9-2E7C-4933-84CE-CEAAB37E3209,5600,hegewald,karl,1
CCBA170F-93D0-42C3-A57B-CCABBF2772FB,3800,king,virgel,1
4F29F4B1-F953-4FC8-A7FB-42F54FB51E73,4700,looney,sadie,2
CC7B3BF9-AEE5-4ECB-9F5D-9F910346B9CD,1700,rydstrom,hubert,1
2CC7B61B-6821-4B2B-A283-8FE8D557D6F6,3600,mugrdickian,misak,1
195EA695-D047-4045-8757-E7A22F12E148,3900,brightman,austin,1
74941094-9737-40F0-BF3C-0C2380B08040,5400,harman,eston,1
F0F34E2F-49CC-4F06-8CC4-691CF3150244,4000,oglesby,stephen,1
6EB222E3-EB8F-4E20-BCE0-2C12F926ABB1,5600,kassik,james,1
EE22ED8E-9544-4C77-A689-75895376E3EB,1700,wood,dudley,1
47DB90F0-6A7B-421F-9B18-CAB1CFA45E71,4200,foulkrod,s,1
7E20FBBE-9B4B-4FAB-9433-CB77D9E6B022,100,huges,keneth,1
A859D9BC-6106-43A2-8A47-B12D9D2C49C8,5000,caldwell,nathan,1
E19E5381-C68D-4E03-A688-597DF13311CE,1200,platta,norman,1
671DE512-479B-4EEB-85B4-93A848E6BDD7,1300,lipscomb,roy,1
81E992C0-3796-4BE7-B02E-9CAD0289C6EC,200,woodburne,walter,1
31 changes: 31 additions & 0 deletions hlink/tests/input_data/prepped_df_b_agg.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
histid,bpl,namelast_clean,namefrst_unstd,sex
001B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1
002B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1
003B8A74-3795-4997-BC5B-2A07257668F9,3100,cridlebaugh,gerald,1
00327A22-FF1E-400A-9A8A-1752A60BE7CF,3600,symonds,horace,1
00427A22-FF1E-400A-9A8A-1752A60BE7CF,3600,symonds,horace,1
01620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
02620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
03620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
00620FE0-E907-47F4-9368-5B14EBF69BE4,60094,abrahams,isniah,1
00669345-C937-4405-A0F0-1FCA5204DF64,4700,eilbott,reginald,1
007EDAE5-BDEF-4819-969E-7DAA8DC3FDD6,2100,knopke,andrew,1
00849961-E52F-42F2-9B70-052606223052,15010,caldwell,daisy,2
00C4291F-7064-4A81-8589-5854C367EEC4,1700,sonnenschein,max,1
010F244F-94D0-4295-82DB-0E172724358A,5500,gebson,dwight,1
01230024-F3C6-4D4A-86DF-F9EF29F77292,5600,hegewald,karl,1
0141C5C9-32DC-469E-862C-A6CCD8A2EB2B,3800,king,virgil,1
016EF43B-E70F-440E-882E-E447663F682F,4700,looney,sadye,2
018C7B94-4387-4149-9B2D-CA7BB18AA559,1700,rydstrom,hubert,1
019D26A0-0335-48B5-A6D6-1D499424BE84,3600,mugrdichian,misak,1
0269E114-0EDD-4767-AE9F-B6557CD880EE,3900,brightman,anstin,1
0282109F-581C-4B8E-A99D-135CF0077C2E,5400,harman,estan,1
02A06F96-AAD4-4EE2-B20B-CD1A4ED33D46,4000,oglesby,stephen,1
033FD0FA-C523-42B5-976A-751E830F7021,5600,kassek,james,1
0350987D-D6B3-4519-A7C0-96C5B45111D6,1700,wood,dudley,1
03522784-6097-4A7A-A54E-C6AA8E22BF20,4200,foulkrod,s,1
038F08DA-12C8-4AF2-B5DD-43BB2A58DAA1,100,hughes,kenneth,1
039AE50E-84E7-49A7-B720-48D2C765C5D5,5000,caldwell,nathan,1
03B89FD5-872A-4504-9758-F5AA1607BA01,1200,platts,norman,1
03DD4EB7-0FA7-4AA4-A510-79448E316A43,1300,lipscomb,roy,1
03FFD04A-DC09-47EC-84EF-A0DD3E9C0528,200,woodburn,walter,1
89 changes: 0 additions & 89 deletions hlink/tests/matching_potential_matches_test.py

This file was deleted.

71 changes: 71 additions & 0 deletions hlink/tests/matching_scoring_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,74 @@ def test_step_3_alpha_beta_thresholds(

assert tp.query("histid_a == '5a' and histid_b == '7b'")["prediction"].iloc[0] == 1
assert tp.query("histid_a == '5a' and histid_b == '6b'")["prediction"].iloc[0] == 0


def test_step_2_aggregate_features(
    spark, matching_conf, matching, agg_features_datasources
):
    """Check the aggregate feature columns produced by LinkStepScore._create_features.

    The three CSV files named by ``agg_features_datasources`` are loaded into
    the ``potential_matches``, ``prepped_df_a`` and ``prepped_df_b`` tables.
    After feature creation, the ``potential_matches_prepped`` table is read
    back and a single (histid_a, histid_b) row is inspected.
    """
    jw_feature = {
        "alias": "namelast_jw",
        "column_name": "namelast",
        "comparison_type": "jaro_winkler",
    }
    exact_feature = {
        "alias": "exact",
        "column_names": ["namefrst_unstd", "namelast_clean"],
        "comparison_type": "all_equals",
    }
    exact_all_feature = {
        "alias": "exact_all",
        "column_names": ["namefrst_unstd", "namelast_clean", "bpl"],
        "comparison_type": "all_equals",
    }
    independent_vars = [
        "namelast_jw",
        "exact",
        "exact_all",
        "hits",
        "hits2",
        "exact_mult",
        "exact_all_mult",
        "exact_all_mult2",
    ]

    matching_conf["id_column"] = "histid"
    matching_conf["comparison_features"] = [
        jw_feature,
        exact_feature,
        exact_all_feature,
    ]
    matching_conf["training"] = {
        "independent_vars": independent_vars,
        "chosen_model": {
            "type": "probit",
            "threshold": 0.5,
        },
        "dependent_var": "match",
    }

    # Unpack first so a fixture of the wrong length still fails loudly here.
    potential_matches_path, prepped_df_a_path, prepped_df_b_path = (
        agg_features_datasources
    )
    csv_sources = (
        (potential_matches_path, "potential_matches"),
        (prepped_df_a_path, "prepped_df_a"),
        (prepped_df_b_path, "prepped_df_b"),
    )
    for csv_path, table_name in csv_sources:
        frame = spark.read.csv(csv_path, header=True, inferSchema=True)
        frame.write.mode("overwrite").saveAsTable(table_name)

    LinkStepScore(matching)._create_features(matching_conf)

    prepped = spark.table("potential_matches_prepped").toPandas()
    row = prepped.query(
        "histid_a == '0202928A-AC3E-48BB-8568-3372067F35C7' "
        "and histid_b == '001B8A74-3795-4997-BC5B-2A07257668F9'"
    )

    def scalar(column):
        # Series.item() raises unless the query matched exactly one row.
        return row[column].item()

    # NOTE(review): the expected 3 / 9 values presumably reflect the three
    # candidate rows this histid_a has in the potential matches input, with
    # the "2" columns being the square -- confirm against _create_features.
    assert scalar("exact")
    assert scalar("exact_all")
    assert scalar("hits") == 3
    assert scalar("hits2") == 9
    assert scalar("exact_mult")
    assert scalar("exact_all_mult") == 3
    assert scalar("exact_all_mult2") == 9
15 changes: 9 additions & 6 deletions hlink/tests/plugins/external_data_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,18 @@ def potential_matches_path_ids_only(spark):


@pytest.fixture(scope="module")
def potential_matches_agg_path(spark):
"""Create a fixture with the path to the test potential_matches csv file"""

path = "input_data/potential_matches_agg.csv"
def agg_features_datasources() -> tuple[str, str, str]:
"""Return the path to the potential_matches, prepped_df_a, and prepped_df_b csv data files."""
potential_matches_path = "input_data/potential_matches_agg.csv"
prepped_df_a_path = "input_data/prepped_df_a_agg.csv"
prepped_df_b_path = "input_data/prepped_df_b_agg.csv"

package_path = os.path.dirname(hlink.tests.__file__)
full_path = os.path.join(package_path, path)
full_pm_path = os.path.join(package_path, potential_matches_path)
full_prepped_a_path = os.path.join(package_path, prepped_df_a_path)
full_prepped_b_path = os.path.join(package_path, prepped_df_b_path)

return full_path
return full_pm_path, full_prepped_a_path, full_prepped_b_path


@pytest.fixture(scope="module")
Expand Down

0 comments on commit d273ec3

Please sign in to comment.