Skip to content

Commit 1383518

Browse files
committed
[#137] Add an integration test for blocking or_groups
1 parent c370f36 commit 1383518

File tree

5 files changed

+208
-0
lines changed

5 files changed

+208
-0
lines changed

hlink/tests/conftest.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1160,6 +1160,63 @@ def blocking_explode_conf(spark, conf):
11601160
return conf
11611161

11621162

1163+
@pytest.fixture(scope="function")
def blocking_or_groups_conf(spark, conf):
    """Build a config for testing the or_groups blocking functionality.

    Sets the "column_mappings", "blocking", "comparison_features", and
    "comparisons" entries on ``conf`` in place and returns the same object.
    The ``spark`` argument is not used by the body.
    """

    def jw_threshold(feature_name):
        # Each call returns a fresh dict, so the comparisons share no state.
        return {
            "feature_name": feature_name,
            "threshold": 0.8,
            "comparison_type": "threshold",
        }

    mapped_columns = (
        "namefrst",
        "namelast",
        "birthyr",
        "sex",
        "bpl1",
        "bpl2",
        "bpl3",
    )
    conf["column_mappings"] = [
        {"column_name": column} for column in mapped_columns
    ]

    # birthyr_3 is the only member of the "birthyr" or_group, sex belongs to
    # no or_group, and the three bpl columns share the "bpl" or_group.
    exploded_birthyr = {
        "column_name": "birthyr_3",
        "dataset": "a",
        "derived_from": "birthyr",
        "expand_length": 3,
        "explode": True,
        "or_group": "birthyr",
    }
    birthplace_rules = [
        {"column_name": column, "or_group": "bpl"}
        for column in ("bpl1", "bpl2", "bpl3")
    ]
    conf["blocking"] = [
        exploded_birthyr,
        {"column_name": "sex"},
        *birthplace_rules,
    ]

    name_features = (
        ("namefrst_jw", "namefrst"),
        ("namelast_jw", "namelast"),
    )
    conf["comparison_features"] = [
        {
            "alias": alias,
            "column_name": column,
            "comparison_type": "jaro_winkler",
        }
        for alias, column in name_features
    ]

    # Both name similarity thresholds are combined with AND.
    conf["comparisons"] = {
        "comp_a": jw_threshold("namefrst_jw"),
        "comp_b": jw_threshold("namelast_jw"),
        "operator": "AND",
    }
    return conf
1218+
1219+
11631220
@pytest.fixture(scope="function")
11641221
def matching_household_conf(
11651222
spark, conf, datasource_real_households, preprocessing, matching
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
id,namefrst,namelast,birthyr,sex,bpl1,bpl2,bpl3
2+
b5689d06-edd3-498e-8b5b-e04f2fa2f2a9,Catherine,Beebe,1866,2,,,
3+
a7118f06-949d-4d02-be0a-db33a6f8f3a8,Frances E,Bird,1870,2,,,
4+
85d089c0-b907-4d9c-95ab-c5fa4a3dd2bb,J S,Luff,1861,1,,,
5+
cddd9455-48e0-4b48-89a5-9ee315e00087,John,Smith,1884,1,,,
6+
8cb74256-6dfa-4d17-913a-59fa646c388a,Saml H,Russell,1833,1,,,
7+
1f8e1a74-d486-44ad-8d5c-51aedf86208e,Charles,Robertson,1884,1,,,
8+
61a1590f-1d3a-4666-8406-3d4aaf0770b4,John,Dickinson,1868,1,,,
9+
92277f0b-1476-41f5-9dc8-bf83672616d0,Joseph,Shissler,1874,1,,,
10+
322291a1-de91-439d-bba0-45fc2f47a2eb,David,Hall,1839,1,,,
11+
136f7105-ff59-4eac-9d95-44b002cbb448,John,Decame,1858,1,,,
12+
1138ab41-e234-4c72-b812-eaaf0fc5f76c,Nancy,Decame,1857,2,,,
13+
066ea4e1-f340-4231-b505-ec7bb9a07103,Peter N,Decame,1895,1,,,
14+
b7d96336-404e-490c-8c45-61f2287b52ff,Annam,Decame,1897,2,,,
15+
24bdff6a-5590-4494-8e8a-ac4a549c8890,Sarah,Decame,1900,2,,,
16+
c1fedaab-f026-4aa4-9320-e10f2432d539,James,Carney,1888,1,,,
17+
43a6ebe5-752b-4054-818d-6f6f75cc89e7,Alfred,Dell,1883,1,,,
18+
0d693015-2349-4363-9667-45036af7d0db,Chas,Syaex,1870,1,,,
19+
1d586e26-aac1-49df-a2ad-fe0a385a26bf,Sarah,Russell,1897,2,,,
20+
93b7ac89-f9db-49b2-a1f2-c189fecc14ae,Wm H,Hazard,1881,1,,,
21+
e51c36c9-570c-466d-aac1-bf380c9c20f1,Martha,Hazard,1880,2,,,
22+
9250341a-8336-494a-bc84-2b803efe64c6,Willie May,Hazard,1902,2,,,
23+
a70679f0-9313-4ef3-bf87-5dfe81beed5d,Samuel,Hazard,1906,2,,,
24+
4715bbf6-d3e2-4260-9ddd-6aece147e5c1,Samuel,Morgan,1878,1,,,
25+
77378570-5214-4ac5-8258-c5156e8b99b3,J Clauson,Mcfarland,1890,1,,,
26+
6542b541-6e10-411f-9b2a-7c0b93b0aa68,Eugene,Mcfarland,1892,1,,,
27+
396c4077-6a70-4a17-97fb-f8a0c06fdafe,Anna,Preston,1871,2,,,
28+
7e9dde5e-3fad-4b2e-b367-643c0dc8cabb,Rebecca N,Alexander,1861,2,,,
29+
f7d9e25f-c390-4222-ac24-4e93d72daa05,Martha,Ellis,1873,2,,,
30+
24b7afa1-8c49-4833-8292-c545c85d3b89,Otillia,Zeider,1876,2,,,
31+
4b416874-0c5c-4233-81ec-39223bc66f4f,Mary,Doyle,1846,2,,,
32+
a499b0dc-7ac0-4d61-b493-91a3036c712e ,ANNIE ,FAUBLE ,1884,2,1,,
33+
ae7261c3-7d71-4ea1-997f-5d1a68c18777 ,MARY ,REESE ,1875,2,,,
34+
ad6442b5-42bc-4c2e-a517-5a951d989a92 ,MARY ,REESE ,1899,2,1,2,3
35+
b0b6695f-dfa5-4e4d-bc75-798c27195fff ,SALLY ,REESE ,1901,2,,,
36+
9e807937-de09-414c-bfb2-ac821e112929 ,JOHN ,SHIELDS ,1880,1,1,,
37+
426f2cbe-32e1-45eb-9f86-89a2b9116b7e ,ANNE ,FAUBLE ,1884,2,,,
38+
a76697d9-b0c8-4774-bc3e-12a7e403c7e6 ,JOHN ,COLLINS ,1893,1,,,
39+
3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ,MAGGIE ,COLLINS ,1894,2,,,
40+
49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ,MARY ,COLLINS ,1898,2,,,
41+
50b33ef6-259d-43af-8cdc-56a61f881169 ,WILLIAM H. ,SEWARD ,1856,1,,4,
42+
952754a5-48b4-462a-ac57-e4a059a9ef98 ,ESTHER ,BIERHAHN ,1870,2,,,
43+
ea6d77b3-2e2d-4c59-a0ac-6b297e8898e3 ,CHARLES ,CLEVELAND ,1865,1,,,
44+
60a5052e-6d67-455a-a3aa-bb79560c7d8d ,SUSAN ,WILSON ,1850,2,,,
45+
0d4472ec-6378-4aeb-b6c7-17e1c388bb94 ,ARCHER ,HARVEY ,1890,1,,,
46+
65ccbeb7-2c79-4fb0-b354-c67f150ad80c ,ELIZABETH ,MC LEAN ,1868,2,,,
47+
72cbe5fa-f558-4393-8423-1842fadf7f11 ,MARY A. ,FLEMMING ,1837,2,,,
48+
44693008-fd6f-48fe-9c52-e6c07baff361 ,BESSIE ,CHAMBERS ,1908,2,,,
49+
bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ,THOMAS ,GRAHAM ,1846,1,,,
50+
a7b10530-b7c9-44d5-9125-c603f392d6d3 ,EDWARD ,DEKAY ,1875,1,,,
51+
1e635c1c-7faa-4270-acf3-a22635884b90 ,NATHEN ,THORPE ,1836,1,,,
52+
d3217545-3453-4d96-86c0-d6a3e60fb2f8 ,JOB ,FOSTER ,1884,1,,,
53+
2a35bae5-3120-4e2c-87da-694d4419c9ce ,JEZEBEL ,FOSTER ,1888,2,,,
54+
94460fc2-954b-469d-9726-f7126c30e5e2 ,ELIZA ,GOODWIN ,1871,2,,,
55+
620b6ebb-82e6-42db-8aae-300ca2be0c00 ,MARY ,GOODWIN ,1893,2,,,
56+
bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ,JO ,GOODWIN ,1895,1,,6,7
57+
7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ,PHINEAS ,TAYLOR ,1871,1,,5,
58+
a0f33b36-cef7-4949-a031-22b90f1055d4 ,MARY A. ,LORD ,1856,2,,,1
59+
1a76745c-acf8-48a0-9992-7fb10c11710b ,E.B. ,ALLEN ,1889,1,,,
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
id,namefrst,namelast,birthyr,sex,bpl1,bpl2,bpl3
2+
a499b0dc-7ac0-4d61-b493-91a3036c712e ,ANNIE ,FAUBLE ,1884,2,1,,
3+
ae7261c3-7d71-4ea1-997f-5d1a68c18777 ,MARY ,REESE ,1875,2,,,
4+
ad6442b5-42bc-4c2e-a517-5a951d989a92 ,MARY ,REESE ,1902,2,1,2,3
5+
9e807937-de09-414c-bfb2-ac821e112929 ,JOHN ,SHIELDS ,1889,1,1,,
6+
426f2cbe-32e1-45eb-9f86-89a2b9116b7e ,ANNE ,FAUBLE ,1884,2,,,
7+
a76697d9-b0c8-4774-bc3e-12a7e403c7e6 ,JOHN ,COLLINS ,1893,1,,,
8+
3575c9ba-1527-4ca2-aff0-d7c2d1efb421 ,MAGGIE ,COLLINS ,1894,2,,,
9+
49e53dbc-fe8e-4e55-8cb9-a1d93c284d98 ,MARY ,COLLINS ,1898,2,,,
10+
50b33ef6-259d-43af-8cdc-56a61f881169 ,WILLIAM H. ,SEWARD ,1866,1,,4,
11+
952754a5-48b4-462a-ac57-e4a059a9ef98 ,ESTHER ,BIERHAHN ,1870,2,,,
12+
ea6d77b3-2e2d-4c59-a0ac-6b297e8898e3 ,CHARLES ,CLEVELAND ,1865,1,,,
13+
60a5052e-6d67-455a-a3aa-bb79560c7d8d ,SUSAN ,WILSON ,1850,2,,,
14+
0d4472ec-6378-4aeb-b6c7-17e1c388bb94 ,ARCHER ,HARVEY ,1893,1,,,
15+
65ccbeb7-2c79-4fb0-b354-c67f150ad80c ,ELIZABETH ,MC LEAN ,1868,2,,,
16+
72cbe5fa-f558-4393-8423-1842fadf7f11 ,MARY A. ,FLEMMING ,1842,2,,,
17+
bcc0988e-2397-4f1b-8e76-4bfe1b05dbc6 ,THOMAS ,GRAHAM ,1846,1,,,
18+
a7b10530-b7c9-44d5-9125-c603f392d6d3 ,EDWARD ,DEKAY ,1875,1,,,
19+
1e635c1c-7faa-4270-acf3-a22635884b90 ,NATHEN ,THORPE ,1836,1,,,
20+
d3217545-3453-4d96-86c0-d6a3e60fb2f8 ,JOB ,FOSTER ,1884,1,,,
21+
2a35bae5-3120-4e2c-87da-694d4419c9ce ,JEZEBEL ,FOSTER ,1888,2,,,
22+
94460fc2-954b-469d-9726-f7126c30e5e2 ,ELIZA ,GOODWIN ,1871,2,,,
23+
620b6ebb-82e6-42db-8aae-300ca2be0c00 ,MARY ,GOODWIN ,1893,2,,,
24+
bfe1080e-2e67-4a8c-a6e1-ed94ea103712 ,JO ,GOODWIN ,1890,1,,6,7
25+
7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ,PHINEAS ,TAYLOR ,1871,1,,5,
26+
a0f33b36-cef7-4949-a031-22b90f1055d4 ,MARY A. ,LORD ,1856,2,,,1
27+
1a76745c-acf8-48a0-9992-7fb10c11710b ,E.B. ,ALLEN ,1889,1,,,

hlink/tests/matching_blocking_explode_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# in this project's top-level directory, and also on-line at:
44
# https://github.com/ipums/hlink
55

6+
from pyspark.sql import Row
67
import pytest
78
import pandas as pd
89
from hlink.linking.matching.link_step_match import extract_or_groups_from_blocking
@@ -123,6 +124,42 @@ def test_blocking_multi_layer_comparison(
123124
) or (row["namelast_jw_x"] < 0.7)
124125

125126

127+
def test_blocking_or_groups(
    spark, blocking_or_groups_conf, matching_or_groups_test_input, matching
):
    """Test the blocking or_group functionality.

    This feature supports combining some or all blocking conditions with OR
    instead of AND. The test registers the two input tables as
    prepped_df_a / prepped_df_b, runs matching steps 0 and 1, and checks the
    exact set of (id_a, id_b) pairs found in potential_matches.
    """
    prepped_a, prepped_b = matching_or_groups_test_input
    for view_name, frame in (
        ("prepped_df_a", prepped_a),
        ("prepped_df_b", prepped_b),
    ):
        frame.createOrReplaceTempView(view_name)

    for step_index in (0, 1):
        matching.run_step(step_index)

    found_pairs = set(
        matching.spark.table("potential_matches")
        .select("id_a", "id_b")
        .collect()
    )

    # Every expected pair links an id to the identical id on the other side.
    # The ids keep their trailing space; it is part of the compared string.
    expected_ids = (
        "ad6442b5-42bc-4c2e-a517-5a951d989a92 ",
        "a499b0dc-7ac0-4d61-b493-91a3036c712e ",
        "7fb55d25-2a7d-486d-9efa-27b9d7e60c24 ",
        "a0f33b36-cef7-4949-a031-22b90f1055d4 ",
    )
    expected_pairs = {
        Row(id_a=record_id, id_b=record_id) for record_id in expected_ids
    }

    assert found_pairs == expected_pairs
161+
162+
126163
# TODO: test_step_2_length_b
127164

128165
# TODO: test_step_2_has_matching_element

hlink/tests/plugins/datasources.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,34 @@ def matching_test_input(spark, conf, tmpdir_factory):
879879
return pdfa, pdfb
880880

881881

882+
@pytest.fixture(scope="function")
def matching_or_groups_test_input(spark):
    """Read the two or_group matching test CSV files into Spark dataframes.

    Both files are resolved relative to the directory of the hlink.tests
    package and read with the same explicit schema: three string columns
    followed by five long columns, all nullable.

    Returns a tuple of (dataframe for file a, dataframe for file b).
    """
    tests_dir = os.path.dirname(hlink.tests.__file__)

    string_columns = ("id", "namefrst", "namelast")
    long_columns = ("birthyr", "sex", "bpl1", "bpl2", "bpl3")
    schema = StructType(
        [StructField(name, StringType(), True) for name in string_columns]
        + [StructField(name, LongType(), True) for name in long_columns]
    )

    def read_prepped(relative_path):
        # NOTE(review): no header option is passed to the reader, so a header
        # line in the CSV would be read as a data row -- confirm this is
        # intended.
        return spark.read.csv(os.path.join(tests_dir, relative_path), schema)

    return (
        read_prepped("input_data/matching_or_group_test_a.csv"),
        read_prepped("input_data/matching_or_group_test_b.csv"),
    )
908+
909+
882910
@pytest.fixture(scope="function")
883911
def datasource_mi_comparison(spark, conf):
884912
"""Create the prepped_df_(a/b) dataframes and populate basic config values"""

0 commit comments

Comments
 (0)