Skip to content

Commit c370f36

Browse files
committed
[#137] Implement extract_or_groups_from_blocking() and plug it into LinkStepMatch
1 parent bb75f46 commit c370f36

File tree

1 file changed

+13
-3
lines changed

1 file changed

+13
-3
lines changed

hlink/linking/matching/link_step_match.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
# in this project's top-level directory, and also on-line at:
44
# https://github.com/ipums/hlink
55

6+
from collections import defaultdict
67
import logging
78
from typing import Any
89

@@ -45,9 +46,18 @@ def extract_or_groups_from_blocking(blocking: list[dict[str, Any]]) -> list[list
4546
```
4647
4748
This function returns a list of or_groups, each of which is a list of
48-
column names.
49+
column names. It maintains the input order except that the implicit
50+
or_groups are all placed after the explicit or_groups.
4951
"""
50-
raise NotImplementedError()
52+
or_groups: defaultdict[str | None, list[str]] = defaultdict(list)
53+
54+
for blocking_table in blocking:
55+
column_name = blocking_table["column_name"]
56+
or_group = blocking_table.get("or_group")
57+
or_groups[or_group].append(column_name)
58+
59+
implicit_or_groups = [[column_name] for column_name in or_groups.pop(None, [])]
60+
return list(or_groups.values()) + implicit_or_groups
5161

5262

5363
class LinkStepMatch(LinkStep):
@@ -82,7 +92,7 @@ def _run(self):
8292
config["id_column"],
8393
)
8494

85-
t_ctx["blocking_columns"] = [[bc["column_name"]] for bc in blocking]
95+
t_ctx["blocking_columns"] = extract_or_groups_from_blocking(blocking)
8696

8797
blocking_exploded_columns = [
8898
bc["column_name"] for bc in blocking if "explode" in bc and bc["explode"]

0 commit comments

Comments
 (0)