|
3 | 3 | # in this project's top-level directory, and also on-line at:
|
4 | 4 | # https://github.com/ipums/hlink
|
5 | 5 |
|
| 6 | +from collections import defaultdict |
6 | 7 | import logging
|
7 | 8 | from typing import Any
|
8 | 9 |
|
@@ -45,9 +46,18 @@ def extract_or_groups_from_blocking(blocking: list[dict[str, Any]]) -> list[list
|
45 | 46 | ```
|
46 | 47 |
|
47 | 48 | This function returns a list of or_groups, each of which is a list of
|
48 |
| - column names. |
| 49 | + column names. It maintains the input order except that the implicit |
| 50 | + or_groups are all placed after the explicit or_groups. |
49 | 51 | """
|
50 |
| - raise NotImplementedError() |
| 52 | + or_groups: defaultdict[str | None, list[str]] = defaultdict(list) |
| 53 | + |
| 54 | + for blocking_table in blocking: |
| 55 | + column_name = blocking_table["column_name"] |
| 56 | + or_group = blocking_table.get("or_group") |
| 57 | + or_groups[or_group].append(column_name) |
| 58 | + |
| 59 | + implicit_or_groups = [[column_name] for column_name in or_groups.pop(None, [])] |
| 60 | + return list(or_groups.values()) + implicit_or_groups |
51 | 61 |
|
52 | 62 |
|
53 | 63 | class LinkStepMatch(LinkStep):
|
@@ -82,7 +92,7 @@ def _run(self):
|
82 | 92 | config["id_column"],
|
83 | 93 | )
|
84 | 94 |
|
85 |
| - t_ctx["blocking_columns"] = [[bc["column_name"]] for bc in blocking] |
| 95 | + t_ctx["blocking_columns"] = extract_or_groups_from_blocking(blocking) |
86 | 96 |
|
87 | 97 | blocking_exploded_columns = [
|
88 | 98 | bc["column_name"] for bc in blocking if "explode" in bc and bc["explode"]
|
|
0 commit comments