-

Feature Selection transforms

-

Each header below represents a feature selection transform. These transforms are used in the context of feature_selections.

-
[[feature_selections]]
input_column = "clean_birthyr"
output_column = "replaced_birthyr"
condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
transform = "sql_condition"
-
-
-

There are some additional attributes available for all transforms: checkpoint, override_column_a, override_column_b, set_value_column_a, set_value_column_b.

+

Feature Selection Transforms

+

Each feature selection in the [[feature_selections]] list must have a transform attribute which tells hlink which transform it uses. The available feature selection transforms are listed below. The attributes of the feature selection often vary with the feature selection transform. However, there are a few utility attributes which are available for all transforms:

+
  • override_column_a - Type: string. Optional. Given the name of a column in dataset A, copy that column into the output column instead of computing the feature selection for dataset A. This does not affect dataset B.

  • override_column_b - Type: string. Optional. Given the name of a column in dataset B, copy that column into the output column instead of computing the feature selection for dataset B. This does not affect dataset A.

  • set_value_column_a - Type: any. Optional. Instead of computing the feature selection for dataset A, use the given value for every row in the output column. This does not affect dataset B.

  • set_value_column_b - Type: any. Optional. Instead of computing the feature selection for dataset B, use the given value for every row in the output column. This does not affect dataset A.

  • checkpoint - Type: boolean. Optional. If set to true, checkpoint the dataset in Spark before computing the feature selection. This can reduce some resource usage for very complex workflows, but should not be necessary.

bigrams

Split the given string column into bigrams.

diff --git a/docs/objects.inv b/docs/objects.inv index 1fe6cd1df376b9375c621a27eca302534c7e09ce..cdb55e660a0ded999710629d753fcc1e8df1b482 100644 GIT binary patch delta 420 zcmV;V0bBl%1e64jdVfo993c>f_ne}<&qVSzyC{mKNU=1sEqRH`O%2@|UgD8>cD{zM zmy;NpX1a+-_71-PuLlJTRwZq?M&~9Y1B58P2|K{<8iro_I!3MF#|+_ifo!nap`U^^ zJf0l=m!Q{LhDKqgUm$uk;b6T66k=yhMku8xiILzy7?UqHG=Bm{n*xnii3^V0i#bfP z@1T%Yx2%)LI9bD%ARb83PzXPF(VgKVU(I_{9dO?q@Njl8NoF#Fp68SbpGcLYjn)Le z%95?NrSit{NGi{gAX*sJWRCw{UIS!X(q!U%o~s-mA}TfS`ZOzV^;VL@qPHRq@55H! z!dAjc7d&D(c7MZKX$|?E_wZ2mOj|*!GgnfLxAVG~E=rc^rv5Z>2}Q?dO7AbKO;;UM~xT zei-}ml~wnB)%Pfgh4d8S6jQmD?(Chg#BCAFKZ(;bY8|Kgp>27M`!4>V8S<1Ktv?== O{R=-AoxK5p^^@o%+s>i@ delta 416 zcmV;R0bl-<1ds%fdVkAK10fWK_dEset%+NA#%N-qNn=gAB*)>DnUq_&XuI<@d_A8; z7zSt1MtAW2|G990&d8`VQ(*0&g{MC7D{eZvx};qzUGzaodNV@$I6=}GWx);JXcmqZ zeNQZITph_-`F zYTdk!4#HqGnSz){iCre_yNl)wfB8z@yW*gm>X^^ti9kJQj)yU&gcp=KX{9B>eO{8; zHdS6YwkXpqkp@G1Iq3c8`-i8=78U9+p64vZ`v6joyEe_tYrUDIu;2`j-S4oO*RYw8 z+yx8Z_sy_oT7NV&KH5{K_H(W(W2TvZLE)>PeW_EaZ#tM6)g0~tTV!aTK027F1enHP_CDF1lRR# z`NE2OQ}i_mBtE|QFoc+|r8#>eOmRy@(obS|0A*!6v>7EYao7A6beE>IHEzF?=8fJ? 
KI{5 Date: Mon, 26 Aug 2024 19:29:30 +0000 Subject: [PATCH 06/13] Add a test for the error when there's an unrecognized feature selection transform --- hlink/tests/core/transforms_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index 7b6041f..7c83775 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -192,3 +192,16 @@ def test_generate_transforms_override_column_b( Row(id=2, mother_nativity=0, test_override_column=-1, mbpl_range=-1), Row(id=3, mother_nativity=6, test_override_column=-1, mbpl_range=-1), ] + + +@pytest.mark.parametrize("is_a", [True, False]) +def test_generate_transforms_error_when_unrecognized_transform( + spark: SparkSession, preprocessing: LinkTask, is_a: bool +) -> None: + feature_selections = [ + {"input_column": "age", "output_column": "age2", "transform": "not_supported"} + ] + df = spark.createDataFrame([], "id:integer, age:integer") + + with pytest.raises(ValueError, match="Invalid transform type"): + generate_transforms(spark, df, feature_selections, preprocessing, is_a, "id") From d7b6c4b896fa4f74fec19e09d754b64d93d9c2f2 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 26 Aug 2024 19:53:45 +0000 Subject: [PATCH 07/13] Add type hints to core.transforms.apply_transform() --- hlink/linking/core/transforms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hlink/linking/core/transforms.py b/hlink/linking/core/transforms.py index e7073b5..c0eac7b 100755 --- a/hlink/linking/core/transforms.py +++ b/hlink/linking/core/transforms.py @@ -26,7 +26,7 @@ ) from pyspark.sql.types import ArrayType, LongType, StringType from pyspark.ml import Pipeline -from pyspark.sql import DataFrame, SparkSession, Window +from pyspark.sql import Column, DataFrame, SparkSession, Window from pyspark.ml.feature import NGram, RegexTokenizer, CountVectorizer, MinHashLSH @@ -402,7 +402,9 @@ def 
get_transforms(name: str, is_a: bool) -> list[dict[str, Any]]: # These apply to the column mappings in the current config -def apply_transform(column_select, transform, is_a): +def apply_transform( + column_select: Column, transform: dict[str, Any], is_a: bool +) -> Column: """Given a dataframe select string return a new string having applied the given transform. column_select: A PySpark column type transform: The transform info from the current config From ccb665215b92a95d468c16ac8fb1d696e7831cc4 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 26 Aug 2024 20:02:48 +0000 Subject: [PATCH 08/13] Update the documentation for core.transforms.apply_transform() --- hlink/linking/core/transforms.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/hlink/linking/core/transforms.py b/hlink/linking/core/transforms.py index c0eac7b..0c771f0 100755 --- a/hlink/linking/core/transforms.py +++ b/hlink/linking/core/transforms.py @@ -405,12 +405,15 @@ def get_transforms(name: str, is_a: bool) -> list[dict[str, Any]]: def apply_transform( column_select: Column, transform: dict[str, Any], is_a: bool ) -> Column: - """Given a dataframe select string return a new string having applied the given transform. - column_select: A PySpark column type - transform: The transform info from the current config - is_a: Is running on dataset 'a' or 'b ? - - See the json_schema config file in config_schemas/config.json for definitions on each transform type. + """Return a new column that is the result of applying the given transform + to the given input column (column_select). The is_a parameter controls the + behavior of the transforms like "add_to_a" which act differently on + datasets A and B. 
+ + Args: + column_select: a PySpark Column + transform: the transform to apply + is_a: whether this is dataset A (True) or B (False) """ transform_type = transform["type"] if transform_type == "add_to_a": From 8e3ddb82efeb8ceb0305d7102ee02c7d4f0d1c82 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 26 Aug 2024 20:31:49 +0000 Subject: [PATCH 09/13] Add a test for the when_value column mapping transform --- hlink/tests/core/transforms_test.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index 7c83775..e56a662 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -1,7 +1,8 @@ from pyspark.sql import Row, SparkSession +from pyspark.sql.functions import col import pytest -from hlink.linking.core.transforms import generate_transforms +from hlink.linking.core.transforms import apply_transform, generate_transforms from hlink.linking.link_task import LinkTask @@ -205,3 +206,22 @@ def test_generate_transforms_error_when_unrecognized_transform( with pytest.raises(ValueError, match="Invalid transform type"): generate_transforms(spark, df, feature_selections, preprocessing, is_a, "id") + + +@pytest.mark.parametrize("is_a", [True, False]) +def test_apply_transform_when_value(spark: SparkSession, is_a: bool) -> None: + transform = {"type": "when_value", "value": 6, "if_value": 0, "else_value": 1} + column_select = col("marst") + output_col = apply_transform(column_select, transform, is_a) + + df = spark.createDataFrame([[3], [6], [2], [6], [1]], "marst:integer") + transformed = df.select("marst", output_col.alias("output")) + result = transformed.collect() + + assert result == [ + Row(marst=3, output=1), + Row(marst=6, output=0), + Row(marst=2, output=1), + Row(marst=6, output=0), + Row(marst=1, output=1), + ] From 1b768a0b672a159a0103c8def5c6278fea380392 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 26 Aug 2024 20:49:03 
+0000 Subject: [PATCH 10/13] Add a test to check the error when you pass an unrecognized transform type to apply_transform() --- hlink/tests/core/transforms_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index e56a662..54220e1 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -210,6 +210,13 @@ def test_generate_transforms_error_when_unrecognized_transform( @pytest.mark.parametrize("is_a", [True, False]) def test_apply_transform_when_value(spark: SparkSession, is_a: bool) -> None: + """The when_value transform supports simple if-then-otherwise logic on + columns: + + if the column is equal to "when_value" + then return "if_value" + otherwise return "else_value" + """ transform = {"type": "when_value", "value": 6, "if_value": 0, "else_value": 1} column_select = col("marst") output_col = apply_transform(column_select, transform, is_a) @@ -225,3 +232,11 @@ def test_apply_transform_when_value(spark: SparkSession, is_a: bool) -> None: Row(marst=6, output=0), Row(marst=1, output=1), ] + + +@pytest.mark.parametrize("is_a", [True, False]) +def test_apply_transform_error_when_unrecognized_transform_type(is_a: bool) -> None: + column_select = col("test") + transform = {"type": "not_supported"} + with pytest.raises(ValueError, match="Invalid transform type"): + apply_transform(column_select, transform, is_a) From 903a2009ec5775b7a9e584b232e8db39db7e34a9 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 27 Aug 2024 14:35:19 +0000 Subject: [PATCH 11/13] Add a test for the SparkFactory class --- hlink/tests/spark_factory_test.py | 35 +++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 hlink/tests/spark_factory_test.py diff --git a/hlink/tests/spark_factory_test.py b/hlink/tests/spark_factory_test.py new file mode 100644 index 0000000..895131c --- /dev/null +++ b/hlink/tests/spark_factory_test.py @@ -0,0 
+1,35 @@ +from pathlib import Path + +from pyspark.sql import Row + +from hlink.spark.factory import SparkFactory + + +def test_spark_factory_can_create_spark_session(tmp_path: Path) -> None: + derby_dir = tmp_path / "derby" + spark_tmp_dir = tmp_path / "tmp" + warehouse_dir = tmp_path / "warehouse" + + factory = ( + SparkFactory() + .set_local() + .set_derby_dir(derby_dir) + .set_warehouse_dir(warehouse_dir) + .set_tmp_dir(spark_tmp_dir) + .set_num_cores(1) + .set_executor_cores(1) + .set_executor_memory("1G") + ) + spark = factory.create() + + # Make sure we can do some basic operations with the SparkSession we get back + df = spark.createDataFrame( + [[0, "a"], [1, "b"], [2, "c"]], "id:integer,letter:string" + ) + expr = (df.letter == "b").alias("equals_b") + result = df.select(expr).collect() + assert result == [ + Row(equals_b=False), + Row(equals_b=True), + Row(equals_b=False), + ] From 41588413a56370c743a1824a3a4a26b44116566b Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 27 Aug 2024 15:29:46 +0000 Subject: [PATCH 12/13] Add a test for the remove_punctuation column mapping transform --- hlink/tests/core/transforms_test.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index 54220e1..e71650e 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -234,6 +234,32 @@ def test_apply_transform_when_value(spark: SparkSession, is_a: bool) -> None: ] +@pytest.mark.parametrize("is_a", [True, False]) +def test_apply_transform_remove_punctuation(spark: SparkSession, is_a: bool) -> None: + transform = {"type": "remove_punctuation"} + input_col = col("input") + output_col = apply_transform(input_col, transform, is_a) + + df = spark.createDataFrame( + [ + # All of these characters are considered punctuation and should be removed + ["?-\\/\"':,.[]{}"], + ["abcdefghijklmnop"], + # The address of the Minnesota state capitol + ["75 
Rev. Dr. Martin Luther King, Jr. Blvd. Saint Paul, MN 55155"], + ], + "input:string", + ) + transformed = df.select(output_col.alias("output")) + result = transformed.collect() + + assert result == [ + Row(output=""), + Row(output="abcdefghijklmnop"), + Row(output="75 Rev Dr Martin Luther King Jr Blvd Saint Paul MN 55155"), + ] + + @pytest.mark.parametrize("is_a", [True, False]) def test_apply_transform_error_when_unrecognized_transform_type(is_a: bool) -> None: column_select = col("test") From e4c9941141153f31aded705d95ff7ceb770d150b Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 27 Aug 2024 18:20:54 +0000 Subject: [PATCH 13/13] Add a test for the substring column mapping transform This confirms that the transform handles the case where the values list doesn't have length 2 by raising an error. This prompted me to make issue #146, which I think should really simplify this transform. --- hlink/tests/core/transforms_test.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index e71650e..f072333 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -260,6 +260,35 @@ def test_apply_transform_remove_punctuation(spark: SparkSession, is_a: bool) -> ] +@pytest.mark.parametrize("values", [[1], [1, 2, 3]]) +@pytest.mark.parametrize("is_a", [True, False]) +def test_apply_transform_substring_error_when_not_exactly_2_values( + values: list[int], is_a: bool +) -> None: + """ + The substring transform takes a list of exactly two values, which are the + start position of the substring and its length. If the list has the wrong + number of values, then apply_transform() raises an error. + + TODO: It would be simpler to have two separate attributes for the substring + start and length, like this: + + { + "type": "substring", + "start_index": 0, + "length": 4, + } + + See issue #146. 
Making these changes would eliminate the need for this + test. + """ + input_col = col("input") + transform = {"type": "substring", "values": values} + + with pytest.raises(ValueError, match="Length of substr transform should be 2"): + apply_transform(input_col, transform, is_a) + + @pytest.mark.parametrize("is_a", [True, False]) def test_apply_transform_error_when_unrecognized_transform_type(is_a: bool) -> None: column_select = col("test")