From a0ce682c557c0889df7619f8ba4418265cf0f24a Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 21 Nov 2023 19:31:16 +0000
Subject: [PATCH 1/6] [#118] Add some unit tests for core.column_mapping

I'm treating these as part of the documentation of the features available for
column_mappings.
---
 hlink/tests/core/column_mapping_test.py | 132 ++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 hlink/tests/core/column_mapping_test.py

diff --git a/hlink/tests/core/column_mapping_test.py b/hlink/tests/core/column_mapping_test.py
new file mode 100644
index 0000000..86fb80c
--- /dev/null
+++ b/hlink/tests/core/column_mapping_test.py
@@ -0,0 +1,132 @@
+from numpy import select
+import pytest
+import pandas as pd
+
+from hlink.linking.core.column_mapping import select_column_mapping
+
+
+TEST_DF_1 = pd.DataFrame(
+    {
+        "id": [0, 1, 2, 3, 4, 5],
+        "age": [19, 37, 27, 101, 59, 22],
+        "occupation": [
+            "farmer",
+            "computer scientist",
+            "waitress",
+            "retired",
+            "lawyer",
+            "doctor",
+        ],
+    }
+)
+
+TEST_DF_2 = pd.DataFrame(
+    {
+        "identifier": [1000, 1002, 1004, 1006],
+        "age": [73, 55, 10, 18],
+        "occ": ["retired", "childcare", None, None],
+    }
+)
+
+
+@pytest.mark.parametrize("is_a", [True, False])
+def test_select_column_mapping_basic(spark, is_a):
+    """
+    A single column mapping with just column_name specified selects that column
+    from the dataframe.
+    """
+    column_mapping = {
+        "column_name": "age",
+    }
+    df = spark.createDataFrame(TEST_DF_1)
+
+    df_selected, column_selects = select_column_mapping(column_mapping, df, is_a, [])
+
+    assert column_selects == ["age"]
+    assert set(df_selected.columns) == {"age", "id", "occupation"}
+    assert df_selected.count() == 6
+
+
+@pytest.mark.parametrize("is_a", [True, False])
+@pytest.mark.parametrize("alias", ["age", "myage", "num_years"])
+def test_select_column_mapping_alias(spark, is_a, alias):
+    """
+    alias sets the output name for the column mapping. It can be the same as the
+    input column name or different.
+    """
+    column_mapping = {
+        "column_name": "age",
+        "alias": alias,
+    }
+    df = spark.createDataFrame(TEST_DF_1)
+
+    df_selected, column_selects = select_column_mapping(column_mapping, df, is_a, [])
+
+    assert column_selects == [alias]
+    # The alias is an additional column that is later selected out with column_selects
+    assert set(df_selected.columns) == {"age", alias, "occupation", "id"}
+    assert df_selected.count() == 6
+    assert (
+        df_selected.filter(df_selected.age == df_selected[alias]).count()
+        == df_selected.count()
+    )
+
+
+def test_select_column_mapping_set_value_column_a(spark):
+    """
+    set_value_column_a overrides the input column for dataset A and sets all of its
+    values to the given value. Dataset B is unaffected.
+    """
+    column_mapping = {
+        "column_name": "age",
+        "set_value_column_a": 44,
+    }
+    df_a = spark.createDataFrame(TEST_DF_1)
+    df_b = spark.createDataFrame(TEST_DF_2)
+
+    df_selected_a, column_selects_a = select_column_mapping(
+        column_mapping, df_a, is_a=True, column_selects=[]
+    )
+    df_selected_b, column_selects_b = select_column_mapping(
+        column_mapping, df_b, is_a=False, column_selects=[]
+    )
+
+    assert column_selects_a == column_selects_b == ["age"]
+
+    assert (
+        df_selected_a.filter(df_selected_a.age == 44).count() == df_selected_a.count()
+    )
+    assert df_selected_b.filter(df_selected_b.age == 44).count() == 0
+
+
+def test_select_column_mapping_set_value_column_b(spark):
+    """
+    set_value_column_b overrides the input column for dataset B and sets all of its
+    values to the given value. Dataset A is unaffected.
+    """
+    column_mapping = {
+        "column_name": "age",
+        "set_value_column_b": 44,
+    }
+    df_a = spark.createDataFrame(TEST_DF_1)
+    df_b = spark.createDataFrame(TEST_DF_2)
+
+    df_selected_a, column_selects_a = select_column_mapping(
+        column_mapping, df_a, is_a=True, column_selects=[]
+    )
+    df_selected_b, column_selects_b = select_column_mapping(
+        column_mapping, df_b, is_a=False, column_selects=[]
+    )
+
+    assert column_selects_a == column_selects_b == ["age"]
+
+    assert (
+        df_selected_b.filter(df_selected_b.age == 44).count() == df_selected_b.count()
+    )
+    assert df_selected_a.filter(df_selected_a.age == 44).count() == 0
+
+
+def test_select_column_mapping_error_missing_column_name(spark):
+    df = spark.createDataFrame(TEST_DF_1)
+    with pytest.raises(KeyError):
+        select_column_mapping({}, df, is_a=False, column_selects=[])

From 9371877cac96ff97c4280c3bdfbcf7a693d9fcca Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 21 Nov 2023 20:22:03 +0000
Subject: [PATCH 2/6] [#118] Add type hints to select_column_mapping, fix a
 flake8 error

---
 hlink/linking/core/column_mapping.py    | 9 ++++++++-
 hlink/tests/core/column_mapping_test.py | 1 -
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/hlink/linking/core/column_mapping.py b/hlink/linking/core/column_mapping.py
index 98726d0..e822dbc 100755
--- a/hlink/linking/core/column_mapping.py
+++ b/hlink/linking/core/column_mapping.py
@@ -2,12 +2,19 @@
 # For copyright and licensing information, see the NOTICE and LICENSE files
 # in this project's top-level directory, and also on-line at:
 #   https://github.com/ipums/hlink
+from typing import Any
 
 from pyspark.sql.functions import col, lit
+from pyspark.sql import DataFrame
 import hlink.linking.core.transforms as transforms_core
 
 
-def select_column_mapping(column_mapping, df_selected, is_a, column_selects):
+def select_column_mapping(
+    column_mapping: dict[str, Any],
+    df_selected: DataFrame,
+    is_a: bool,
+    column_selects: list[str],
+) -> tuple[DataFrame, list[str]]:
     name = column_mapping["column_name"]
     if "override_column_a" in column_mapping and is_a:
         override_name = column_mapping["override_column_a"]
diff --git a/hlink/tests/core/column_mapping_test.py b/hlink/tests/core/column_mapping_test.py
index 86fb80c..2d0f30a 100644
--- a/hlink/tests/core/column_mapping_test.py
+++ b/hlink/tests/core/column_mapping_test.py
@@ -1,4 +1,3 @@
-from numpy import select
 import pytest
 import pandas as pd
 

From f9ff38c3c4f1090befc74b948a73ce0e53d19189 Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Wed, 22 Nov 2023 17:55:15 +0000
Subject: [PATCH 3/6] [#118] Continue adding some unit tests for
 core.column_mapping.select_column_mapping()

---
 hlink/tests/core/column_mapping_test.py | 87 +++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 7 deletions(-)

diff --git a/hlink/tests/core/column_mapping_test.py b/hlink/tests/core/column_mapping_test.py
index 2d0f30a..4db1639 100644
--- a/hlink/tests/core/column_mapping_test.py
+++ b/hlink/tests/core/column_mapping_test.py
@@ -9,12 +9,12 @@
         "id": [0, 1, 2, 3, 4, 5],
         "age": [19, 37, 27, 101, 59, 22],
         "occupation": [
-            "farmer",
-            "computer scientist",
-            "waitress",
-            "retired",
-            "lawyer",
-            "doctor",
+            "FARMER",
+            "COMPUTER SCIENTIST",
+            "WAITRESS",
+            "RETIRED",
+            "LAWYER",
+            "DOCTOR",
         ],
     }
 )
@@ -23,7 +23,7 @@
     {
         "identifier": [1000, 1002, 1004, 1006],
         "age": [73, 55, 10, 18],
-        "occ": ["retired", "childcare", None, None],
+        "occ": ["RETIRED", "CHILDCARE", None, None],
     }
 )
 
@@ -125,7 +125,80 @@ def test_select_column_mapping_set_value_column_b(spark):
     assert df_selected_a.filter(df_selected_a.age == 44).count() == 0
 
 
+def test_select_column_mapping_transforms_add_to_a(spark):
+    """
+    column_mappings support transforms, which modify the values of the column as
+    it is read in. These often apply to both dataset A and dataset B, but some
+    apply only to a single dataset.
+
+    add_to_a applies only to dataset A.
+    """
+    column_mapping = {
+        "column_name": "age",
+        "alias": "age_at_dataset_b",
+        "transforms": [{"type": "add_to_a", "value": 11}],
+    }
+    df_a = spark.createDataFrame(TEST_DF_1)
+    df_b = spark.createDataFrame(TEST_DF_2)
+
+    df_selected_a, column_selects_a = select_column_mapping(
+        column_mapping,
+        df_a,
+        is_a=True,
+        column_selects=[],
+    )
+    df_selected_b, column_selects_b = select_column_mapping(
+        column_mapping,
+        df_b,
+        is_a=False,
+        column_selects=[],
+    )
+
+    assert column_selects_a == column_selects_b == ["age_at_dataset_b"]
+    ages_a = df_selected_a.select("age_at_dataset_b").toPandas()
+    assert ages_a["age_at_dataset_b"].to_list() == [30, 48, 38, 112, 70, 33]
+
+    ages_b = df_selected_b.select("age_at_dataset_b").toPandas()
+    assert ages_b["age_at_dataset_b"].to_list() == [73, 55, 10, 18]
+
+
+def test_select_column_mapping_column_selects_preserved(spark):
+    """
+    select_column_mapping() appends column names to the end of column_selects and
+    then returns the new, longer list. You can even map the same column multiple
+    times with different aliases.
+    """
+    column_mapping_1 = {
+        "column_name": "occupation",
+        "alias": "occ_with_underscores",
+        "transforms": [{"type": "concat_two_cols", "column_to_append": "age"}],
+    }
+    column_mapping_2 = {
+        "column_name": "occupation",
+    }
+    df_a = spark.createDataFrame(TEST_DF_1)
+
+    df_selected, column_selects = select_column_mapping(
+        column_mapping_1,
+        df_a,
+        is_a=True,
+        column_selects=[],
+    )
+
+    assert column_selects == ["occ_with_underscores"]
+
+    df_selected, column_selects = select_column_mapping(
+        column_mapping_2, df_a, is_a=True, column_selects=column_selects
+    )
+
+    assert set(column_selects) == {"occ_with_underscores", "occupation"}
+
+
 def test_select_column_mapping_error_missing_column_name(spark):
+    """
+    Without a column_name key in the column_mapping, the function raises
+    a KeyError.
+    """
     df = spark.createDataFrame(TEST_DF_1)
     with pytest.raises(KeyError):
         select_column_mapping({}, df, is_a=False, column_selects=[])

From 87090a257c6874deaeafde62a53d1520848d6025 Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Mon, 27 Nov 2023 17:05:08 +0000
Subject: [PATCH 4/6] [#118] Add unit tests for override_column_<a/b> for
 column_mappings

---
 hlink/tests/core/column_mapping_test.py | 104 ++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/hlink/tests/core/column_mapping_test.py b/hlink/tests/core/column_mapping_test.py
index 4db1639..eb694e6 100644
--- a/hlink/tests/core/column_mapping_test.py
+++ b/hlink/tests/core/column_mapping_test.py
@@ -194,6 +194,110 @@ def test_select_column_mapping_column_selects_preserved(spark):
     assert set(column_selects) == {"occ_with_underscores", "occupation"}
 
 
+def test_select_column_mapping_override_column_a(spark):
+    """
+    override_column_a lets the user specify a different column name for
+    dataset A. override_transforms are applied only to dataset A in
+    this case.
+    """
+    column_mapping = {
+        "column_name": "occ",
+        "override_column_a": "occupation",
+        "override_transforms": [{"type": "lowercase_strip"}],
+    }
+    df_a = spark.createDataFrame(TEST_DF_1)
+    df_b = spark.createDataFrame(TEST_DF_2)
+
+    df_selected_a, column_selects_a = select_column_mapping(
+        column_mapping,
+        df_a,
+        is_a=True,
+        column_selects=[],
+    )
+
+    df_selected_b, column_selects_b = select_column_mapping(
+        column_mapping,
+        df_b,
+        is_a=False,
+        column_selects=[],
+    )
+
+    assert column_selects_a == column_selects_b == ["occ"]
+
+    occ_a = df_selected_a.select("occ").toPandas()
+    assert occ_a["occ"].to_list() == [
+        "farmer",
+        "computer scientist",
+        "waitress",
+        "retired",
+        "lawyer",
+        "doctor",
+    ]
+
+    occ_b = df_selected_b.select("occ").toPandas()
+    assert occ_b["occ"].to_list() == [
+        "RETIRED",
+        "CHILDCARE",
+        None,
+        None,
+    ]
+
+
+def test_select_column_mapping_override_column_b(spark):
+    """
+    override_column_b lets the user specify a different column name for
+    dataset B. override_transforms are applied only to dataset B in
+    this case, and transforms are applied only to dataset A.
+    """
+    column_mapping = {
+        "column_name": "occupation",
+        "override_column_b": "occ",
+        "override_transforms": [
+            {"type": "concat_two_cols", "column_to_append": "identifier"}
+        ],
+        "transforms": [
+            {"type": "lowercase_strip"},
+            {"type": "concat_two_cols", "column_to_append": "id"},
+        ],
+    }
+    df_a = spark.createDataFrame(TEST_DF_1)
+    df_b = spark.createDataFrame(TEST_DF_2)
+
+    df_selected_a, column_selects_a = select_column_mapping(
+        column_mapping,
+        df_a,
+        is_a=True,
+        column_selects=[],
+    )
+
+    df_selected_b, column_selects_b = select_column_mapping(
+        column_mapping,
+        df_b,
+        is_a=False,
+        column_selects=[],
+    )
+
+    assert column_selects_a == column_selects_b == ["occupation"]
+
+    occ_a = df_selected_a.select("occupation").toPandas()
+    assert occ_a["occupation"].to_list() == [
+        "farmer0",
+        "computer scientist1",
+        "waitress2",
+        "retired3",
+        "lawyer4",
+        "doctor5",
+    ]
+
+    occ_b = df_selected_b.select("occupation").toPandas()
+    assert occ_b["occupation"].to_list() == [
+        "RETIRED1000",
+        "CHILDCARE1002",
+        None,
+        None,
+    ]
+
+
 def test_select_column_mapping_error_missing_column_name(spark):
     """
     Without a column_name key in the column_mapping, the function raises

From 331212917ac543910b998f025d22508c059134cd Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Mon, 27 Nov 2023 19:42:38 +0000
Subject: [PATCH 5/6] [#118] Document column overrides for column mappings

I've reworked the column_mapping_transforms page to be a more general
column_mappings page that has a section for transforms. I've added some missing
attributes to the overview of column mappings in config.md and some additional
documentation about the advanced features to column_mappings.md.
---
 ...ransforms.md.txt => column_mapping.md.txt} |   0
 .../_sources/column_mappings.md.txt           | 133 ++++--
 docs/_sources/config.md.txt                   |  50 ++-
 docs/_sources/index.rst.txt                   |   2 +-
 ...ng_transforms.html => column_mapping.html} |   2 +-
 docs/column_mappings.html                     | 425 ++++++++++++++++++
 docs/comparison_types.html                    |   6 +-
 docs/config.html                              |  56 ++-
 docs/feature_selection_transforms.html        |   2 +-
 docs/genindex.html                            |   2 +-
 docs/index.html                               |  28 +-
 docs/installation.html                        |   2 +-
 docs/introduction.html                        |   2 +-
 docs/link_tasks.html                          |   2 +-
 docs/models.html                              |   2 +-
 docs/objects.inv                              | Bin 527 -> 528 bytes
 docs/pipeline_features.html                   |   2 +-
 docs/running_the_program.html                 |   2 +-
 docs/search.html                              |   2 +-
 docs/searchindex.js                           |   2 +-
 docs/substitutions.html                       |   2 +-
 docs/use_examples.html                        |   2 +-
 sphinx-docs/column_mappings.md                | 362 +++++++++++++++
 sphinx-docs/config.md                         |  50 ++-
 sphinx-docs/index.rst                         |   2 +-
 25 files changed, 1004 insertions(+), 136 deletions(-)
 rename docs/_sources/{column_mapping_transforms.md.txt => column_mapping.md.txt} (100%)
 rename sphinx-docs/column_mapping_transforms.md => docs/_sources/column_mappings.md.txt (64%)
 mode change 100755 => 100644
 rename docs/{column_mapping_transforms.html => column_mapping.html} (99%)
 create mode 100644 docs/column_mappings.html
 create mode 100755 sphinx-docs/column_mappings.md

diff --git a/docs/_sources/column_mapping_transforms.md.txt b/docs/_sources/column_mapping.md.txt
similarity index 100%
rename from docs/_sources/column_mapping_transforms.md.txt
rename to docs/_sources/column_mapping.md.txt
diff --git a/sphinx-docs/column_mapping_transforms.md b/docs/_sources/column_mappings.md.txt
old mode 100755
new mode 100644
similarity index 64%
rename from sphinx-docs/column_mapping_transforms.md
rename to docs/_sources/column_mappings.md.txt
index 17b2afa..96c2e93
--- a/sphinx-docs/column_mapping_transforms.md
+++ b/docs/_sources/column_mappings.md.txt
@@ -1,36 +1,81 @@
-# Column mapping transforms
+# Column Mappings
 
-Each header below represents a column mapping transform type. Transforms are used in the context of the `column_mappings` list.
-Each transform operates on a single input column and outputs a single output column.
+## Basic Usage
 
-Some transforms are suffixed by "a" or "b". These suffixes mean that the transforms apply
-to columns from only one of the two datasets to be linked (dataset A or dataset B).
-Most transforms operate on both dataset A and dataset B independently.
+Each column mapping reads a column from the input datasets into hlink. It has a
+`column_name` attribute which specifies the name of the input column to read in
+from both datasets. Optionally, it may have an `alias` attribute which gives a
+new name to use for the column in hlink.
 
-More than one transform can be applied to a column. Transforms apply in the order that
-they're listed in the configuration file, so the output of one transform may be the input of another.
-Input and output column types are listed in the format "Maps input column type → output column type".
-The letters T and U represent arbitrary column types.
+Column mappings support some *transforms* which make changes to the data as they
+are read in. These changes support data cleaning and harmonization. The available
+column mapping transforms are listed below in the [transforms](#transforms) section.
 
-Each column mapping applies to the column specified by the `column_name` attribute in
-the configuration file under its `[[column_mappings]]` section. The output column
-name is specified by the `alias` attribute, and the `transforms` attribute lists the transforms to apply. Along
-with `type`, which must be one of the names listed below, there may be additional attributes used by a transform.
-These vary by type, and additional information is given for each type of transform in its section
-below. Often attributes are just named `value` or `values` if there is only one attribute expected.
+## Advanced Usage
+
+By default, the input column must have the same name in both input datasets.
+With the `override_column_a` and `override_column_b` attributes, you can
+specify a different name for either dataset A or dataset B. When you do this,
+the `transforms` attribute applies only to the non-override dataset. You can also
+provide an `override_transforms` attribute which applies only to the override
+dataset.
+
+## Transforms
+
+Each section below describes a column mapping transform type. Each transform
+operates on a single input column and outputs a single output column. More than
+one transform may be applied to a column. Transforms apply in the order that
+they are listed in the `transforms` list, so the output of one transform may
+be the input of another. Input and output column types are listed in the format
+"Maps input column type → output column type". The letters T and U represent
+arbitrary column types.
+
+Each transform requires a `type` attribute, which must be one of the names
+listed below. Some transforms may use additional attributes. These vary by
+type, and additional information appears for each type of transform in its
+section below.
+
+Some transforms are suffixed by "a" or "b". These suffixes mean that the
+transforms apply to columns from only one of the two datasets to be linked
+(dataset A or dataset B). Most transforms operate on both dataset A and dataset
+B independently.
+
+For example, if you have two datasets taken 10 years apart, you may want to
+standardize the `age` variable so that it is comparable between the two
+datasets. To do this, you could create a new `age_at_dataset_b` variable by
+reading in the `age` variable from each dataset and then adding 10 to the
+variable from dataset A with the `add_to_a` transform.
 
 ```
-# An example column mappings section
 [[column_mappings]]
-# Name of the output column
-alias = "namefrst_split"
-# Name of the input column
-column_name = "namefrst_clean"
-# List of transforms to apply
-transforms = [{type = "split"}]
+alias = "age_at_dataset_b"
+column_name = "age"
+transforms = [
+    {type = "add_to_a", value = 10}
+]
 ```
 
-## add_to_a
+As another example, suppose that both datasets record each person's first name
+as a string. In dataset A the variable is called `namefrst` and is entirely
+lowercase, but in dataset B it is called `first_name` and is entirely uppercase.
+You could read these two columns into a `namefrst` column in hlink and apply
+a lowercase transform to only dataset B with the following configuration section.
+
+```
+[[column_mappings]]
+alias = "namefrst"
+column_name = "namefrst"
+# Read from column first_name in dataset B
+override_column_b = "first_name"
+# Apply these transforms only to dataset B
+override_transforms = [
+    {type = "lowercase_strip"}
+]
+```
+
+
+
+### add_to_a
 
 Add the given `value` to a column from dataset A.
 
@@ -40,7 +85,7 @@ Maps numerical → numerical.
 transforms = [{type = "add_to_a", value = 11}]
 ```
 
-## concat_to_a
+### concat_to_a
 
 Concatenate the string `value` to the end of a column in dataset A.
 
@@ -51,7 +96,7 @@ transforms = [{type = "concat_to_a", value = " "}]
 ```
 
 
-## concat_to_b
+### concat_to_b
 
 Concatenate the string `value` to the end of a column in dataset B.
 
@@ -62,7 +107,7 @@ transforms = [{type = "concat_to_b", value = " "}]
 ```
 
 
-## lowercase_strip
+### lowercase_strip
 
 Used in name cleaning. Convert alphabetical characters to lower-case and strip white
 space characters from the start and end of the strings in the column.
@@ -73,7 +118,7 @@ Maps string → string.
 transforms = [{type = "lowercase_strip"}]
 ```
 
-## rationalize_name_words
+### rationalize_name_words
 
 Used in name cleaning. Replace the characters `?`, `*`, and `-` with spaces. Since
 people's names in raw census data can contain these characters, replacing these characters
@@ -86,7 +131,7 @@ transforms = [{type = "rationalize_name_words"}]
 ```
 
 
-## remove_qmark_hyphen
+### remove_qmark_hyphen
 
 Used in name cleaning. Remove the characters `?` and `-` from strings in the column.
 
@@ -96,7 +141,7 @@ Maps string → string.
 transforms = [{type = "remove_qmark_hyphen"}]
 ```
 
-## remove_punctuation
+### remove_punctuation
 
 Remove most punctuation from strings in the column. This transform removes these characters:
 `? - \ / " ' : , . [ ] { }`.
@@ -107,7 +152,7 @@ Maps string → string.
 transforms = [{type = "remove_punctuation"}]
 ```
 
-## replace_apostrophe
+### replace_apostrophe
 
 Used in name cleaning. Replace each apostrophe `'` with a space.
 
@@ -118,7 +163,7 @@ transforms = [{type = "replace_apostrophe"}]
 
 ```
 
-## remove_alternate_names
+### remove_alternate_names
 
 Used in name cleaning. If a string in the column contains the string ` or ` ("or" surrounded by spaces),
 then remove the ` or ` and all following characters.
@@ -129,7 +174,7 @@ Maps string → string.
 transforms = [{type = "remove_alternate_names"}]
 ```
 
-## remove_suffixes
+### remove_suffixes
 
 Used in name cleaning. Given a list of suffixes, remove them from the strings in the column.
 
@@ -144,7 +189,7 @@ transforms = [
 ]
 ```
 
-## remove_stop_words
+### remove_stop_words
 
 Used in name cleaning. Remove last words from names such as street names.
 
@@ -159,7 +204,7 @@ transforms = [
 ]
 ```
 
-## remove_prefixes
+### remove_prefixes
 
 Used in name cleaning. Remove prefixes like "Ms.", "Mr.", or "Mrs." from names.
 
@@ -170,7 +215,7 @@ Maps string → string.
 transforms = [{type = "remove_prefixes", values = ["ah"]}]
 ```
 
-## condense_strip_whitespace
+### condense_strip_whitespace
 
 Used in name cleaning. Take white space that may be more than one character or contain
 non-space characters and replace it with a single space.
@@ -181,7 +226,7 @@ Maps string → string.
 transforms = [{type = "condense_strip_whitespace"}]
 ```
 
-## remove_one_letter_names
+### remove_one_letter_names
 
 Used in name cleaning. If a name is a single character, remove it and leave the white space behind.
 
@@ -191,7 +236,7 @@ Maps string → string.
 transforms = [{type = "remove_one_letter_names"}]
 ```
 
-## split
+### split
 
 Split the column value on space characters.
 
@@ -204,7 +249,7 @@ column_name = "namefrst_clean"
 transforms = [{type = "split"}]
 ```
 
-## array_index
+### array_index
 
 If the column contains an array, select the element at the given position.
 
@@ -222,7 +267,7 @@ transforms = [
 ]
 ```
 
-## mapping
+### mapping
 
 Map single or multiple values to a single output value, otherwise known as a "recoding."
 
@@ -243,7 +288,7 @@ transforms = [
 ]
 ```
 
-## substring
+### substring
 
 Replace a column with a substring of the data in the column.
 
@@ -255,7 +300,7 @@ transforms = [
 ]
  ```
 
-## divide_by_int
+### divide_by_int
 
 Divide data in a column by an integer value. It may leave a non-integer result.
 
@@ -277,7 +322,7 @@ transforms = [
 ```
 
 
-## when_value
+### when_value
 
 Apply conditional logic to replacement of values in a column. Works like the SQL `if()` or `case()` expressions in the SQL `select` clause.
 When the value of a column is `value` replace it with `if_value`. Otherwise replace it with `else_value`.
@@ -294,7 +339,7 @@ transforms = [
 ```
 
 
-## get_floor
+### get_floor
 
 Round down to the nearest whole number.
 
diff --git a/docs/_sources/config.md.txt b/docs/_sources/config.md.txt
index 8747c52..b02dc2e 100644
--- a/docs/_sources/config.md.txt
+++ b/docs/_sources/config.md.txt
@@ -1,20 +1,20 @@
 # Configuration
 1. [Basic Example Config File](#basic-config-file)
 2. [Advanced Example Config File](#advanced-config-file)
-3. [Top level configs](#top-level-configs)
-4. [Data sources](#data-sources)
+3. [Top-Level Configs](#top-level-configs)
+4. [Data Sources](#data-sources)
 5. [Filter](#filter)
-6. [Column mappings](#column-mappings)
-7. [Substitution columns](#substitution-columns)
-8. [Feature selections](#feature-selections)
-9. [Potential matches universe](#potential-matches-universe)
+6. [Column Mappings](#column-mappings)
+7. [Substitution Columns](#substitution-columns)
+8. [Feature Selections](#feature-selections)
+9. [Potential Matches Universe](#potential-matches-universe)
 10. [Blocking](#blocking)
 11. [Comparisons](#comparisons)
-12. [Household comparisons](#household-comparisons)
-13. [Comparison features](#comparison-features)
-14. [Pipeline-generated features](#pipeline-generated-features)
-15. [Training and models](#training-and-models)
-16. [Household training and models](#household-training-and-models)
+12. [Household Comparisons](#household-comparisons)
+13. [Comparison Features](#comparison-features)
+14. [Pipeline-Generated Features](#pipeline-generated-features)
+15. [Training and Models](#training-and-models)
+16. [Household Training and Models](#household-training-and-models)
 
 ## Basic Config File
 
@@ -438,16 +438,34 @@ datasource = "b"
 ```
 
 
-## [Column Mappings](column_mapping_transforms)
+## [Column Mappings](column_mappings)
 
 * Header name: `column_mappings`
-* Description: Base column mappings and transformations to extract from your input datasets.
+* Description: Base column mappings and transformations to extract from your
+  input datasets. Each column mapping requires a `column_name` which tells it
+  which input column to read from. Optionally, you may provide an `alias` for
+  the column and `transforms` to modify it as it is read in. There are some additional
+  attributes listed below that are meant for advanced usage. These are described
+  in more detail on the [column mappings](column_mappings) page.
 * Required: True
 * Type: List
 * Attributes:
-  * `alias` -- Type: `string`. Optional; if not specified the new column name defaults to `column_name`. New name of column.
-  * `column_name` -- Type: `string`. Name of column in input data. Used as the name of the output column if `alias` is not specified.
-  * `transforms` -- Type: `List`. Optional. A list of transforms to apply, in order, to the input data. See the [column mapping transforms](column_mapping_transforms) section for more information.
+  * `column_name` -- Type: `string`. The name of the column in the input data.
+  * `alias` -- Type: `string`. Optional. The new name of the column to use
+    in hlink. By default, this is the same as `column_name`.
+  * `transforms` -- Type: `List`. Optional. A list of transforms to apply, in
+    order, to the input data. See the [column mapping transforms](column_mappings.html#transforms)
+    section for more information.
+  * `set_value_column_a` -- Type: `Any`. Optional. Set all records for dataset
+    A to the given literal value.
+  * `set_value_column_b` -- Type: `Any`. Optional. Set all records for dataset
+    B to the given literal value.
+  * `override_column_a` -- Type: `string`. Optional. Read from this column in
+    dataset A instead of the column specified with `column_name`.
+  * `override_column_b` -- Type: `string`. Optional. Read from this column in
+    dataset B instead of the column specified with `column_name`.
+  * `override_transforms` -- Type: `List`. Optional. Transforms to apply to the
+    override column specified with `override_column_a` or `override_column_b`.
 
 ```
 [[column_mappings]]
diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt
index 1f903fd..2c9a76e 100644
--- a/docs/_sources/index.rst.txt
+++ b/docs/_sources/index.rst.txt
@@ -23,7 +23,7 @@ Configuration API
    :maxdepth: 2
    :caption: Configuration API
 
-   Column Mapping <column_mapping_transforms.md>
+   Column Mapping <column_mappings.md>
    Comparison Types <comparison_types.md>
    Feature Selection <feature_selection_transforms.md>
    Pipeline Features <pipeline_features.md>
diff --git a/docs/column_mapping_transforms.html b/docs/column_mapping.html
similarity index 99%
rename from docs/column_mapping_transforms.html
rename to docs/column_mapping.html
index 6e838f0..339d67e 100644
--- a/docs/column_mapping_transforms.html
+++ b/docs/column_mapping.html
@@ -391,7 +391,7 @@ <h3 id="searchlabel">Quick search</h3>
       &amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
       
       |
-      <a href="_sources/column_mapping_transforms.md.txt"
+      <a href="_sources/column_mapping.md.txt"
           rel="nofollow">Page source</a>
     </div>
 
diff --git a/docs/column_mappings.html b/docs/column_mappings.html
new file mode 100644
index 0000000..e63c746
--- /dev/null
+++ b/docs/column_mappings.html
@@ -0,0 +1,425 @@
+<!DOCTYPE html>
+
+<html lang="en" data-content_root="./">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
+
+    <title>Column Mappings &#8212; hlink 3.5.3 documentation</title>
+    <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=4f649999" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=039e1c02" />
+    <script src="_static/documentation_options.js?v=f6cea0e3"></script>
+    <script src="_static/doctools.js?v=888ff710"></script>
+    <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
+    <link rel="index" title="Index" href="genindex.html" />
+    <link rel="search" title="Search" href="search.html" />
+    <link rel="next" title="Comparison types, transform add-ons, aggregate features, and household aggregate features" href="comparison_types.html" />
+    <link rel="prev" title="Configuration" href="config.html" />
+   
+  <link rel="stylesheet" href="_static/custom.css" type="text/css" />
+  
+  
+  <meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
+
+  </head><body>
+  
+
+    <div class="document">
+      <div class="documentwrapper">
+        <div class="bodywrapper">
+          
+
+          <div class="body" role="main">
+            
+  <section id="column-mappings">
+<h1>Column Mappings<a class="headerlink" href="#column-mappings" title="Link to this heading">¶</a></h1>
+<section id="basic-usage">
+<h2>Basic Usage<a class="headerlink" href="#basic-usage" title="Link to this heading">¶</a></h2>
+<p>Each column mapping reads a column from the input datasets into hlink. It has a
+<code class="docutils literal notranslate"><span class="pre">column_name</span></code> attribute which specifies the name of the input column to read in
+from both datasets. Optionally, it may have an <code class="docutils literal notranslate"><span class="pre">alias</span></code> attribute which gives a
+new name to use for the column in hlink.</p>
+<p>Column mappings support some <em>transforms</em> which make changes to the data as they
+are read in. These changes support data cleaning and harmonization. The available
+column mapping transforms are listed below in the <a class="reference external" href="#transforms">transforms</a> section.</p>
+</section>
+<section id="advanced-usage">
+<h2>Advanced Usage<a class="headerlink" href="#advanced-usage" title="Link to this heading">¶</a></h2>
+<p>By default, the input column must have the same name in both input datasets.
+With the <code class="docutils literal notranslate"><span class="pre">override_column_a</span></code> and <code class="docutils literal notranslate"><span class="pre">override_column_b</span></code> attributes, you can
+specify a different name for either dataset A or dataset B. When you do this,
+the <code class="docutils literal notranslate"><span class="pre">transforms</span></code> attribute applies only to the non-override dataset. You can also
+provide an <code class="docutils literal notranslate"><span class="pre">override_transforms</span></code> attribute which applies only to the override
+dataset.</p>
+</section>
+<section id="transforms">
+<h2>Transforms<a class="headerlink" href="#transforms" title="Link to this heading">¶</a></h2>
+<p>Each section below describes a column mapping transform type. Each transform
+operates on a single input column and outputs a single output column. More than
+one transform may be applied to a column. Transforms apply in the order that
+they are listed in the <code class="docutils literal notranslate"><span class="pre">transforms</span></code> list, so the output of one transform may
+be the input of another. Input and output column types are listed in the format
+“Maps input column type → output column type”. The letters T and U represent
+arbitrary column types.</p>
+<p>Each transform requires a <code class="docutils literal notranslate"><span class="pre">type</span></code> attribute, which must be one of the names
+listed below. Some transforms may use additional attributes. These vary by
+type, and additional information appears for each type of transform in its
+section below.</p>
+<p>Some transforms are suffixed by “a” or “b”. These suffixes mean that the
+transforms apply to columns from only one of the two datasets to be linked
+(dataset A or dataset B). Most transforms operate on both dataset A and dataset
+B independently.</p>
+<p>For example, if you have two datasets taken 10 years apart, you may want to
+standardize the <code class="docutils literal notranslate"><span class="pre">age</span></code> variable so that it is comparable between the two
+datasets. To do this, you could create a new <code class="docutils literal notranslate"><span class="pre">age_at_dataset_b</span></code> variable by
+reading in the <code class="docutils literal notranslate"><span class="pre">age</span></code> variable from each dataset and then adding 10 to the
+variable from dataset A with the <code class="docutils literal notranslate"><span class="pre">add_to_a</span></code> transform.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;age_at_dataset_b&quot;</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;age&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;add_to_a&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">10</span><span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+<p>As another example, suppose that both datasets record each person’s first name
+as a string. In dataset A the variable is called <code class="docutils literal notranslate"><span class="pre">namefrst</span></code> and is entirely
+lowercase, but in dataset B it is called <code class="docutils literal notranslate"><span class="pre">first_name</span></code> and is entirely uppercase.
+You could read these two columns into a <code class="docutils literal notranslate"><span class="pre">namefrst</span></code> column in hlink and apply
+a lowercase transform to only dataset B with the following configuration section.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;namefrst&quot;</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;namefrst&quot;</span>
+<span class="c1"># Read from column first_name in dataset B</span>
+<span class="n">override_column_b</span> <span class="o">=</span> <span class="s2">&quot;first_name&quot;</span>
+<span class="c1"># Apply these transforms only to dataset B</span>
+<span class="n">override_transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;lowercase_strip&quot;</span><span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+<section id="add-to-a">
+<h3>add_to_a<a class="headerlink" href="#add-to-a" title="Link to this heading">¶</a></h3>
+<p>Add the given <code class="docutils literal notranslate"><span class="pre">value</span></code> to a column from dataset A.</p>
+<p>Maps numerical → numerical.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;add_to_a&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">11</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="concat-to-a">
+<h3>concat_to_a<a class="headerlink" href="#concat-to-a" title="Link to this heading">¶</a></h3>
+<p>Concatenate the string <code class="docutils literal notranslate"><span class="pre">value</span></code> to the end of a column in dataset A.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;concat_to_a&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="s2">&quot; &quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="concat-to-b">
+<h3>concat_to_b<a class="headerlink" href="#concat-to-b" title="Link to this heading">¶</a></h3>
+<p>Concatenate the string <code class="docutils literal notranslate"><span class="pre">value</span></code> to the end of a column in dataset B.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;concat_to_b&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="s2">&quot; &quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="lowercase-strip">
+<h3>lowercase_strip<a class="headerlink" href="#lowercase-strip" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Convert alphabetical characters to lower-case and strip white
+space characters from the start and end of the strings in the column.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;lowercase_strip&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="rationalize-name-words">
+<h3>rationalize_name_words<a class="headerlink" href="#rationalize-name-words" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Replace the characters <code class="docutils literal notranslate"><span class="pre">?</span></code>, <code class="docutils literal notranslate"><span class="pre">*</span></code>, and <code class="docutils literal notranslate"><span class="pre">-</span></code> with spaces. Since
+people’s names in raw census data can contain these characters, replacing these characters
+can lead to better matching.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;rationalize_name_words&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-qmark-hyphen">
+<h3>remove_qmark_hyphen<a class="headerlink" href="#remove-qmark-hyphen" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Remove the characters <code class="docutils literal notranslate"><span class="pre">?</span></code> and <code class="docutils literal notranslate"><span class="pre">-</span></code> from strings in the column.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_qmark_hyphen&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-punctuation">
+<h3>remove_punctuation<a class="headerlink" href="#remove-punctuation" title="Link to this heading">¶</a></h3>
+<p>Remove most punctuation from strings in the column. This transform removes these characters:
+<code class="docutils literal notranslate"><span class="pre">?</span> <span class="pre">-</span> <span class="pre">\</span> <span class="pre">/</span> <span class="pre">&quot;</span> <span class="pre">'</span> <span class="pre">:</span> <span class="pre">,</span> <span class="pre">.</span> <span class="pre">[</span> <span class="pre">]</span> <span class="pre">{</span> <span class="pre">}</span></code>.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_punctuation&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="replace-apostrophe">
+<h3>replace_apostrophe<a class="headerlink" href="#replace-apostrophe" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Replace each apostrophe <code class="docutils literal notranslate"><span class="pre">'</span></code> with a space.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;replace_apostrophe&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-alternate-names">
+<h3>remove_alternate_names<a class="headerlink" href="#remove-alternate-names" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. If a string in the column contains the string <code class="docutils literal notranslate"><span class="pre">or</span></code> (“or” surrounded by spaces),
+then remove the <code class="docutils literal notranslate"><span class="pre">or</span></code> and all following characters.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_alternate_names&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-suffixes">
+<h3>remove_suffixes<a class="headerlink" href="#remove-suffixes" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Given a list of suffixes, remove them from the strings in the column.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span>
+        <span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_suffixes&quot;</span><span class="p">,</span>
+        <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;jr&quot;</span><span class="p">,</span> <span class="s2">&quot;sr&quot;</span><span class="p">,</span> <span class="s2">&quot;ii&quot;</span><span class="p">,</span> <span class="s2">&quot;iii&quot;</span><span class="p">]</span>
+    <span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-stop-words">
+<h3>remove_stop_words<a class="headerlink" href="#remove-stop-words" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Remove last words from names such as street names.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span>
+        <span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_stop_words&quot;</span><span class="p">,</span>
+        <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;avenue&#39;</span><span class="p">,</span> <span class="s1">&#39;blvd&#39;</span><span class="p">,</span> <span class="s1">&#39;circle&#39;</span><span class="p">,</span> <span class="s1">&#39;court&#39;</span><span class="p">,</span> <span class="s1">&#39;road&#39;</span><span class="p">,</span> <span class="s1">&#39;street&#39;</span><span class="p">]</span>
+    <span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-prefixes">
+<h3>remove_prefixes<a class="headerlink" href="#remove-prefixes" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Remove prefixes like “Ms.”, “Mr.”, or “Mrs.” from names.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># In some census data, &quot;ah&quot; is a prefix from Chinese names.</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_prefixes&quot;</span><span class="p">,</span> <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;ah&quot;</span><span class="p">]}]</span>
+</pre></div>
+</div>
+</section>
+<section id="condense-strip-whitespace">
+<h3>condense_strip_whitespace<a class="headerlink" href="#condense-strip-whitespace" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. Take white space that may be more than one character or contain
+non-space characters and replace it with a single space.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;condense_strip_whitespace&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="remove-one-letter-names">
+<h3>remove_one_letter_names<a class="headerlink" href="#remove-one-letter-names" title="Link to this heading">¶</a></h3>
+<p>Used in name cleaning. If a name is a single character, remove it and leave the white space behind.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;remove_one_letter_names&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="split">
+<h3>split<a class="headerlink" href="#split" title="Link to this heading">¶</a></h3>
+<p>Split the column value on space characters.</p>
+<p>Maps string → array of string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;namefrst_split&quot;</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;namefrst_clean&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;split&quot;</span><span class="p">}]</span>
+</pre></div>
+</div>
+</section>
+<section id="array-index">
+<h3>array_index<a class="headerlink" href="#array-index" title="Link to this heading">¶</a></h3>
+<p>If the column contains an array, select the element at the given position.</p>
+<p>This can be used as the input to another transform. In the example below, the first transform selects the second (index 1) item from the “namefrst_split” column that contains a set of names split on white space. Then the substring with values [0, 1] is selected, which gives the first initial of the person’s probable middle name.</p>
+<p>Maps array of T → T.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;namefrst_mid_init&quot;</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;namefrst_split&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;array_index&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">1</span><span class="p">},</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;substring&quot;</span><span class="p">,</span> <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">]}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="mapping">
+<h3>mapping<a class="headerlink" href="#mapping" title="Link to this heading">¶</a></h3>
+<p>Map single or multiple values to a single output value, otherwise known as a “recoding.”</p>
+<p>Maps T → U.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;birthyr&quot;</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;clean_birthyr&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span>
+        <span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;mapping&quot;</span><span class="p">,</span>
+        <span class="n">values</span> <span class="o">=</span> <span class="p">[</span>
+            <span class="p">{</span><span class="s2">&quot;from&quot;</span><span class="o">=</span><span class="p">[</span><span class="mi">9999</span><span class="p">,</span><span class="mi">1999</span><span class="p">],</span> <span class="s2">&quot;to&quot;</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">},</span>
+            <span class="p">{</span><span class="s2">&quot;from&quot;</span> <span class="o">=</span> <span class="o">-</span><span class="mi">9998</span><span class="p">,</span> <span class="s2">&quot;to&quot;</span> <span class="o">=</span> <span class="mi">9999</span><span class="p">}</span>
+        <span class="p">]</span>
+    <span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="substring">
+<h3>substring<a class="headerlink" href="#substring" title="Link to this heading">¶</a></h3>
+<p>Replace a column with a substring of the data in the column.</p>
+<p>Maps string → string.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;substring&quot;</span><span class="p">,</span> <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">]}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="divide-by-int">
+<h3>divide_by_int<a class="headerlink" href="#divide-by-int" title="Link to this heading">¶</a></h3>
+<p>Divide data in a column by an integer value. It may leave a non-integer result.</p>
+<p>For instance, the following example takes the birthplace variable and converts it
+from the detailed version to the general version. The two least significant digits
+are detailed birthplace information; to make the more general version, we simply drop
+them by dividing by 100 and rounding to the lowest whole number (floor function).</p>
+<p>Maps numerical → numerical.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;bpl&quot;</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;bpl_root&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;divide_by_int&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">100</span><span class="p">},</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;get_floor&quot;</span><span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="when-value">
+<h3>when_value<a class="headerlink" href="#when-value" title="Link to this heading">¶</a></h3>
+<p>Apply conditional logic to replacement of values in a column. Works like the SQL <code class="docutils literal notranslate"><span class="pre">if()</span></code> or <code class="docutils literal notranslate"><span class="pre">case()</span></code> expressions in the SQL <code class="docutils literal notranslate"><span class="pre">select</span></code> clause.
+When the value of a column is <code class="docutils literal notranslate"><span class="pre">value</span></code>, replace it with <code class="docutils literal notranslate"><span class="pre">if_value</span></code>. Otherwise, replace it with <code class="docutils literal notranslate"><span class="pre">else_value</span></code>.</p>
+<p>The following example replaces all “race” IPUMS codes with 0 (white) or 1 (non-white). An IPUMS code of 100 is the “white” race category.</p>
+<p>Maps T → U.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;race&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;when_value&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">100</span><span class="p">,</span> <span class="n">if_value</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">else_value</span> <span class="o">=</span> <span class="mi">1</span><span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+<section id="get-floor">
+<h3>get_floor<a class="headerlink" href="#get-floor" title="Link to this heading">¶</a></h3>
+<p>Round down to the nearest whole number.</p>
+<p>This example produces the general version of the IPUMS “relate” variable. The variable
+is coded such that detailed categories are between the hundreds (300 is child of household
+head, 301 is simply ‘child’, 302 is adopted child, 303 is step-child for instance).
+The general categories are usually all that’s needed (1 == household head, 2 == spouse,
+3 == child, 4 == child-in-law, 5 == parent, 6 == parent-in-law, 7 == sibling, 12 == not related to head).</p>
+<p>Maps numerical → numerical.</p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">column_mappings</span><span class="p">]]</span>
+<span class="n">alias</span> <span class="o">=</span> <span class="s2">&quot;relate_div_100&quot;</span>
+<span class="n">column_name</span> <span class="o">=</span> <span class="s2">&quot;relate&quot;</span>
+<span class="n">transforms</span> <span class="o">=</span> <span class="p">[</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;divide_by_int&quot;</span><span class="p">,</span> <span class="n">value</span> <span class="o">=</span> <span class="mi">100</span><span class="p">},</span>
+    <span class="p">{</span><span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;get_floor&quot;</span><span class="p">}</span>
+<span class="p">]</span>
+</pre></div>
+</div>
+</section>
+</section>
+</section>
+
+
+          </div>
+          
+        </div>
+      </div>
+      <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
+        <div class="sphinxsidebarwrapper">
+<h1 class="logo"><a href="index.html">hlink</a></h1>
+
+
+
+
+
+
+
+
+<h3>Navigation</h3>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
+<li class="toctree-l1"><a class="reference internal" href="link_tasks.html">Link Tasks</a></li>
+<li class="toctree-l1"><a class="reference internal" href="running_the_program.html">Running hlink</a></li>
+<li class="toctree-l1"><a class="reference internal" href="use_examples.html">Advanced Workflows</a></li>
+<li class="toctree-l1"><a class="reference internal" href="config.html">Configuration</a></li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
+<ul class="current">
+<li class="toctree-l1 current"><a class="current reference internal" href="#">Column Mapping</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="#basic-usage">Basic Usage</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#advanced-usage">Advanced Usage</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#transforms">Transforms</a></li>
+</ul>
+</li>
+<li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
+<li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
+<li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
+<li class="toctree-l1"><a class="reference internal" href="substitutions.html">Substitutions</a></li>
+<li class="toctree-l1"><a class="reference internal" href="models.html">Models</a></li>
+</ul>
+
+<div class="relations">
+<h3>Related Topics</h3>
+<ul>
+  <li><a href="index.html">Documentation overview</a><ul>
+      <li>Previous: <a href="config.html" title="previous chapter">Configuration</a></li>
+      <li>Next: <a href="comparison_types.html" title="next chapter">Comparison types, transform add-ons, aggregate features, and household aggregate features</a></li>
+  </ul></li>
+</ul>
+</div>
+<div id="searchbox" style="display: none" role="search">
+  <h3 id="searchlabel">Quick search</h3>
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</div>
+<script>document.getElementById('searchbox').style.display = "block"</script>
+
+
+
+
+
+
+
+
+        </div>
+      </div>
+      <div class="clearer"></div>
+    </div>
+    <div class="footer">
+      &copy;2019-2022, IPUMS.
+      
+      |
+      Powered by <a href="http://sphinx-doc.org/">Sphinx 7.2.6</a>
+      &amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
+      
+      |
+      <a href="_sources/column_mappings.md.txt"
+          rel="nofollow">Page source</a>
+    </div>
+
+    
+
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/docs/comparison_types.html b/docs/comparison_types.html
index cce9b73..cab8535 100644
--- a/docs/comparison_types.html
+++ b/docs/comparison_types.html
@@ -14,7 +14,7 @@
     <link rel="index" title="Index" href="genindex.html" />
     <link rel="search" title="Search" href="search.html" />
     <link rel="next" title="Feature Selection transforms" href="feature_selection_transforms.html" />
-    <link rel="prev" title="Column mapping transforms" href="column_mapping_transforms.html" />
+    <link rel="prev" title="Column Mappings" href="column_mappings.html" />
    
   <link rel="stylesheet" href="_static/custom.css" type="text/css" />
   
@@ -1270,7 +1270,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul class="current">
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Comparison Types</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="#comparison-types">Comparison types</a></li>
 <li class="toctree-l2"><a class="reference internal" href="#feature-add-ons">Feature add-ons</a></li>
@@ -1288,7 +1288,7 @@ <h3>Navigation</h3>
 <h3>Related Topics</h3>
 <ul>
   <li><a href="index.html">Documentation overview</a><ul>
-      <li>Previous: <a href="column_mapping_transforms.html" title="previous chapter">Column mapping transforms</a></li>
+      <li>Previous: <a href="column_mappings.html" title="previous chapter">Column Mappings</a></li>
       <li>Next: <a href="feature_selection_transforms.html" title="next chapter">Feature Selection transforms</a></li>
   </ul></li>
 </ul>
diff --git a/docs/config.html b/docs/config.html
index c0736bd..feb8c15 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -13,7 +13,7 @@
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <link rel="index" title="Index" href="genindex.html" />
     <link rel="search" title="Search" href="search.html" />
-    <link rel="next" title="Column mapping transforms" href="column_mapping_transforms.html" />
+    <link rel="next" title="Column Mappings" href="column_mappings.html" />
     <link rel="prev" title="Advanced Workflow Examples" href="use_examples.html" />
    
   <link rel="stylesheet" href="_static/custom.css" type="text/css" />
@@ -36,20 +36,20 @@ <h1>Configuration<a class="headerlink" href="#configuration" title="Link to this
 <ol class="simple">
 <li><p><a class="reference external" href="#basic-config-file">Basic Example Config File</a></p></li>
 <li><p><a class="reference external" href="#advanced-config-file">Advanced Example Config File</a></p></li>
-<li><p><a class="reference external" href="#top-level-configs">Top level configs</a></p></li>
-<li><p><a class="reference external" href="#data-sources">Data sources</a></p></li>
+<li><p><a class="reference external" href="#top-level-configs">Top-Level Configs</a></p></li>
+<li><p><a class="reference external" href="#data-sources">Data Sources</a></p></li>
 <li><p><a class="reference external" href="#filter">Filter</a></p></li>
-<li><p><a class="reference external" href="#column-mappings">Column mappings</a></p></li>
-<li><p><a class="reference external" href="#substitution-columns">Substitution columns</a></p></li>
-<li><p><a class="reference external" href="#feature-selections">Feature selections</a></p></li>
-<li><p><a class="reference external" href="#potential-matches-universe">Potential matches universe</a></p></li>
+<li><p><a class="reference external" href="#column-mappings">Column Mappings</a></p></li>
+<li><p><a class="reference external" href="#substitution-columns">Substitution Columns</a></p></li>
+<li><p><a class="reference external" href="#feature-selections">Feature Selections</a></p></li>
+<li><p><a class="reference external" href="#potential-matches-universe">Potential Matches Universe</a></p></li>
 <li><p><a class="reference external" href="#blocking">Blocking</a></p></li>
 <li><p><a class="reference external" href="#comparisons">Comparisons</a></p></li>
-<li><p><a class="reference external" href="#household-comparisons">Household comparisons</a></p></li>
-<li><p><a class="reference external" href="#comparison-features">Comparison features</a></p></li>
-<li><p><a class="reference external" href="#pipeline-generated-features">Pipeline-generated features</a></p></li>
-<li><p><a class="reference external" href="#training-and-models">Training and models</a></p></li>
-<li><p><a class="reference external" href="#household-training-and-models">Household training and models</a></p></li>
+<li><p><a class="reference external" href="#household-comparisons">Household Comparisons</a></p></li>
+<li><p><a class="reference external" href="#comparison-features">Comparison Features</a></p></li>
+<li><p><a class="reference external" href="#pipeline-generated-features">Pipeline-Generated Features</a></p></li>
+<li><p><a class="reference external" href="#training-and-models">Training and Models</a></p></li>
+<li><p><a class="reference external" href="#household-training-and-models">Household Training and Models</a></p></li>
 </ol>
 <section id="basic-config-file">
 <h2>Basic Config File<a class="headerlink" href="#basic-config-file" title="Link to this heading">¶</a></h2>
@@ -473,17 +473,35 @@ <h2>Filter<a class="headerlink" href="#filter" title="Link to this heading">¶</
 </div>
 </section>
 <section id="column-mappings">
-<h2><a class="reference internal" href="column_mapping_transforms.html"><span class="doc">Column Mappings</span></a><a class="headerlink" href="#column-mappings" title="Link to this heading">¶</a></h2>
+<h2><a class="reference internal" href="column_mappings.html"><span class="doc">Column Mappings</span></a><a class="headerlink" href="#column-mappings" title="Link to this heading">¶</a></h2>
 <ul class="simple">
 <li><p>Header name: <code class="docutils literal notranslate"><span class="pre">column_mappings</span></code></p></li>
-<li><p>Description: Base column mappings and transformations to extract from your input datasets.</p></li>
+<li><p>Description: Base column mappings and transformations to extract from your
+input datasets. Each column mapping requires a <code class="docutils literal notranslate"><span class="pre">column_name</span></code> which tells it
+which input column to read from. Optionally, you may provide an <code class="docutils literal notranslate"><span class="pre">alias</span></code> for
+the column and <code class="docutils literal notranslate"><span class="pre">transforms</span></code> to modify it as it is read in. There are some additional
+attributes listed below that are meant for advanced usage. These are described
+in more detail on the <a class="reference internal" href="column_mappings.html"><span class="doc">column mappings</span></a> page.</p></li>
 <li><p>Required: True</p></li>
 <li><p>Type: List</p></li>
 <li><p>Attributes:</p>
 <ul>
-<li><p><code class="docutils literal notranslate"><span class="pre">alias</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Optional; if not specified the new column name defaults to <code class="docutils literal notranslate"><span class="pre">column_name</span></code>. New name of column.</p></li>
-<li><p><code class="docutils literal notranslate"><span class="pre">column_name</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Name of column in input data. Used as the name of the output column if <code class="docutils literal notranslate"><span class="pre">alias</span></code> is not specified.</p></li>
-<li><p><code class="docutils literal notranslate"><span class="pre">transforms</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">List</span></code>. Optional. A list of transforms to apply, in order, to the input data. See the <a class="reference internal" href="column_mapping_transforms.html"><span class="doc">column mapping transforms</span></a> section for more information.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">column_name</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. The name of the column in the input data.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">alias</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Optional. The new name of the column to use
+in hlink. By default, this is the same as <code class="docutils literal notranslate"><span class="pre">column_name</span></code>.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">transforms</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">List</span></code>. Optional. A list of transforms to apply, in
+order, to the input data. See the <a class="reference external" href="column_mappings.html#transforms">column mapping transforms</a>
+section for more information.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">set_value_column_a</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">Any</span></code>. Optional. Set all records for dataset
+A to the given literal value.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">set_value_column_b</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">Any</span></code>. Optional. Set all records for dataset
+B to the given literal value.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">override_column_a</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Optional. Read from this column in dataset A
+instead of the column specified with <code class="docutils literal notranslate"><span class="pre">column_name</span></code>.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">override_column_b</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Optional. Read from this column in dataset B
+instead of the column specified with <code class="docutils literal notranslate"><span class="pre">column_name</span></code>.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">override_transforms</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">List</span></code>. Optional. Transforms to apply to the override
+column specified with <code class="docutils literal notranslate"><span class="pre">override_column_a</span></code> or <code class="docutils literal notranslate"><span class="pre">override_column_b</span></code>.</p></li>
 </ul>
 </li>
 </ul>
@@ -877,7 +895,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
@@ -890,7 +908,7 @@ <h3>Related Topics</h3>
 <ul>
   <li><a href="index.html">Documentation overview</a><ul>
       <li>Previous: <a href="use_examples.html" title="previous chapter">Advanced Workflow Examples</a></li>
-      <li>Next: <a href="column_mapping_transforms.html" title="next chapter">Column mapping transforms</a></li>
+      <li>Next: <a href="column_mappings.html" title="next chapter">Column Mappings</a></li>
   </ul></li>
 </ul>
 </div>
diff --git a/docs/feature_selection_transforms.html b/docs/feature_selection_transforms.html
index c550ee8..813d48d 100644
--- a/docs/feature_selection_transforms.html
+++ b/docs/feature_selection_transforms.html
@@ -178,7 +178,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul class="current">
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Feature Selection</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="#bigrams">bigrams</a></li>
diff --git a/docs/genindex.html b/docs/genindex.html
index 48945c2..2705398 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -62,7 +62,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/index.html b/docs/index.html
index 62ee47d..8c14a90 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -90,28 +90,10 @@ <h1>Configuration API<a class="headerlink" href="#configuration-api" title="Link
 <div class="toctree-wrapper compound">
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a><ul>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#add-to-a">add_to_a</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#concat-to-a">concat_to_a</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#concat-to-b">concat_to_b</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#lowercase-strip">lowercase_strip</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#rationalize-name-words">rationalize_name_words</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-qmark-hyphen">remove_qmark_hyphen</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-punctuation">remove_punctuation</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#replace-apostrophe">replace_apostrophe</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-alternate-names">remove_alternate_names</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-suffixes">remove_suffixes</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-stop-words">remove_stop_words</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-prefixes">remove_prefixes</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#condense-strip-whitespace">condense_strip_whitespace</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#remove-one-letter-names">remove_one_letter_names</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#split">split</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#array-index">array_index</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#mapping">mapping</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#substring">substring</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#divide-by-int">divide_by_int</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#when-value">when_value</a></li>
-<li class="toctree-l2"><a class="reference internal" href="column_mapping_transforms.html#get-floor">get_floor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="column_mappings.html#basic-usage">Basic Usage</a></li>
+<li class="toctree-l2"><a class="reference internal" href="column_mappings.html#advanced-usage">Advanced Usage</a></li>
+<li class="toctree-l2"><a class="reference internal" href="column_mappings.html#transforms">Transforms</a></li>
 </ul>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a><ul>
@@ -178,7 +160,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/installation.html b/docs/installation.html
index fd825c8..0afb96e 100644
--- a/docs/installation.html
+++ b/docs/installation.html
@@ -90,7 +90,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/introduction.html b/docs/introduction.html
index 8c5e3a9..f115332 100644
--- a/docs/introduction.html
+++ b/docs/introduction.html
@@ -102,7 +102,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/link_tasks.html b/docs/link_tasks.html
index c2fb1ce..d5a2d72 100644
--- a/docs/link_tasks.html
+++ b/docs/link_tasks.html
@@ -255,7 +255,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/models.html b/docs/models.html
index 2625ef3..01ccd6d 100644
--- a/docs/models.html
+++ b/docs/models.html
@@ -160,7 +160,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul class="current">
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/objects.inv b/docs/objects.inv
index c518b11b454fad0f0089602b66d78c8919cc172c..fb7e2ccc0cd8cb69a17ad5c100c2a6a6f5417e46 100644
GIT binary patch
delta 417
zcmV;S0bc%(1ds%fd4H43P6HtnhW9)L?yZSicgAR9qDf;-x+KTpl$n%UxM;icHGDmv
zL>LBV&_;Lg{r|aefX>LMG*e*hpoOPC@GEXQy1JxYD_!(KNqRFv`Zz(-8D+r@-e?w%
z7JnpYwVI()I?{6*9B98YPEp`|WpqLyghK%V=?)=8mu;Xq?SD=5*c-`DI8rZqH;A@_
zOlsY{jt;_LG?{{!M~Ph~?7NHR41f7b-n-(Uo9dX)<B32$XpV<5rGyuhIccRO!F^tm
z*)~;PIJPL$ERhC7dpYR+=lh4J$rcsrFrMcu#rptKj=MI^%xk@wq_E%&kKOOEnb)wH
zklY0e;P=h2W`9~j9@3sZ=RHw|qioED6yxK#&ZZO7vd2{wzTHvcG&eg~Om&gfXRxgC
zODqZ26V4Zb=|Lc$AJL-PKW(g!p-)3=x^Ypbs}(Ht$gDHMfm-%+LN2+UicqeXcm&t=
zZTZ5AdsFl^2qZqf_%MW+ucbMABTR8iMAA=UcmQQ(I~uelFLBrW6?B)Tv^8$Olje=y
LPCEGl-*uD-smIQT

delta 416
zcmV;R0bl-*1djxed4E&OPQx$|y!$K6og#7Nh7d?dAXOBp$7maGlc;{l4`^@v2EWIb
zV8=0zX$v=dW_ETxwp+!$kc|MX2iY~wfV5j>gr(N88>U*ixTftn(N*sRr*|VwCz=+T
zv`T0&gH^I|2aUf>Yc6|bl#b|Sv7^A)LdgI@*BR`<A>AT4Z-1f;B%{6Z7JJ3n2}PJi
zZilX(AR@VMTzdm<P?AhVjYW=aM9u{BmoE7}Bn4e%#XO!4<scdEM@tSH<k2*v$>1?A
zNs&#RXNooQkR?*5XwL_^`}+8_G|0R_>BjS1gnH|M^Kn<(6g>AUNGNkgG2b4S6+D*}
zgv2Qt2X<G5Re#V_@)Y*;HSUQ}40$C^ND@Ae>nJ*rmOd;p@uT(}r`+_QG4#bIKZALX
zBQhB*CpcaNh6jPHJfc~*f5uoITpvQKrZHa1@dNePXrnp<Vm{9aI(fY$B3&=B2&V1p
z_6bYsRpOV>A+dgA-Qaw@mg?*kG1bi>5`I#Hdk~t}p&8BbRCmkXK(-;JwKBUcSMT(G
K;>kDcY@nLMgvD6^

diff --git a/docs/pipeline_features.html b/docs/pipeline_features.html
index efab01e..f0c76ca 100644
--- a/docs/pipeline_features.html
+++ b/docs/pipeline_features.html
@@ -109,7 +109,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul class="current">
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Pipeline Features</a><ul>
diff --git a/docs/running_the_program.html b/docs/running_the_program.html
index fb3278b..3840045 100644
--- a/docs/running_the_program.html
+++ b/docs/running_the_program.html
@@ -299,7 +299,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/search.html b/docs/search.html
index a8c9588..f9b4670 100644
--- a/docs/search.html
+++ b/docs/search.html
@@ -91,7 +91,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/searchindex.js b/docs/searchindex.js
index 391719c..7fd47bc 100644
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["column_mapping_transforms", "comparison_types", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "filenames": ["column_mapping_transforms.md", "comparison_types.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "titles": ["Column mapping transforms", "Comparison types, transform add-ons, aggregate features, and household aggregate features", "Configuration", "Feature Selection transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "terms": {"each": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "header": [0, 1, 2, 3, 9, 11], "below": [0, 1, 2, 3, 8, 9, 10], "repres": [0, 1, 2, 3, 9, 10], "type": [0, 2, 3, 4, 7, 8, 10, 11, 12], "ar": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "context": [0, 1, 3, 9], "column_map": [0, 2, 7], "list": [0, 1, 2, 3, 4, 8, 10, 11], "oper": [0, 2], "singl": [0, 2, 10, 12], "input": [0, 1, 2, 3, 6, 7, 10, 11], "output": [0, 1, 2, 3, 6, 7, 10, 12], "some": [0, 1, 3, 6, 7, 10], "suffix": [0, 4], "b": [0, 1, 2, 4, 10], "These": [0, 1, 2, 3, 6, 7, 8, 9], "mean": [0, 2], "appli": [0, 2, 3, 7, 12], "from": [0, 1, 2, 4, 6, 7, 8, 9, 10, 12], "onli": [0, 1, 2, 7, 12], "one": [0, 1, 2, 4, 7], "two": [0, 1, 2, 3, 6, 7, 9, 10, 12], "dataset": [0, 1, 2, 6, 7, 10, 12], "link": [0, 1, 2, 4, 6, 8], "A": [0, 1, 2, 9, 10], "most": [0, 1, 7, 10], "both": [0, 1, 2, 7, 12], "independ": [0, 2], "more": [0, 1, 2, 9, 10, 12], "than": [0, 1, 2, 8], "can": [0, 1, 2, 5, 6, 7, 8, 10, 12], "order": [0, 2, 7], "thei": [0, 1, 2, 7, 10], "re": 0, "configur": 
[0, 1, 6, 10, 12], "file": [0, 1, 4, 6, 7, 10, 11, 12], "so": [0, 1, 2, 5, 12], "mai": [0, 2, 6, 7, 10], "anoth": [0, 1, 3, 7], "format": 0, "The": [0, 1, 2, 3, 5, 7, 8, 9, 10, 12], "letter": [0, 4], "t": [0, 1, 2, 3, 12], "u": 0, "arbitrari": 0, "specifi": [0, 1, 2, 6, 7, 9, 10, 11], "column_nam": [0, 1, 2, 11], "attribut": [0, 1, 2, 3, 7, 8, 9, 10, 11], "under": [0, 1, 2], "its": [0, 1, 6, 10], "section": [0, 1, 2, 12], "name": [0, 1, 2, 4, 10, 11], "i": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12], "alia": [0, 2, 7], "along": [0, 1], "which": [0, 1, 2, 3, 6, 7, 9, 10, 12], "must": [0, 1, 2, 8, 9, 11], "addit": [0, 1, 2, 3, 5, 6, 10], "vari": [0, 2], "inform": [0, 1, 2, 10], "given": [0, 1, 2, 3, 8, 12], "often": [0, 10], "just": [0, 1, 2, 10, 12], "valu": [0, 1, 2, 4, 8, 9, 10, 11], "expect": [0, 1], "an": [0, 1, 2, 3, 6, 8, 10], "exampl": [0, 1, 2], "namefrst_split": [0, 2], "namefrst_clean": [0, 2], "add": [0, 4], "_": [0, 1, 2, 3, 4, 8, 9, 10], "numer": [0, 1], "11": [0, 2, 5, 9], "concat": [0, 4], "concaten": [0, 1], "string": [0, 1, 2, 3, 7, 8, 10, 11], "end": [0, 1, 2, 3, 11], "lowercas": [0, 4], "strip": [0, 4, 7], "clean": [0, 6], "convert": [0, 1, 2], "alphabet": 0, "charact": 0, "lower": [0, 1], "case": [0, 1, 2, 3, 6], "white": 0, "space": [0, 2, 3, 11], "start": [0, 11], "ration": [0, 4], "word": [0, 4], "replac": [0, 1, 4], "sinc": [0, 2], "peopl": [0, 1, 6, 10], "": [0, 1, 2, 6, 7, 10, 11], "raw": [0, 2, 7, 10], "censu": [0, 7, 12], "data": [0, 1, 4, 6, 7, 10], "contain": [0, 1, 11], "lead": 0, "better": [0, 6], "match": [0, 1, 4, 6, 10, 11, 12], "remov": [0, 4], "qmark": [0, 4], "hyphen": [0, 4], "punctuat": [0, 4], "thi": [0, 1, 2, 5, 6, 7, 9, 10, 12], "apostroph": [0, 4], "altern": [0, 2, 4], "If": [0, 1, 2, 3, 7, 8, 10, 11], "surround": 0, "all": [0, 1, 2, 3, 7, 8, 9, 10], "follow": [0, 1, 6, 10, 11, 12], "them": [0, 1, 2, 7], "jr": [0, 2], "sr": [0, 2], "ii": [0, 2], "iii": [0, 2], "stop": [0, 4], "last": [0, 1, 7, 9], "street": [0, 1], "avenu": [0, 
11], "blvd": 0, "circl": 0, "court": 0, "road": 0, "prefix": [0, 4], "like": [0, 2, 7, 10], "m": [0, 1], "mr": 0, "In": [0, 1, 6, 10, 12], "ah": 0, "chines": 0, "condens": [0, 4], "whitespac": [0, 4, 7], "take": [0, 1, 2, 3, 7, 10], "non": 0, "leav": 0, "behind": 0, "arrai": [0, 2, 4, 8, 9], "index": [0, 4, 5], "select": [0, 1, 4, 6, 10, 12], "element": 0, "posit": [0, 1, 2, 4, 6], "first": [0, 1, 2, 5, 7, 10, 11], "second": [0, 1, 2, 11], "1": [0, 1, 2, 4, 7, 8, 9, 10, 12], "item": 0, "namefrst": [0, 1, 2], "set": [0, 1, 2, 3, 6, 7, 10, 12], "Then": [0, 5], "0": [0, 1, 2, 7, 8, 9, 10, 12], "give": [0, 2], "initi": [0, 1, 10], "person": [0, 1, 6], "probabl": [0, 2, 8], "middl": [0, 1], "namefrst_mid_init": [0, 1], "multipl": [0, 1, 2, 10], "otherwis": [0, 1, 9, 12], "known": 0, "recod": 0, "birthyr": [0, 2], "clean_birthyr": [0, 2, 3], "9999": [0, 2, 9], "1999": [0, 2], "9998": 0, "divid": [0, 4], "int": [0, 1, 2, 3, 4, 8], "integ": [0, 1, 2, 9], "It": [0, 1, 2, 6, 10, 12], "result": [0, 1, 6, 9, 10, 12], "For": [0, 1, 2, 7, 10, 12], "instanc": [0, 8], "birthplac": 0, "variabl": [0, 1, 2, 12], "detail": [0, 2, 10], "version": [0, 5, 12], "gener": [0, 1, 4, 6, 7, 10], "least": [0, 1], "signific": 0, "digit": 0, "make": [0, 1, 2, 5, 12], "we": [0, 1, 10, 12], "simpli": [0, 2], "drop": [0, 2, 10], "100": [0, 2, 12], "round": [0, 2], "lowest": 0, "whole": [0, 6], "number": [0, 1, 2, 7, 8, 10], "floor": [0, 4], "function": [0, 1, 2, 6, 10], "bpl": [0, 1, 2], "bpl_root": 0, "when": [0, 1, 2, 3, 4, 7, 12], "condit": [0, 1, 2, 3, 4, 7], "logic": 0, "work": [0, 1, 2, 5, 7, 10, 12], "sql": [0, 1, 2, 3, 4, 7, 10], "express": [0, 1, 2], "claus": [0, 1], "if_valu": 0, "else_valu": 0, "race": [0, 1, 2, 9, 12], "ipum": [0, 6], "code": [0, 1, 2, 5], "categori": [0, 8], "get": [0, 1, 4, 10], "down": [0, 6, 12], "nearest": 0, "produc": [0, 10], "relat": [0, 1, 2], "between": [0, 1, 2, 6, 7, 10, 12], "hundr": 0, "300": 0, "child": [0, 8], "household": [0, 4, 6, 8, 10, 12], "head": 0, 
"301": 0, "302": 0, "adopt": 0, "303": 0, "step": [0, 1, 2, 6], "usual": [0, 7, 12], "need": [0, 1, 2, 7, 10, 12], "2": [0, 1, 2, 3, 7, 8, 11, 12], "spous": 0, "3": [0, 1, 2, 5, 7, 8, 9, 12], "4": [0, 1, 8], "law": 0, "5": [0, 1, 2, 8, 9, 10, 12], "parent": [0, 1, 11], "6": [0, 2, 8, 9, 12], "7": [0, 1, 2, 8, 12], "sibl": 0, "12": [0, 5], "relate_div_100": [0, 1, 2], "page": [1, 10], "ha": [1, 2, 6, 10, 12], "differ": [1, 2, 4, 6, 7], "avail": [1, 2, 3, 5, 7, 8, 9, 12], "comparison_featur": [1, 2, 7], "relatematch": [1, 2], "comparison_typ": [1, 2], "categor": [1, 2, 8, 9], "true": [1, 2, 3, 7, 9, 11, 12], "maximum": [1, 8], "jaro": [1, 9], "winkler": [1, 9], "find": [1, 7, 12], "greatest": 1, "among": 1, "cartesian": 1, "product": [1, 6, 12], "column": [1, 3, 4, 7, 9, 10, 11, 12], "namelast": [1, 2], "would": [1, 2, 12], "return": [1, 3, 8, 10], "four": 1, "namefrst_a": 1, "namefrst_b": 1, "namelast_b": 1, "namelast_a": 1, "requir": [1, 2, 3, 4, 7, 9, 10, 11], "maximum_jw": 1, "score": [1, 2, 7, 9], "compar": [1, 2, 6, 7], "namefrst_jw": [1, 2, 12], "geograph": 1, "filter": [1, 4, 7, 11], "major": [1, 10], "locat": [1, 2, 10], "chang": [1, 2, 5, 10, 12], "befor": [1, 2, 3, 5, 7], "boundari": 1, "zero": 1, "jw_street": 1, "enum_dist": 1, "max": [1, 8, 10], "member": [1, 7], "neighborhood": 1, "surnam": 1, "related_individual_max_jw": 1, "namefrst_rel": 1, "assert": [1, 10], "same": [1, 2, 6, 7, 10], "NOT": 1, "distinct": 1, "f1": 1, "evalu": [1, 2, 6, 7, 8], "either": [1, 2, 6, 11], "ani": [1, 2, 5, 8], "potenti": [1, 4, 7], "mismatch": 1, "queri": [1, 2], "fi": 1, "OR": 1, "mi0": 1, "mi1": 1, "THEN": 1, "els": [1, 2, 3], "first_init_col": 1, "namefrst_init": 1, "mid_init_col": 1, "namefrst_mid_init_2": 1, "f2": 1, "empti": 1, "null": [1, 2, 3], "AND": [1, 2], "individu": [1, 2, 7, 12], "mainli": 1, "caution": [1, 9], "flag": [1, 9, 10, 12], "f": [1, 10], "sp": 1, "m_caution": [1, 2, 9, 12], "mbpl": 1, "mother_birthyr": 1, "stepmom": 1, "momloc": 1, "comp_a": [1, 
2], "comp_b": [1, 2], "comp_c": 1, "parent_step_chang": 1, "comp_d": 1, "check": [1, 10], "sign": 1, "boolean": [1, 2, 3, 11, 12], "form": [1, 7, 11], "cast": 1, "col": 1, "namelast_equal_as_int": 1, "namelast_clean": [1, 2, 3], "whether": [1, 2, 11], "join": [1, 11], "across": 1, "record": [1, 2, 6, 7], "being": [1, 7], "exact": [1, 2], "namefrst_unstd": [1, 2], "present": [1, 2, 9], "nonzero": 1, "primarili": [1, 7], "indic": [1, 12], "kind": 1, "incompar": 1, "akin": 1, "miss": [1, 10], "see": [1, 2, 5, 10, 12], "also": [1, 2, 5, 6, 7, 9, 10, 12], "univers": [1, 4, 7], "similar": 1, "fbpl_nomatch": 1, "fbpl": 1, "allow": [1, 2, 7, 12], "up": [1, 2, 10, 11], "sub": 1, "object": [1, 2, 6, 10], "document": [1, 8, 10, 12], "option": [1, 2, 3, 6, 7, 8, 10, 12], "sp_caution": [1, 2, 12], "spouse_bpl": 1, "spouse_birthyr": 1, "durmarr": [1, 2], "new_marr": [1, 2], "appear": 1, "street_jw": [1, 2, 12], "counti": 1, "statefip": [1, 2], "9": 1, "multipli": 1, "togeth": [1, 2], "after": [1, 2, 4, 8, 10], "float": [1, 2, 8], "have": [1, 2, 5, 6, 7, 8, 10, 12], "comp": 1, "c": 1, "sploc": 1, "012": 1, "fals": [1, 2, 3, 4, 6, 10], "d": 1, "specif": [1, 2, 10], "circumst": 1, "should": [1, 2, 8, 9, 10], "mid_init_match": 1, "either_1": 1, "nativ": 1, "either_0": 1, "gen": 1, "imm": [1, 2, 12], "immigr": 1, "look": [1, 10, 11], "foreign": 1, "born": 1, "sgen": [1, 2, 12], "rel": [1, 2, 12], "scala": 1, "determin": [1, 7], "greater": [1, 5], "jw_threshold": 1, "ag": [1, 2, 3], "less": [1, 2], "age_threshold": 1, "sex": [1, 2, 11], "sampl": 1, "related_individual_row": 1, "unrel": 1, "depend": [1, 2, 5, 12], "name_col": 1, "birthyr_col": 1, "namefrst_related_row": 1, "replaced_birthyr": [1, 2, 3], "extra": 1, "children": 1, "who": 1, "base": [1, 2, 7], "count": [1, 10, 12], "suspect": [1, 6], "relate_col": 1, "histid_col": 1, "id": [1, 2], "birth": 1, "year": [1, 2, 3, 4], "year_b": 1, "wa": [1, 12], "taken": 1, "minimum": [1, 8], "accept": [1, 2, 12], "consid": [1, 8], "histid": 
[1, 2, 12], "1910": [1, 2, 12], "8": [1, 2, 5, 10], "rate": 1, "calcul": [1, 12], "percentag": 1, "seen": 1, "neighbor": 1, "meet": 1, "95": 1, "nbor": [1, 2, 12], "namelast_neighbor": 1, "05": [1, 2], "ad": [1, 2], "namelast_popularity_sum": 1, "namelast_popular": 1, "length": [1, 2, 9], "size": 1, "ab": 1, "diff": 1, "absolut": 1, "you": [1, 2, 5, 10, 11, 12], "invalid": [1, 8], "instead": [1, 2, 5, 7], "marriag": 1, "durat": 1, "99": [1, 2], "placehold": 1, "unknown": 1, "exclud": 1, "those": [1, 2], "consider": 1, "byrdiff": [1, 2, 12], "mardurmatch": [1, 2], "14": 1, "minu": [1, 2], "subtract": 1, "geo": 1, "distanc": [1, 8], "lookup": 1, "tabl": [1, 2, 4, 7, 10, 12], "hlink": [1, 2, 5, 6, 7, 12], "core": [1, 7, 10, 12], "dist_tabl": 1, "py": [1, 2], "There": [1, 3, 7], "sever": [1, 6], "wai": [1, 5, 10], "kei": [1, 7, 10], "key_count": 1, "secondari": 1, "serv": 1, "back": 1, "primari": [1, 6], "doe": [1, 7, 12], "particularli": 1, "state": [1, 6], "much": [1, 7], "fewer": [1, 8], "combin": [1, 2, 3, 7], "thu": 1, "risk": 1, "fill": 1, "aren": 1, "ex": 1, "even": 1, "though": 1, "distances_fil": 1, "path": [1, 2, 10, 11, 12], "table_nam": 1, "what": [1, 2, 10, 12], "want": [1, 2, 10, 12], "do": [1, 3, 10, 12], "read": [1, 2, 7, 10], "onc": [1, 10], "loc_a": 1, "where": [1, 7, 10, 12], "come": 1, "loc_b": 1, "distance_col": 1, "source_column_a": 1, "sourc": [1, 4, 7, 10, 12], "source_column_b": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b_0": 1, "loc_b_1": 1, "secondary_key_count": 1, "backup": 1, "secondary_table_nam": 1, "secondary_distances_fil": 1, "secondary_source_column": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_distance_col": 1, "state_dist": 1, "state_distance_lookup": 1, "county_state_dist": 1, "csv": [1, 2, 7, 10, 11, 12], "statecode1": 1, "statecode2": 1, "dist": 1, "county_dist": [1, 2, 12], "county_distance_lookup": 1, "county_1900_1910_distances_km": 1, "from_icpsrctyi": 1, "to_icpsrctyi": 1, "from_statefip": 1, "to_statefip": 1, 
"distance_km": 1, "state_1900_1910_distances_km": 1, "fetch": 1, "neither": 1, "nor": 1, "mpre": 1, "m_namefrst": 1, "accord": 1, "niu": 1, "other": [1, 2, 12], "mfbplmatch": 1, "multi": 1, "search": 1, "special": 1, "simplifi": 1, "particular": [1, 2], "constraint": 1, "num_col": 1, "whose": 1, "templat": 1, "n": [1, 8, 9], "per": [1, 2, 8, 9, 10], "current": [1, 2, 10], "respect": [1, 7], "jw_col_templ": 1, "jw": 1, "pair": [1, 12], "equal_and_not_null_templ": 1, "final": [1, 2, 12], "comput": [1, 3, 7], "_namefrst": 1, "_bpl": 1, "_sex": 1, "25": 1, "nvl": 1, "sm_namefrst": 1, "sn_namefrst": 1, "sm_bpl": 1, "sn_bpl": 1, "sm_sex": 1, "sn_sex": 1, "pass": [1, 2, 7, 8], "default": [1, 2, 7, 8, 10], "flexibl": 1, "user": [1, 10], "write": [1, 10, 12], "own": [1, 2], "favor": 1, "reason": 1, "good": 1, "fallback": 1, "defin": [1, 7, 8, 9, 10], "spark": [1, 2, 5, 8, 9, 10, 12], "builtin": 1, "argument": [1, 10, 12], "namelast_jw_max": 1, "namelast1": 1, "namelast2": 1, "namelast3": 1, "abov": [1, 5], "extend": 1, "beyond": 1, "standard": [1, 11], "top": [1, 4], "level": [1, 4, 10], "everi": 1, "jw_f": [1, 2, 12], "father_namefrst": 1, "rais": [1, 3], "exponenti": 1, "squar": 1, "county_distance_squar": [1, 2, 12], "county_a": 1, "county_b": 1, "upper": 1, "gt": 1, "btwn": 1, "addl": 1, "var": [1, 2], "program": [1, 2, 7, 12], "report": [1, 4, 6, 10], "addl_var": 1, "check_val_expr": 1, "else_v": 1, "volumn": 1, "datasourc": [1, 2, 10], "yrimmig": 1, "immyear_diff": [1, 2, 9, 12], "To": [1, 5, 7, 10], "includ": [1, 2, 7, 9, 10], "train": [1, 4, 6, 8, 10], "independent_var": [1, 2, 12], "config": [1, 4, 7, 10, 12], "id_column": [1, 2], "_a": 1, "mult": 1, "exist": [1, 2, 10], "within": [1, 2, 6, 10, 11], "hh_train": [1, 2, 7, 10, 12], "hh": 1, "highest": [1, 2], "against": [1, 11], "ten": [1, 2], "tell": 2, "how": [2, 7], "descript": [2, 8, 10], "refer": 2, "here": [2, 7, 10, 12], "tutori": [2, 10], "script": [2, 6, 10], "discuss": 2, "readm": 2, "note": 2, "written": 
[2, 6], "toml": [2, 6, 10], "abl": 2, "json": [2, 10], "datasource_a": [2, 7], "datasource_b": [2, 7], "transform": [2, 4, 6, 7], "lowercase_strip": 2, "add_to_a": 2, "10": [2, 5, 12], "age_2": 2, "derived_from": 2, "expand_length": 2, "explod": [2, 7], "jaro_winkl": 2, "namelast_jw": [2, 12], "threshold": [2, 8, 12], "feature_nam": 2, "79": 2, "84": 2, "complex": 2, "machin": [2, 6, 7, 10, 12], "learn": [2, 6, 7, 10, 12], "probabilist": [2, 6], "drop_data_from_scored_match": 2, "us1900": 2, "us1900m_usa": 2, "p": 2, "parquet": [2, 7], "us1910": 2, "us1910m_usa": 2, "training_data_subset": 2, "serialp": 2, "rationalize_name_word": 2, "remove_qmark_hyphen": 2, "replace_apostroph": 2, "remove_suffix": 2, "remove_alternate_nam": 2, "condense_strip_whitespac": 2, "split": [2, 3, 4, 7, 8, 9, 12], "namefrst_std": [2, 11], "array_index": 2, "bpl_orig": 2, "divide_by_int": 2, "get_floor": 2, "statefip_h": 2, "output_typ": 2, "substitution_column": [2, 7, 11], "join_column": [2, 11], "join_valu": [2, 11], "substitution_fil": [2, 11], "name_std": [2, 11], "male": [2, 11], "femal": [2, 11], "feature_select": [2, 3, 7], "input_column": [2, 3, 9], "output_column": [2, 3, 9], "sql_condit": 2, "namelast_bigram": 2, "bigram": [2, 4], "bpl_clean": 2, "bpl_str": 2, "washington": 2, "bpl2_str": 2, "53": 2, "region": [2, 12], "attach_vari": 2, "region_dict": 2, "col_to_join_on": 2, "col_to_add": 2, "null_fil": 2, "col_typ": 2, "potential_matches_univers": [2, 7], "birthyr_3": 2, "namefrst_std_jw": [2, 12], "75": [2, 8, 12], "comparis": 2, "post": [2, 7], "hh_comparison": [2, 7], "threshold_expr": 2, "fetch_a": 2, "sex_equ": 2, "equal": [2, 11], "relate_a": [2, 9], "pipeline_featur": [2, 7, 9], "sex_region_interact": 2, "transformer_typ": [2, 9], "interact": [2, 4, 7, 12], "relatetyp": [2, 9], "bucket": [2, 7], "hit": [2, 10, 12], "scale_data": [2, 12], "training_data": [2, 10], "dependent_var": [2, 12], "score_with_model": [2, 12], "use_training_data_featur": [2, 7, 12], 
"split_by_id_a": [2, 12], "decis": [2, 4, 8, 12], "drop_duplicate_with_threshold_ratio": [2, 12], "n_training_iter": [2, 7, 12], "output_suspicious_td": [2, 12], "param_grid": [2, 12], "model_paramet": [2, 7, 8, 12], "random_forest": [2, 12], "maxdepth": [2, 8, 12], "numtre": [2, 8, 12], "005": 2, "threshold_ratio": [2, 8, 12], "logistic_regress": [2, 12], "50": [2, 12], "65": 2, "80": 2, "chosen_model": [2, 8, 12], "prediction_col": 2, "predict": [2, 12], "hh_col": 2, "hh_training_data_1900_1910": 2, "probit": [2, 4], "go": [2, 10], "your": [2, 5, 7, 10, 12], "uniqu": 2, "identifi": [2, 6, 12], "full": [2, 7, 12], "short": 2, "alphanumer": 2, "convert_ints_to_long": 2, "automat": [2, 5, 7], "long": [2, 11], "especi": 2, "assum": 2, "schema": 2, "sometim": 2, "term": 2, "bigint": 2, "thing": 2, "my_fil": 2, "subset": [2, 11], "limit": 2, "extract": 2, "new": [2, 12], "iv": 2, "v": 2, "vi": 2, "vii": 2, "viii": 2, "namelast_clean_bigram": [2, 3], "creat": [2, 6, 7, 9, 10, 11, 12], "fed": [2, 7], "prep": 2, "df": [2, 10], "men": 2, "describ": [2, 10], "newli": 2, "attempt": 2, "duplic": [2, 8], "row": 2, "conjuct": 2, "Will": 2, "conjunct": 2, "rang": [2, 9], "original_valu": 2, "plu": 2, "1870": 2, "expand": 2, "1867": 2, "1868": 2, "1869": 2, "1871": 2, "1872": 2, "1873": 2, "kept": 2, "keep": 2, "appropri": 2, "treat": [2, 9], "import": [2, 7, 10, 12], "dure": [2, 7], "hot": 2, "encod": [2, 3], "vector": [2, 9], "stage": 2, "well": 2, "upper_threshold": 2, "cannot": 2, "robust": 2, "ml": [2, 4, 8, 9], "typic": [2, 7], "leverag": 2, "api": [2, 6, 9], "piplin": 2, "regionf": 2, "sex_regionf_interact": 2, "immyear_caut": [2, 9], "myriad": 2, "explor": [2, 4, 6, 10], "part": [2, 7], "task": [2, 4, 6, 8, 12], "drop_duplicate_a": 2, "time": [2, 7, 10], "out": [2, 7, 12], "best": [2, 7], "smallest": 2, "possibl": 2, "ratio": [2, 8], "beta": [2, 8], "test": [2, 7, 12], "model_explor": [2, 10, 12], "hyper": [2, 6, 12], "paramet": [2, 6, 7, 8, 10, 12], "eval": 2, "skip": 
[2, 7], "apply_model": 2, "run_all_step": [2, 10, 12], "command": [2, 6, 10, 12], "try": 2, "creation": 2, "iter": 2, "scale": 2, "error": [2, 9], "1900": [2, 12], "about": [2, 10, 12], "1930": [2, 12], "1940": [2, 12], "fail": 2, "were": 2, "sure": [2, 5, 10], "scratch": 2, "although": 2, "know": 2, "haven": 2, "could": 2, "save": [2, 7, 12], "small": 2, "amount": 2, "process": [2, 6, 10], "repeatedli": 2, "help": [2, 7, 10], "neg": [2, 4, 6], "area": 2, "coverag": 2, "increas": [2, 9], "represent": [2, 7], "ensur": 2, "group": [2, 7], "a304bt": 2, "three": [2, 7], "b200": 2, "c201": 2, "d425": 2, "perform": [2, 6, 7, 11], "feature_import": [2, 7, 12], "coeffici": [2, 7], "enabl": [2, 7, 10], "srace": [2, 9, 12], "race_interacted_srac": [2, 9, 12], "hits2": [2, 12], "exact_mult": [2, 12], "ncount": [2, 3, 12], "ncount2": [2, 3, 12], "f_interacted_jw_f": [2, 12], "f_caution": [2, 12], "f_pre": [2, 12], "fbplmatch": [2, 12], "m_interacted_jw_m": [2, 9, 12], "jw_m": [2, 9, 12], "m_pre": [2, 9, 12], "mbplmatch": [2, 12], "sp_interacted_jw_sp": [2, 12], "jw_sp": [2, 12], "sp_pre": [2, 12], "mi": [2, 12], "fsoundex": [2, 12], "lsoundex": [2, 12], "oth": [2, 12], "imm_interacted_immyear_caut": [2, 12], "1900_1910_training_data_20191023": 2, "jw_max_a": 2, "jw_max_b": 2, "f1_match": 2, "f2_match": 2, "byrdifcat": 2, "racematch": 2, "bplmatch": 2, "imm_interacted_bplmatch": 2, "sexmatch": 2, "relatetype_interacted_relatematch": 2, "checkpoint": 3, "override_column_a": 3, "override_column_b": 3, "set_value_column_a": 3, "set_value_column_b": 3, "no_first_pad": 3, "don": 3, "prepend": 3, "provid": [3, 6, 7, 9, 10], "namefrst_unstd_bigram": 3, "namelast_frst_bigram": 3, "namelast_clean_soundex": 3, "input_col": 3, "output_col": 3, "expon": 3, "introduct": 4, "overview": 4, "instal": 4, "pypi": 4, "preprocess": [4, 6, 10, 12], "model": [4, 6, 10], "run": [4, 5, 6, 7, 12], "librari": [4, 6], "mode": [4, 5, 12], "advanc": 4, "workflow": 4, "export": [4, 7, 10], "featur": [4, 6, 
7, 8, 10], "reus": 4, "basic": 4, "map": [4, 7, 9], "substitut": [4, 7], "block": [4, 7], "comparison": [4, 7], "pipelin": 4, "substr": 4, "ons": 4, "aggreg": 4, "union": 4, "soundex": 4, "power": 4, "regex": 4, "random": [4, 8], "forest": [4, 8], "logist": [4, 8], "regress": [4, 8], "tree": [4, 8], "gradient": [4, 8], "boost": [4, 8], "system": 5, "python": [5, 6, 10], "java": 5, "integr": 5, "apach": 5, "via": [5, 6], "pyspark": [5, 8, 9, 10], "packag": 5, "org": 5, "latest": 5, "pip": 5, "easiest": [5, 10], "through": [5, 7, 9, 10], "instruct": [5, 10], "But": 5, "clone": 5, "github": 5, "repositori": 5, "root": 5, "project": 5, "directori": [5, 10, 12], "develop": [5, 6], "e": 5, "dev": 5, "edit": 5, "made": 5, "built": 5, "tool": [5, 6], "line": [6, 10], "share": 6, "characterist": [6, 7], "correspond": [6, 7], "real": 6, "world": 6, "determinist": [6, 7], "rule": [6, 7], "algorithm": [6, 7], "At": [6, 7], "been": 6, "unit": 6, "census": 6, "hierarch": [6, 10], "structur": 6, "nest": 6, "howev": [6, 12], "tailor": 6, "ignor": 6, "common": [6, 7, 12], "highli": [6, 7], "languag": 6, "further": [6, 12], "broken": 6, "smaller": 6, "sequenc": 6, "linkrun": [6, 10], "prepar": [6, 7, 10], "research": 6, "experi": 6, "understand": 6, "tune": [6, 12], "relationship": 6, "varieti": 7, "normal": 7, "abbrevi": [7, 11], "regist": [7, 10], "datafram": [7, 10, 12], "request": 7, "classif": [7, 8], "metadata": 7, "introspect": 7, "ingest": 7, "inspect": 7, "mani": [7, 10], "aspect": [7, 10], "extens": 7, "longest": 7, "definit": 7, "reduc": 7, "drastic": 7, "improv": 7, "runtim": 7, "separ": 7, "total": 7, "potential_match": [7, 10], "satisfi": 7, "elig": 7, "support": [7, 8], "reshap": 7, "thought": 7, "ahead": 7, "chosen": 7, "experiment": [7, 10], "focus": 7, "demograph": 7, "moment": 7, "veri": [7, 12], "anyon": 7, "percent": 7, "remain": 7, "popul": 7, "pull": 7, "fix": 7, "width": 7, "crosswalk": 7, "construct": 7, "alpha": 8, "hyperparamet": [8, 12], "de": 8, "param": 
[8, 12], "label": 8, "doc": [8, 9], "commonli": 8, "explan": 8, "randomforestclassifi": 8, "depth": 8, "20": 8, "featuresubsetstrategi": 8, "node": 8, "auto": 8, "onethird": 8, "sqrt": 8, "log2": 8, "15": 8, "generalizedlinearregress": 8, "famili": 8, "binomi": 8, "85": [8, 10], "logisticregress": 8, "decisiontreeclassifi": 8, "mininstancespernod": 8, "caus": 8, "left": 8, "right": [8, 10], "discard": 8, "maxbin": 8, "bin": 8, "discret": 8, "continu": [8, 9, 12], "gbtclassifi": 8, "mother": 9, "point": [9, 12], "With": 9, "x": [9, 10], "y": 9, "hold": 9, "except": 9, "strictli": 9, "inf": 9, "explicitli": 9, "cover": 9, "doubl": 9, "outsid": 9, "job": 10, "high": 10, "class": 10, "handl": 10, "main": 10, "complet": 10, "access": [10, 12], "link_run": 10, "factori": 10, "sparkfactori": 10, "load_config": 10, "load_conf_fil": 10, "sparksess": 10, "now": 10, "let": 10, "load": 10, "our": 10, "my_conf": 10, "lr": 10, "prep_step": 10, "get_step": 10, "enumer": 10, "print": 10, "input_table_nam": 10, "output_table_nam": 10, "run_step": 10, "get_tabl": 10, "matches_df": 10, "hh_model_explor": 10, "method": [10, 12], "interfac": 10, "easili": 10, "conveni": 10, "adjust": 10, "set_loc": 10, "set_num_cor": 10, "set_executor_memori": 10, "5g": 10, "ll": 10, "dictionari": 10, "modul": 10, "pleas": 10, "reproduc": 10, "consol": 10, "cpu": 10, "usag": 10, "h": 10, "executor_memori": [10, 12], "execute_task": 10, "execute_command": 10, "conf": [10, 12], "show": 10, "messag": 10, "exit": 10, "memori": 10, "executor": 10, "begin": 10, "execut": 10, "seri": 10, "excute_command": 10, "filepath": 10, "sai": 10, "fullcount_1870_1880": 10, "pattern": 10, "full_count_1870_1880": 10, "prompt": 10, "enter": 10, "text": 10, "unstabl": 10, "topic": 10, "analyz": [10, 12], "set_preexisting_t": 10, "x_persist": 10, "borrow_t": 10, "get_task": 10, "set_print_sql": 10, "x_sql": 10, "x_sqlf": 10, "ipython": 10, "showf": 10, "x_summari": 10, "desc": 10, "x_crosswalk": 10, "x_tab": 10, "q": [10, 
12], "x_hh_tfam": 10, "x_tfam": 10, "drop_al": 10, "reload": 10, "x_hh_tfam_2a": 10, "x_tfam_raw": 10, "drop_all_prc": 10, "x_hh_tfam_2b": 10, "x_union": 10, "drop_all_temp": 10, "x_load": 10, "get_set": 10, "set_link_task": 10, "x_parquet_from_csv": 10, "organ": 10, "hierarchi": 10, "five": 10, "hh_match": 10, "someth": 10, "choic": 10, "preexist": 10, "prepped_df_a": 10, "prepped_df_b": 10, "raw_df_b": 10, "raw_df_a": 10, "training_featur": [10, 12], "scored_potential_match": 10, "potential_matches_prep": 10, "exploded_df_b": 10, "exploded_df_a": 10, "predicted_match": 10, "hh_training_featur": [10, 12], "hh_training_data": 10, "hh_predicted_match": 10, "hh_scored_potential_match": 10, "hh_potential_match": 10, "hh_blocked_match": 10, "hh_potential_matchs_prep": 10, "model_eval_training_vector": 10, "model_eval_training_data": 10, "model_eval_repeat_fp": 10, "model_eval_training_featur": 10, "model_eval_training_result": 10, "model_eval_repeat_fn": 10, "hh_model_eval_training_vector": 10, "hh_model_eval_repeat_fp": 10, "hh_model_eval_repeat_fn": 10, "hh_model_eval_training_result": 10, "hh_model_eval_training_featur": 10, "hh_model_eval_training_data": 10, "persist": 10, "hidden": 10, "intermedi": 10, "yet": 10, "databas": 10, "tablenam": 10, "istemporari": 10, "task_nam": 10, "num": 10, "finish": 10, "By": 10, "put": [10, 12], "launch": [10, 12], "my": [10, 12], "subhead": 11, "suppli": 11, "regex_word_replac": 11, "variant": 11, "av": 11, "7th": 11, "swap": 11, "still": 11, "anywher": 11, "proceed": 11, "street_unstd": 11, "dir": 11, "substitutions_street_abbrev": 11, "span": 12, "1920": 12, "deriv": 12, "necessari": 12, "scenario": 12, "copi": 12, "use_potential_matches_featur": 12, "full_count_1900_1910": 12, "50g": 12, "ask": 12, "arg": 12, "partit": 12, "training_data_1900_1910_hlink_featur": 12, "might": 12, "shut": 12, "framework": 12, "etc": 12, "relev": 12, "matrix": 12, "implement": 12, "regular": 12, "training_data_1900_1910": 12, "weren": 12, 
"ident": 12, "manual": 12, "updat": 12, "isn": 12, "analysi": 12, "training_result": 12, "hh_training_result": 12, "1900_1910_training_result": 12, "repeat_fp": 12, "repeat_fn": 12, "hh_repeat_fp": 12, "hh_repeat_fn": 12, "1900_1910_potential_fp": 12, "1900_1910_potential_fn": 12, "prefer": 12, "ve": 12}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"column": [0, 2], "map": [0, 2], "transform": [0, 1, 3, 9], "add_to_a": 0, "concat_to_a": 0, "concat_to_b": 0, "lowercase_strip": 0, "rationalize_name_word": 0, "remove_qmark_hyphen": 0, "remove_punctu": 0, "replace_apostroph": 0, "remove_alternate_nam": 0, "remove_suffix": 0, "remove_stop_word": 0, "remove_prefix": 0, "condense_strip_whitespac": 0, "remove_one_letter_nam": 0, "split": 0, "array_index": 0, "substr": 0, "divide_by_int": 0, "when_valu": 0, "get_floor": 0, "comparison": [1, 2], "type": [1, 9], "add": 1, "ons": 1, "aggreg": 1, "featur": [1, 2, 3, 9, 12], "household": [1, 2, 7], "maximum_jaro_winkl": 1, "jaro_winkl": 1, "jaro_winkler_street": 1, "max_jaro_winkl": 1, "equal": 1, "f1_match": 1, "f2_match": 1, "not_equ": 1, "equals_as_int": 1, "all_equ": 1, "not_zero_and_not_equ": 1, "time": 1, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "any_equ": 1, "either_are_1": 1, "either_are_0": 1, "second_gen_imm": 1, "rel_jaro_winkl": 1, "extra_children": 1, "jaro_winkler_r": 1, "sum": 1, "length_b": 1, "abs_diff": 1, "b_minus_a": 1, "geo_dist": 1, "fetch_a": 1, "fetch_b": 1, "present_both_year": 1, "neither_are_nul": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_and_equal_categorical_in_univers": 1, "multi_jaro_winkler_search": 1, "sql_condit": [1, 3], "alia": 1, "power": [1, 3], "threshold": 1, "lower_threshold": 1, "upper_threshold": 1, "gt_threshold": 1, "btwn_threshold": 1, "look_at_addl_var": 1, "hit": 1, "hits2": 1, "exact_mult": 1, "jw_max_a": 1, "jw_max_b": 1, "configur": [2, 4, 7], "basic": 2, "config": 2, "file": 2, 
"advanc": [2, 12], "top": 2, "level": 2, "data": [2, 11, 12], "sourc": [2, 5], "filter": 2, "substitut": [2, 11], "select": [2, 3], "potenti": [2, 12], "match": [2, 7], "univers": 2, "block": 2, "pipelin": [2, 9], "gener": [2, 9, 12], "train": [2, 7, 12], "model": [2, 7, 8, 12], "bigram": 3, "arrai": 3, "union": 3, "soundex": 3, "welcom": 4, "hlink": [4, 10], "": 4, "document": 4, "api": 4, "instal": 5, "requir": 5, "from": 5, "pypi": 5, "introduct": 6, "overview": [6, 7], "link": [7, 10, 12], "task": [7, 10], "preprocess": 7, "step": [7, 10], "relat": 7, "section": 7, "explor": [7, 12], "report": 7, "random_forest": 8, "probit": 8, "logistic_regress": 8, "decision_tre": 8, "gradient_boosted_tre": 8, "interact": [9, 10], "bucket": 9, "run": 10, "us": 10, "librari": 10, "mode": 10, "start": 10, "program": 10, "exampl": [10, 12], "workflow": [10, 12], "1": 11, "tabl": 11, "regex": 11, "word": 11, "replac": 11, "export": 12, "after": 12, "reus": 12, "differ": 12, "year": 12, "ml": 12, "list": 12, "fals": 12, "posit": 12, "neg": 12, "fp": 12, "fn": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 60}, "alltitles": {"Column mapping transforms": [[0, "column-mapping-transforms"]], "add_to_a": [[0, "add-to-a"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "lowercase_strip": [[0, "lowercase-strip"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_punctuation": [[0, "remove-punctuation"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_suffixes": [[0, "remove-suffixes"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_prefixes": [[0, 
"remove-prefixes"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "split": [[0, "split"]], "array_index": [[0, "array-index"]], "mapping": [[0, "mapping"]], "substring": [[0, "substring"]], "divide_by_int": [[0, "divide-by-int"]], "when_value": [[0, "when-value"]], "get_floor": [[0, "get-floor"]], "Comparison types, transform add-ons, aggregate features, and household aggregate features": [[1, "comparison-types-transform-add-ons-aggregate-features-and-household-aggregate-features"]], "Comparison types": [[1, "comparison-types"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "equals": [[1, "equals"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "not_equals": [[1, "not-equals"]], "equals_as_int": [[1, "equals-as-int"]], "all_equals": [[1, "all-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "and": [[1, "and"]], "times": [[1, "times"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "any_equals": [[1, "any-equals"]], "either_are_1": [[1, "either-are-1"]], "either_are_0": [[1, "either-are-0"]], "second_gen_imm": [[1, "second-gen-imm"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "extra_children": [[1, "extra-children"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "sum": [[1, "sum"]], "length_b": [[1, "length-b"]], "abs_diff": [[1, "abs-diff"]], "b_minus_a": [[1, "b-minus-a"]], "geo_distance": [[1, "geo-distance"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "present_both_years": [[1, "present-both-years"]], "neither_are_null": [[1, "neither-are-null"]], "present_and_matching_categorical": [[1, 
"present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "sql_condition": [[1, "sql-condition"], [3, "sql-condition"]], "Feature add-ons": [[1, "feature-add-ons"]], "alias": [[1, "alias"]], "power": [[1, "power"], [3, "power"]], "threshold": [[1, "threshold"]], "lower_threshold": [[1, "lower-threshold"]], "upper_threshold": [[1, "upper-threshold"]], "gt_threshold": [[1, "gt-threshold"]], "btwn_threshold": [[1, "btwn-threshold"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "Aggregate Features": [[1, "aggregate-features"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "exact_mult": [[1, "exact-mult"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "Configuration": [[2, "configuration"]], "Basic Config File": [[2, "basic-config-file"]], "Advanced Config File": [[2, "advanced-config-file"]], "Top level configs": [[2, "top-level-configs"]], "Data sources": [[2, "data-sources"]], "Filter": [[2, "filter"]], "Column Mappings": [[2, "column-mappings"]], "Substitution Columns": [[2, "substitution-columns"]], "Feature Selections": [[2, "feature-selections"]], "Potential Matches Universe": [[2, "potential-matches-universe"]], "Blocking": [[2, "blocking"]], "Comparisons": [[2, "comparisons"]], "Household Comparisons": [[2, "household-comparisons"]], "Comparison Features": [[2, "comparison-features"]], "Pipeline-generated Features": [[2, "pipeline-generated-features"]], "Training and models": [[2, "training-and-models"]], "Household training and models": [[2, "household-training-and-models"]], "Feature Selection transforms": [[3, "feature-selection-transforms"]], "bigrams": [[3, "bigrams"]], "array": [[3, "array"]], "union": [[3, "union"]], "soundex": [[3, "soundex"]], "Welcome to 
hlink\u2019s documentation!": [[4, "welcome-to-hlink-s-documentation"]], "Configuration API": [[4, "configuration-api"], [4, null]], "Installation": [[5, "installation"]], "Requirements": [[5, "requirements"]], "Installing from PyPI": [[5, "installing-from-pypi"]], "Installing from source": [[5, "installing-from-source"]], "Introduction": [[6, "introduction"]], "Overview": [[6, "overview"], [7, "overview"], [7, "id1"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"]], "Link Tasks": [[7, "link-tasks"]], "Preprocessing": [[7, "preprocessing"]], "Task steps": [[7, "task-steps"], [7, "id2"], [7, "id5"], [7, "id8"], [7, "id11"], [7, "id14"]], "Related Configuration Sections": [[7, "related-configuration-sections"], [7, "id3"], [7, "id6"], [7, "id9"], [7, "id12"], [7, "id15"]], "Training and Household Training": [[7, "training-and-household-training"]], "Matching": [[7, "matching"]], "Household Matching": [[7, "household-matching"]], "Model Exploration and Household Model Exploration": [[7, "model-exploration-and-household-model-exploration"]], "Reporting": [[7, "reporting"]], "Models": [[8, "models"]], "random_forest": [[8, "random-forest"]], "probit": [[8, "probit"]], "logistic_regression": [[8, "logistic-regression"]], "decision_tree": [[8, "decision-tree"]], "gradient_boosted_trees": [[8, "gradient-boosted-trees"]], "Pipeline generated features": [[9, "pipeline-generated-features"]], "Transformer types": [[9, "transformer-types"]], "interaction": [[9, "interaction"]], "bucketizer": [[9, "bucketizer"]], "Running hlink": [[10, "running-hlink"]], "Using hlink as a Library": [[10, "using-hlink-as-a-library"]], "Interactive Mode": [[10, "interactive-mode"]], "Starting the program": [[10, "starting-the-program"]], "Running Linking Tasks and Steps": [[10, "running-linking-tasks-and-steps"]], "Example interactive mode workflow": [[10, "example-interactive-mode-workflow"]], "Substitutions": [[11, "substitutions"]], "1:1 substitution by data table": [[11, 
"substitution-by-data-table"]], "Substitution by regex word replace": [[11, "substitution-by-regex-word-replace"]], "Advanced Workflow Examples": [[12, "advanced-workflow-examples"]], "Export training data after generating features to reuse in different linking years": [[12, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Example training data export with generated ML features": [[12, "example-training-data-export-with-generated-ml-features"]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[12, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Example model exploration and FP/FN export workflow": [[12, "example-model-exploration-and-fp-fn-export-workflow"]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["column_mappings", "comparison_types", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "filenames": ["column_mappings.md", "comparison_types.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "titles": ["Column Mappings", "Comparison types, transform add-ons, aggregate features, and household aggregate features", "Configuration", "Feature Selection transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "terms": {"each": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "read": [0, 1, 2, 7, 10], "from": [0, 1, 2, 4, 6, 7, 8, 9, 10, 12], "input": [0, 1, 2, 3, 6, 7, 10, 11], "dataset": [0, 1, 2, 6, 7, 10, 12], "hlink": [0, 1, 2, 5, 6, 7, 12], "It": [0, 1, 2, 6, 10, 12], "ha": [0, 1, 2, 6, 10, 12], "column_nam": [0, 1, 2, 11], "attribut": [0, 1, 2, 3, 7, 8, 9, 10, 11], "which": [0, 1, 2, 3, 6, 7, 9, 10, 12], "specifi": [0, 1, 2, 6, 7, 9, 10, 11], "name": [0, 1, 2, 10, 11], "both": [0, 1, 2, 7, 12], "option": [0, 1, 2, 3, 6, 7, 8, 10, 12], "mai": [0, 2, 6, 7, 10], "have": [0, 1, 2, 5, 6, 7, 8, 10, 12], "an": [0, 1, 2, 3, 6, 8, 10], "alia": [0, 2, 7], "give": [0, 2], "new": [0, 2, 12], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "support": [0, 7, 8], "some": [0, 1, 2, 3, 6, 7, 10], "make": [0, 1, 2, 5, 12], "chang": [0, 1, 2, 5, 10, 12], "data": [0, 1, 4, 6, 7, 10], "thei": [0, 1, 2, 7, 10], "ar": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12], "These": [0, 1, 2, 3, 6, 7, 8, 9], "clean": [0, 6], "harmon": 0, "The": [0, 1, 2, 3, 5, 7, 8, 9, 10, 12], "avail": [0, 1, 2, 3, 5, 7, 8, 9, 12], "list": [0, 1, 2, 3, 4, 8, 
10, 11], "below": [0, 1, 2, 3, 8, 9, 10], "section": [0, 1, 2, 12], "By": [0, 2, 10], "default": [0, 1, 2, 7, 8, 10], "must": [0, 1, 2, 8, 9, 11], "same": [0, 1, 2, 6, 7, 10], "With": [0, 9], "override_column_a": [0, 2, 3], "override_column_b": [0, 2, 3], "you": [0, 1, 2, 5, 10, 11, 12], "can": [0, 1, 2, 5, 6, 7, 8, 10, 12], "differ": [0, 1, 2, 4, 6, 7], "either": [0, 1, 2, 6, 11], "A": [0, 1, 2, 9, 10], "b": [0, 1, 2, 10], "when": [0, 1, 2, 3, 7, 12], "do": [0, 1, 3, 10, 12], "thi": [0, 1, 2, 5, 6, 7, 9, 10, 12], "appli": [0, 2, 3, 7, 12], "onli": [0, 1, 2, 7, 12], "non": 0, "overrid": [0, 2], "also": [0, 1, 2, 5, 6, 7, 9, 10, 12], "provid": [0, 2, 3, 6, 7, 9, 10], "override_transform": [0, 2], "describ": [0, 2, 10], "type": [0, 2, 3, 4, 7, 8, 10, 11, 12], "oper": [0, 2], "singl": [0, 2, 10, 12], "output": [0, 1, 2, 3, 6, 7, 10, 12], "more": [0, 1, 2, 9, 10, 12], "than": [0, 1, 2, 8], "one": [0, 1, 2, 7], "order": [0, 2, 7], "so": [0, 1, 2, 5, 12], "anoth": [0, 1, 3, 7], "format": 0, "letter": 0, "t": [0, 1, 2, 3, 12], "u": 0, "repres": [0, 1, 2, 3, 9, 10], "arbitrari": 0, "requir": [0, 1, 2, 3, 4, 7, 9, 10, 11], "addit": [0, 1, 2, 3, 5, 6, 10], "vari": [0, 2], "inform": [0, 1, 2, 10], "appear": [0, 1], "its": [0, 1, 6, 10], "suffix": 0, "mean": [0, 2], "two": [0, 1, 2, 3, 6, 7, 9, 10, 12], "link": [0, 1, 2, 4, 6, 8], "most": [0, 1, 7, 10], "independ": [0, 2], "For": [0, 1, 2, 7, 10, 12], "exampl": [0, 1, 2], "taken": [0, 1], "10": [0, 2, 5, 12], "year": [0, 1, 2, 3, 4], "apart": 0, "want": [0, 1, 2, 10, 12], "standard": [0, 1, 11], "ag": [0, 1, 2, 3], "variabl": [0, 1, 2, 12], "i": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12], "compar": [0, 1, 2, 6, 7], "between": [0, 1, 2, 6, 7, 10, 12], "To": [0, 1, 5, 7, 10], "could": [0, 2], "creat": [0, 2, 6, 7, 9, 10, 11, 12], "age_at_dataset_b": 0, "ad": [0, 1, 2], "column_map": [0, 2, 7], "valu": [0, 1, 2, 8, 9, 10, 11], "As": 0, "suppos": 0, "record": [0, 1, 2, 6, 7], "person": [0, 1, 6], "": [0, 1, 2, 6, 7, 10, 11], "first": 
[0, 1, 2, 5, 7, 10, 11], "string": [0, 1, 2, 3, 7, 8, 10, 11], "In": [0, 1, 6, 10, 12], "call": 0, "namefrst": [0, 1, 2], "entir": 0, "lowercas": 0, "first_nam": 0, "uppercas": 0, "follow": [0, 1, 6, 10, 11, 12], "configur": [0, 1, 6, 10, 12], "add": [0, 4], "_": [0, 1, 2, 3, 4, 8, 9, 10], "given": [0, 1, 2, 3, 8, 12], "numer": [0, 1], "11": [0, 2, 5, 9], "concat": 0, "concaten": [0, 1], "end": [0, 1, 2, 3, 11], "strip": [0, 7], "convert": [0, 1, 2], "alphabet": 0, "charact": 0, "lower": [0, 1], "case": [0, 1, 2, 3, 6], "white": 0, "space": [0, 2, 3, 11], "start": [0, 11], "ration": 0, "word": [0, 4], "replac": [0, 1, 4], "sinc": [0, 2], "peopl": [0, 1, 6, 10], "raw": [0, 2, 7, 10], "censu": [0, 7, 12], "contain": [0, 1, 11], "lead": 0, "better": [0, 6], "match": [0, 1, 4, 6, 10, 11, 12], "remov": 0, "qmark": 0, "hyphen": 0, "punctuat": 0, "apostroph": 0, "altern": [0, 2], "If": [0, 1, 2, 3, 7, 8, 10, 11], "surround": 0, "all": [0, 1, 2, 3, 7, 8, 9, 10], "them": [0, 1, 2, 7], "jr": [0, 2], "sr": [0, 2], "ii": [0, 2], "iii": [0, 2], "stop": 0, "last": [0, 1, 7, 9], "street": [0, 1], "avenu": [0, 11], "blvd": 0, "circl": 0, "court": 0, "road": 0, "prefix": 0, "like": [0, 2, 7, 10], "m": [0, 1], "mr": 0, "ah": 0, "chines": 0, "condens": 0, "whitespac": [0, 7], "take": [0, 1, 2, 3, 7, 10], "leav": 0, "behind": 0, "arrai": [0, 2, 4, 8, 9], "namefrst_split": [0, 2], "namefrst_clean": [0, 2], "index": [0, 5], "select": [0, 1, 4, 6, 10, 12], "element": 0, "posit": [0, 1, 2, 4, 6], "second": [0, 1, 2, 11], "1": [0, 1, 2, 4, 7, 8, 9, 10, 12], "item": 0, "set": [0, 1, 2, 3, 6, 7, 10, 12], "Then": [0, 5], "0": [0, 1, 2, 7, 8, 9, 10, 12], "initi": [0, 1, 10], "probabl": [0, 2, 8], "middl": [0, 1], "namefrst_mid_init": [0, 1], "multipl": [0, 1, 2, 10], "otherwis": [0, 1, 9, 12], "known": 0, "recod": 0, "birthyr": [0, 2], "clean_birthyr": [0, 2, 3], "9999": [0, 2, 9], "1999": [0, 2], "9998": 0, "divid": 0, "int": [0, 1, 2, 3, 8], "integ": [0, 1, 2, 9], "result": [0, 1, 6, 9, 10, 
12], "instanc": [0, 8], "birthplac": 0, "detail": [0, 2, 10], "version": [0, 5, 12], "gener": [0, 1, 4, 6, 7, 10], "least": [0, 1], "signific": 0, "digit": 0, "we": [0, 1, 10, 12], "simpli": [0, 2], "drop": [0, 2, 10], "100": [0, 2, 12], "round": [0, 2], "lowest": 0, "whole": [0, 6], "number": [0, 1, 2, 7, 8, 10], "floor": 0, "function": [0, 1, 2, 6, 10], "bpl": [0, 1, 2], "bpl_root": 0, "condit": [0, 1, 2, 3, 4, 7], "logic": 0, "work": [0, 1, 2, 5, 7, 10, 12], "sql": [0, 1, 2, 3, 4, 7, 10], "express": [0, 1, 2], "claus": [0, 1], "if_valu": 0, "else_valu": 0, "race": [0, 1, 2, 9, 12], "ipum": [0, 6], "code": [0, 1, 2, 5], "categori": [0, 8], "get": [0, 1, 10], "down": [0, 6, 12], "nearest": 0, "produc": [0, 10], "relat": [0, 1, 2], "hundr": 0, "300": 0, "child": [0, 8], "household": [0, 4, 6, 8, 10, 12], "head": 0, "301": 0, "302": 0, "adopt": 0, "303": 0, "step": [0, 1, 2, 6], "usual": [0, 7, 12], "need": [0, 1, 2, 7, 10, 12], "2": [0, 1, 2, 3, 7, 8, 11, 12], "spous": 0, "3": [0, 1, 2, 5, 7, 8, 9, 12], "4": [0, 1, 8], "law": 0, "5": [0, 1, 2, 8, 9, 10, 12], "parent": [0, 1, 11], "6": [0, 2, 8, 9, 12], "7": [0, 1, 2, 8, 12], "sibl": 0, "12": [0, 5], "relate_div_100": [0, 1, 2], "page": [1, 2, 10], "comparison_featur": [1, 2, 7], "along": 1, "header": [1, 2, 3, 9, 11], "context": [1, 3, 9], "relatematch": [1, 2], "comparison_typ": [1, 2], "categor": [1, 2, 8, 9], "true": [1, 2, 3, 7, 9, 11, 12], "maximum": [1, 8], "jaro": [1, 9], "winkler": [1, 9], "find": [1, 7, 12], "greatest": 1, "among": 1, "cartesian": 1, "product": [1, 6, 12], "column": [1, 3, 4, 7, 9, 10, 11, 12], "namelast": [1, 2], "would": [1, 2, 12], "return": [1, 3, 8, 10], "four": 1, "namefrst_a": 1, "namefrst_b": 1, "namelast_b": 1, "namelast_a": 1, "maximum_jw": 1, "score": [1, 2, 7, 9], "namefrst_jw": [1, 2, 12], "geograph": 1, "filter": [1, 4, 7, 11], "major": [1, 10], "locat": [1, 2, 10], "befor": [1, 2, 3, 5, 7], "boundari": 1, "zero": 1, "jw_street": 1, "enum_dist": 1, "max": [1, 8, 10], 
"member": [1, 7], "neighborhood": 1, "surnam": 1, "related_individual_max_jw": 1, "namefrst_rel": 1, "assert": [1, 10], "NOT": 1, "distinct": 1, "f1": 1, "evalu": [1, 2, 6, 7, 8], "ani": [1, 2, 5, 8], "potenti": [1, 4, 7], "mismatch": 1, "queri": [1, 2], "fi": 1, "OR": 1, "mi0": 1, "mi1": 1, "THEN": 1, "els": [1, 2, 3], "first_init_col": 1, "namefrst_init": 1, "mid_init_col": 1, "namefrst_mid_init_2": 1, "f2": 1, "empti": 1, "null": [1, 2, 3], "AND": [1, 2], "individu": [1, 2, 7, 12], "mainli": 1, "caution": [1, 9], "flag": [1, 9, 10, 12], "f": [1, 10], "sp": 1, "m_caution": [1, 2, 9, 12], "mbpl": 1, "mother_birthyr": 1, "stepmom": 1, "momloc": 1, "comp_a": [1, 2], "comp_b": [1, 2], "comp_c": 1, "parent_step_chang": 1, "comp_d": 1, "check": [1, 10], "sign": 1, "boolean": [1, 2, 3, 11, 12], "form": [1, 7, 11], "cast": 1, "col": 1, "namelast_equal_as_int": 1, "namelast_clean": [1, 2, 3], "whether": [1, 2, 11], "join": [1, 11], "across": 1, "being": [1, 7], "exact": [1, 2], "namefrst_unstd": [1, 2], "present": [1, 2, 9], "nonzero": 1, "primarili": [1, 7], "indic": [1, 12], "kind": 1, "incompar": 1, "akin": 1, "miss": [1, 10], "see": [1, 2, 5, 10, 12], "univers": [1, 4, 7], "similar": 1, "fbpl_nomatch": 1, "fbpl": 1, "allow": [1, 2, 7, 12], "up": [1, 2, 10, 11], "sub": 1, "object": [1, 2, 6, 10], "document": [1, 8, 10, 12], "sp_caution": [1, 2, 12], "spouse_bpl": 1, "spouse_birthyr": 1, "durmarr": [1, 2], "new_marr": [1, 2], "street_jw": [1, 2, 12], "counti": 1, "statefip": [1, 2], "9": 1, "multipli": 1, "togeth": [1, 2], "after": [1, 2, 4, 8, 10], "float": [1, 2, 8], "comp": 1, "c": 1, "sploc": 1, "012": 1, "fals": [1, 2, 3, 4, 6, 10], "d": 1, "under": [1, 2], "specif": [1, 2, 10], "circumst": 1, "should": [1, 2, 8, 9, 10], "mid_init_match": 1, "either_1": 1, "nativ": 1, "either_0": 1, "gen": 1, "imm": [1, 2, 12], "immigr": 1, "look": [1, 10, 11], "foreign": 1, "born": 1, "sgen": [1, 2, 12], "rel": [1, 2, 12], "scala": 1, "determin": [1, 7], "greater": [1, 5], 
"jw_threshold": 1, "less": [1, 2], "age_threshold": 1, "sex": [1, 2, 11], "sampl": 1, "related_individual_row": 1, "unrel": 1, "depend": [1, 2, 5, 12], "name_col": 1, "birthyr_col": 1, "namefrst_related_row": 1, "replaced_birthyr": [1, 2, 3], "extra": 1, "children": 1, "who": 1, "base": [1, 2, 7], "expect": 1, "count": [1, 10, 12], "suspect": [1, 6], "relate_col": 1, "histid_col": 1, "id": [1, 2], "birth": 1, "year_b": 1, "wa": [1, 12], "minimum": [1, 8], "accept": [1, 2, 12], "consid": [1, 8], "histid": [1, 2, 12], "1910": [1, 2, 12], "8": [1, 2, 5, 10], "rate": 1, "calcul": [1, 12], "percentag": 1, "seen": 1, "neighbor": 1, "meet": 1, "95": 1, "nbor": [1, 2, 12], "namelast_neighbor": 1, "05": [1, 2], "namelast_popularity_sum": 1, "namelast_popular": 1, "length": [1, 2, 9], "size": 1, "ab": 1, "diff": 1, "absolut": 1, "invalid": [1, 8], "instead": [1, 2, 5, 7], "marriag": 1, "durat": 1, "99": [1, 2], "placehold": 1, "unknown": 1, "exclud": 1, "those": [1, 2], "consider": 1, "byrdiff": [1, 2, 12], "mardurmatch": [1, 2], "14": 1, "minu": [1, 2], "subtract": 1, "geo": 1, "distanc": [1, 8], "lookup": 1, "tabl": [1, 2, 4, 7, 10, 12], "core": [1, 7, 10, 12], "dist_tabl": 1, "py": [1, 2], "There": [1, 2, 3, 7], "sever": [1, 6], "wai": [1, 5, 10], "file": [1, 4, 6, 7, 10, 11, 12], "kei": [1, 7, 10], "key_count": 1, "secondari": 1, "serv": 1, "back": 1, "primari": [1, 6], "doe": [1, 7, 12], "particularli": 1, "state": [1, 6], "much": [1, 7], "fewer": [1, 8], "combin": [1, 2, 3, 7], "thu": 1, "risk": 1, "fill": 1, "aren": 1, "ex": 1, "just": [1, 2, 10, 12], "even": 1, "though": 1, "distances_fil": 1, "path": [1, 2, 10, 11, 12], "table_nam": 1, "what": [1, 2, 10, 12], "onc": [1, 10], "loc_a": 1, "where": [1, 7, 10, 12], "come": 1, "loc_b": 1, "distance_col": 1, "source_column_a": 1, "sourc": [1, 4, 7, 10, 12], "source_column_b": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b_0": 1, "loc_b_1": 1, "secondary_key_count": 1, "backup": 1, "secondary_table_nam": 1, 
"secondary_distances_fil": 1, "secondary_source_column": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_distance_col": 1, "state_dist": 1, "state_distance_lookup": 1, "county_state_dist": 1, "csv": [1, 2, 7, 10, 11, 12], "statecode1": 1, "statecode2": 1, "dist": 1, "county_dist": [1, 2, 12], "county_distance_lookup": 1, "county_1900_1910_distances_km": 1, "from_icpsrctyi": 1, "to_icpsrctyi": 1, "from_statefip": 1, "to_statefip": 1, "distance_km": 1, "state_1900_1910_distances_km": 1, "fetch": 1, "neither": 1, "nor": 1, "mpre": 1, "m_namefrst": 1, "accord": 1, "niu": 1, "other": [1, 2, 12], "mfbplmatch": 1, "multi": 1, "search": 1, "special": 1, "simplifi": 1, "particular": [1, 2], "constraint": 1, "num_col": 1, "whose": 1, "templat": 1, "n": [1, 8, 9], "per": [1, 2, 8, 9, 10], "current": [1, 2, 10], "respect": [1, 7], "jw_col_templ": 1, "jw": 1, "pair": [1, 12], "equal_and_not_null_templ": 1, "final": [1, 2, 12], "comput": [1, 3, 7], "_namefrst": 1, "_bpl": 1, "_sex": 1, "25": 1, "nvl": 1, "sm_namefrst": 1, "sn_namefrst": 1, "sm_bpl": 1, "sn_bpl": 1, "sm_sex": 1, "sn_sex": 1, "pass": [1, 2, 7, 8], "flexibl": 1, "user": [1, 10], "write": [1, 10, 12], "own": [1, 2], "favor": 1, "reason": 1, "good": 1, "fallback": 1, "defin": [1, 7, 8, 9, 10], "spark": [1, 2, 5, 8, 9, 10, 12], "builtin": 1, "argument": [1, 10, 12], "namelast_jw_max": 1, "namelast1": 1, "namelast2": 1, "namelast3": 1, "abov": [1, 5], "extend": 1, "beyond": 1, "top": [1, 4], "level": [1, 4, 10], "everi": 1, "jw_f": [1, 2, 12], "father_namefrst": 1, "rais": [1, 3], "exponenti": 1, "squar": 1, "county_distance_squar": [1, 2, 12], "county_a": 1, "county_b": 1, "upper": 1, "gt": 1, "btwn": 1, "addl": 1, "var": [1, 2], "program": [1, 2, 7, 12], "report": [1, 4, 6, 10], "addl_var": 1, "check_val_expr": 1, "else_v": 1, "volumn": 1, "datasourc": [1, 2, 10], "yrimmig": 1, "immyear_diff": [1, 2, 9, 12], "includ": [1, 2, 7, 9, 10], "train": [1, 4, 6, 8, 10], "independent_var": [1, 2, 12], "config": [1, 
4, 7, 10, 12], "id_column": [1, 2], "_a": 1, "mult": 1, "exist": [1, 2, 10], "within": [1, 2, 6, 10, 11], "hh_train": [1, 2, 7, 10, 12], "hh": 1, "highest": [1, 2], "against": [1, 11], "ten": [1, 2], "tell": 2, "how": [2, 7], "descript": [2, 8, 10], "refer": 2, "here": [2, 7, 10, 12], "tutori": [2, 10], "script": [2, 6, 10], "discuss": 2, "readm": 2, "note": 2, "written": [2, 6], "toml": [2, 6, 10], "abl": 2, "json": [2, 10], "datasource_a": [2, 7], "datasource_b": [2, 7], "transform": [2, 4, 6, 7], "lowercase_strip": 2, "add_to_a": 2, "age_2": 2, "derived_from": 2, "expand_length": 2, "explod": [2, 7], "jaro_winkl": 2, "namelast_jw": [2, 12], "threshold": [2, 8, 12], "feature_nam": 2, "79": 2, "84": 2, "complex": 2, "machin": [2, 6, 7, 10, 12], "learn": [2, 6, 7, 10, 12], "probabilist": [2, 6], "drop_data_from_scored_match": 2, "us1900": 2, "us1900m_usa": 2, "p": 2, "parquet": [2, 7], "us1910": 2, "us1910m_usa": 2, "training_data_subset": 2, "serialp": 2, "rationalize_name_word": 2, "remove_qmark_hyphen": 2, "replace_apostroph": 2, "remove_suffix": 2, "remove_alternate_nam": 2, "condense_strip_whitespac": 2, "split": [2, 3, 7, 8, 9, 12], "namefrst_std": [2, 11], "array_index": 2, "bpl_orig": 2, "divide_by_int": 2, "get_floor": 2, "statefip_h": 2, "output_typ": 2, "substitution_column": [2, 7, 11], "join_column": [2, 11], "join_valu": [2, 11], "substitution_fil": [2, 11], "name_std": [2, 11], "male": [2, 11], "femal": [2, 11], "feature_select": [2, 3, 7], "input_column": [2, 3, 9], "output_column": [2, 3, 9], "sql_condit": 2, "namelast_bigram": 2, "bigram": [2, 4], "bpl_clean": 2, "bpl_str": 2, "washington": 2, "bpl2_str": 2, "53": 2, "region": [2, 12], "attach_vari": 2, "region_dict": 2, "col_to_join_on": 2, "col_to_add": 2, "null_fil": 2, "col_typ": 2, "potential_matches_univers": [2, 7], "birthyr_3": 2, "namefrst_std_jw": [2, 12], "75": [2, 8, 12], "comparis": 2, "post": [2, 7], "hh_comparison": [2, 7], "threshold_expr": 2, "fetch_a": 2, "sex_equ": 2, "equal": 
[2, 11], "relate_a": [2, 9], "pipeline_featur": [2, 7, 9], "sex_region_interact": 2, "transformer_typ": [2, 9], "interact": [2, 4, 7, 12], "relatetyp": [2, 9], "bucket": [2, 7], "hit": [2, 10, 12], "scale_data": [2, 12], "training_data": [2, 10], "dependent_var": [2, 12], "score_with_model": [2, 12], "use_training_data_featur": [2, 7, 12], "split_by_id_a": [2, 12], "decis": [2, 4, 8, 12], "drop_duplicate_with_threshold_ratio": [2, 12], "n_training_iter": [2, 7, 12], "output_suspicious_td": [2, 12], "param_grid": [2, 12], "model_paramet": [2, 7, 8, 12], "random_forest": [2, 12], "maxdepth": [2, 8, 12], "numtre": [2, 8, 12], "005": 2, "threshold_ratio": [2, 8, 12], "logistic_regress": [2, 12], "50": [2, 12], "65": 2, "80": 2, "chosen_model": [2, 8, 12], "prediction_col": 2, "predict": [2, 12], "hh_col": 2, "hh_training_data_1900_1910": 2, "probit": [2, 4], "go": [2, 10], "your": [2, 5, 7, 10, 12], "uniqu": 2, "identifi": [2, 6, 12], "full": [2, 7, 12], "short": 2, "alphanumer": 2, "convert_ints_to_long": 2, "automat": [2, 5, 7], "long": [2, 11], "especi": 2, "assum": 2, "schema": 2, "sometim": 2, "term": 2, "bigint": 2, "thing": 2, "my_fil": 2, "subset": [2, 11], "limit": 2, "extract": 2, "modifi": 2, "meant": 2, "usag": [2, 4, 10], "set_value_column_a": [2, 3], "liter": 2, "set_value_column_b": [2, 3], "iv": 2, "v": 2, "vi": 2, "vii": 2, "viii": 2, "namelast_clean_bigram": [2, 3], "fed": [2, 7], "prep": 2, "df": [2, 10], "men": 2, "newli": 2, "attempt": 2, "duplic": [2, 8], "row": 2, "conjuct": 2, "Will": 2, "conjunct": 2, "rang": [2, 9], "original_valu": 2, "plu": 2, "1870": 2, "expand": 2, "1867": 2, "1868": 2, "1869": 2, "1871": 2, "1872": 2, "1873": 2, "kept": 2, "keep": 2, "appropri": 2, "treat": [2, 9], "import": [2, 7, 10, 12], "dure": [2, 7], "hot": 2, "encod": [2, 3], "vector": [2, 9], "stage": 2, "well": 2, "upper_threshold": 2, "cannot": 2, "robust": 2, "ml": [2, 4, 8, 9], "typic": [2, 7], "leverag": 2, "api": [2, 6, 9], "piplin": 2, "regionf": 2, 
"sex_regionf_interact": 2, "immyear_caut": [2, 9], "myriad": 2, "explor": [2, 4, 6, 10], "part": [2, 7], "task": [2, 4, 6, 8, 12], "drop_duplicate_a": 2, "time": [2, 7, 10], "out": [2, 7, 12], "best": [2, 7], "smallest": 2, "possibl": 2, "ratio": [2, 8], "beta": [2, 8], "test": [2, 7, 12], "model_explor": [2, 10, 12], "hyper": [2, 6, 12], "paramet": [2, 6, 7, 8, 10, 12], "eval": 2, "skip": [2, 7], "apply_model": 2, "run_all_step": [2, 10, 12], "command": [2, 6, 10, 12], "try": 2, "creation": 2, "iter": 2, "scale": 2, "error": [2, 9], "1900": [2, 12], "about": [2, 10, 12], "1930": [2, 12], "1940": [2, 12], "fail": 2, "were": 2, "sure": [2, 5, 10], "scratch": 2, "although": 2, "know": 2, "haven": 2, "save": [2, 7, 12], "small": 2, "amount": 2, "process": [2, 6, 10], "repeatedli": 2, "help": [2, 7, 10], "neg": [2, 4, 6], "area": 2, "coverag": 2, "increas": [2, 9], "represent": [2, 7], "ensur": 2, "group": [2, 7], "a304bt": 2, "three": [2, 7], "b200": 2, "c201": 2, "d425": 2, "perform": [2, 6, 7, 11], "feature_import": [2, 7, 12], "coeffici": [2, 7], "enabl": [2, 7, 10], "srace": [2, 9, 12], "race_interacted_srac": [2, 9, 12], "hits2": [2, 12], "exact_mult": [2, 12], "ncount": [2, 3, 12], "ncount2": [2, 3, 12], "f_interacted_jw_f": [2, 12], "f_caution": [2, 12], "f_pre": [2, 12], "fbplmatch": [2, 12], "m_interacted_jw_m": [2, 9, 12], "jw_m": [2, 9, 12], "m_pre": [2, 9, 12], "mbplmatch": [2, 12], "sp_interacted_jw_sp": [2, 12], "jw_sp": [2, 12], "sp_pre": [2, 12], "mi": [2, 12], "fsoundex": [2, 12], "lsoundex": [2, 12], "oth": [2, 12], "imm_interacted_immyear_caut": [2, 12], "1900_1910_training_data_20191023": 2, "jw_max_a": 2, "jw_max_b": 2, "f1_match": 2, "f2_match": 2, "byrdifcat": 2, "racematch": 2, "bplmatch": 2, "imm_interacted_bplmatch": 2, "sexmatch": 2, "relatetype_interacted_relatematch": 2, "checkpoint": 3, "no_first_pad": 3, "don": 3, "prepend": 3, "namefrst_unstd_bigram": 3, "namelast_frst_bigram": 3, "namelast_clean_soundex": 3, "input_col": 3, 
"output_col": 3, "expon": 3, "introduct": 4, "overview": 4, "instal": 4, "pypi": 4, "preprocess": [4, 6, 10, 12], "model": [4, 6, 10], "run": [4, 5, 6, 7, 12], "librari": [4, 6], "mode": [4, 5, 12], "advanc": 4, "workflow": 4, "export": [4, 7, 10], "featur": [4, 6, 7, 8, 10], "reus": 4, "basic": 4, "map": [4, 7, 9], "substitut": [4, 7], "block": [4, 7], "comparison": [4, 7], "pipelin": 4, "ons": 4, "aggreg": 4, "union": 4, "soundex": 4, "power": 4, "regex": 4, "random": [4, 8], "forest": [4, 8], "logist": [4, 8], "regress": [4, 8], "tree": [4, 8], "gradient": [4, 8], "boost": [4, 8], "system": 5, "python": [5, 6, 10], "java": 5, "integr": 5, "apach": 5, "via": [5, 6], "pyspark": [5, 8, 9, 10], "packag": 5, "org": 5, "latest": 5, "pip": 5, "easiest": [5, 10], "through": [5, 7, 9, 10], "instruct": [5, 10], "But": 5, "clone": 5, "github": 5, "repositori": 5, "root": 5, "project": 5, "directori": [5, 10, 12], "develop": [5, 6], "e": 5, "dev": 5, "edit": 5, "made": 5, "built": 5, "tool": [5, 6], "line": [6, 10], "share": 6, "characterist": [6, 7], "correspond": [6, 7], "real": 6, "world": 6, "determinist": [6, 7], "rule": [6, 7], "algorithm": [6, 7], "At": [6, 7], "been": 6, "unit": 6, "census": 6, "hierarch": [6, 10], "structur": 6, "nest": 6, "howev": [6, 12], "tailor": 6, "ignor": 6, "common": [6, 7, 12], "highli": [6, 7], "languag": 6, "further": [6, 12], "broken": 6, "smaller": 6, "sequenc": 6, "linkrun": [6, 10], "prepar": [6, 7, 10], "research": 6, "experi": 6, "understand": 6, "tune": [6, 12], "relationship": 6, "varieti": 7, "normal": 7, "abbrevi": [7, 11], "regist": [7, 10], "datafram": [7, 10, 12], "request": 7, "classif": [7, 8], "metadata": 7, "introspect": 7, "ingest": 7, "inspect": 7, "mani": [7, 10], "aspect": [7, 10], "extens": 7, "longest": 7, "definit": 7, "reduc": 7, "drastic": 7, "improv": 7, "runtim": 7, "separ": 7, "total": 7, "potential_match": [7, 10], "satisfi": 7, "elig": 7, "reshap": 7, "thought": 7, "ahead": 7, "chosen": 7, "experiment": [7, 
10], "focus": 7, "demograph": 7, "moment": 7, "veri": [7, 12], "anyon": 7, "percent": 7, "remain": 7, "popul": 7, "pull": 7, "fix": 7, "width": 7, "crosswalk": 7, "construct": 7, "alpha": 8, "hyperparamet": [8, 12], "de": 8, "param": [8, 12], "label": 8, "doc": [8, 9], "commonli": 8, "explan": 8, "randomforestclassifi": 8, "depth": 8, "20": 8, "featuresubsetstrategi": 8, "node": 8, "auto": 8, "onethird": 8, "sqrt": 8, "log2": 8, "15": 8, "generalizedlinearregress": 8, "famili": 8, "binomi": 8, "85": [8, 10], "logisticregress": 8, "decisiontreeclassifi": 8, "mininstancespernod": 8, "caus": 8, "left": 8, "right": [8, 10], "discard": 8, "maxbin": 8, "bin": 8, "discret": 8, "continu": [8, 9, 12], "gbtclassifi": 8, "mother": 9, "point": [9, 12], "x": [9, 10], "y": 9, "hold": 9, "except": 9, "strictli": 9, "inf": 9, "explicitli": 9, "cover": 9, "doubl": 9, "outsid": 9, "job": 10, "high": 10, "class": 10, "handl": 10, "main": 10, "complet": 10, "access": [10, 12], "link_run": 10, "factori": 10, "sparkfactori": 10, "load_config": 10, "load_conf_fil": 10, "sparksess": 10, "now": 10, "let": 10, "load": 10, "our": 10, "my_conf": 10, "lr": 10, "prep_step": 10, "get_step": 10, "enumer": 10, "print": 10, "input_table_nam": 10, "output_table_nam": 10, "run_step": 10, "get_tabl": 10, "matches_df": 10, "hh_model_explor": 10, "method": [10, 12], "interfac": 10, "easili": 10, "conveni": 10, "adjust": 10, "set_loc": 10, "set_num_cor": 10, "set_executor_memori": 10, "5g": 10, "ll": 10, "dictionari": 10, "often": 10, "modul": 10, "pleas": 10, "reproduc": 10, "consol": 10, "cpu": 10, "h": 10, "executor_memori": [10, 12], "execute_task": 10, "execute_command": 10, "conf": [10, 12], "show": 10, "messag": 10, "exit": 10, "memori": 10, "executor": 10, "begin": 10, "execut": 10, "seri": 10, "excute_command": 10, "filepath": 10, "sai": 10, "fullcount_1870_1880": 10, "pattern": 10, "full_count_1870_1880": 10, "prompt": 10, "enter": 10, "text": 10, "unstabl": 10, "topic": 10, "analyz": [10, 12], 
"set_preexisting_t": 10, "x_persist": 10, "borrow_t": 10, "get_task": 10, "set_print_sql": 10, "x_sql": 10, "x_sqlf": 10, "ipython": 10, "showf": 10, "x_summari": 10, "desc": 10, "x_crosswalk": 10, "x_tab": 10, "q": [10, 12], "x_hh_tfam": 10, "x_tfam": 10, "drop_al": 10, "reload": 10, "x_hh_tfam_2a": 10, "x_tfam_raw": 10, "drop_all_prc": 10, "x_hh_tfam_2b": 10, "x_union": 10, "drop_all_temp": 10, "x_load": 10, "get_set": 10, "set_link_task": 10, "x_parquet_from_csv": 10, "organ": 10, "hierarchi": 10, "five": 10, "hh_match": 10, "someth": 10, "choic": 10, "preexist": 10, "prepped_df_a": 10, "prepped_df_b": 10, "raw_df_b": 10, "raw_df_a": 10, "training_featur": [10, 12], "scored_potential_match": 10, "potential_matches_prep": 10, "exploded_df_b": 10, "exploded_df_a": 10, "predicted_match": 10, "hh_training_featur": [10, 12], "hh_training_data": 10, "hh_predicted_match": 10, "hh_scored_potential_match": 10, "hh_potential_match": 10, "hh_blocked_match": 10, "hh_potential_matchs_prep": 10, "model_eval_training_vector": 10, "model_eval_training_data": 10, "model_eval_repeat_fp": 10, "model_eval_training_featur": 10, "model_eval_training_result": 10, "model_eval_repeat_fn": 10, "hh_model_eval_training_vector": 10, "hh_model_eval_repeat_fp": 10, "hh_model_eval_repeat_fn": 10, "hh_model_eval_training_result": 10, "hh_model_eval_training_featur": 10, "hh_model_eval_training_data": 10, "persist": 10, "hidden": 10, "intermedi": 10, "yet": 10, "databas": 10, "tablenam": 10, "istemporari": 10, "task_nam": 10, "num": 10, "finish": 10, "put": [10, 12], "launch": [10, 12], "my": [10, 12], "subhead": 11, "suppli": 11, "regex_word_replac": 11, "variant": 11, "av": 11, "7th": 11, "swap": 11, "still": 11, "anywher": 11, "proceed": 11, "street_unstd": 11, "dir": 11, "substitutions_street_abbrev": 11, "span": 12, "1920": 12, "deriv": 12, "necessari": 12, "scenario": 12, "copi": 12, "use_potential_matches_featur": 12, "full_count_1900_1910": 12, "50g": 12, "ask": 12, "arg": 12, "partit": 
12, "training_data_1900_1910_hlink_featur": 12, "might": 12, "shut": 12, "framework": 12, "etc": 12, "relev": 12, "matrix": 12, "implement": 12, "regular": 12, "training_data_1900_1910": 12, "weren": 12, "ident": 12, "manual": 12, "updat": 12, "isn": 12, "analysi": 12, "training_result": 12, "hh_training_result": 12, "1900_1910_training_result": 12, "repeat_fp": 12, "repeat_fn": 12, "hh_repeat_fp": 12, "hh_repeat_fn": 12, "1900_1910_potential_fp": 12, "1900_1910_potential_fn": 12, "prefer": 12, "ve": 12}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"column": [0, 2], "map": [0, 2], "basic": [0, 2], "usag": 0, "advanc": [0, 2, 12], "transform": [0, 1, 3, 9], "add_to_a": 0, "concat_to_a": 0, "concat_to_b": 0, "lowercase_strip": 0, "rationalize_name_word": 0, "remove_qmark_hyphen": 0, "remove_punctu": 0, "replace_apostroph": 0, "remove_alternate_nam": 0, "remove_suffix": 0, "remove_stop_word": 0, "remove_prefix": 0, "condense_strip_whitespac": 0, "remove_one_letter_nam": 0, "split": 0, "array_index": 0, "substr": 0, "divide_by_int": 0, "when_valu": 0, "get_floor": 0, "comparison": [1, 2], "type": [1, 9], "add": 1, "ons": 1, "aggreg": 1, "featur": [1, 2, 3, 9, 12], "household": [1, 2, 7], "maximum_jaro_winkl": 1, "jaro_winkl": 1, "jaro_winkler_street": 1, "max_jaro_winkl": 1, "equal": 1, "f1_match": 1, "f2_match": 1, "not_equ": 1, "equals_as_int": 1, "all_equ": 1, "not_zero_and_not_equ": 1, "time": 1, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "any_equ": 1, "either_are_1": 1, "either_are_0": 1, "second_gen_imm": 1, "rel_jaro_winkl": 1, "extra_children": 1, "jaro_winkler_r": 1, "sum": 1, "length_b": 1, "abs_diff": 1, "b_minus_a": 1, "geo_dist": 1, "fetch_a": 1, "fetch_b": 1, "present_both_year": 1, "neither_are_nul": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_and_equal_categorical_in_univers": 1, "multi_jaro_winkler_search": 1, "sql_condit": [1, 3], "alia": 1, "power": [1, 
3], "threshold": 1, "lower_threshold": 1, "upper_threshold": 1, "gt_threshold": 1, "btwn_threshold": 1, "look_at_addl_var": 1, "hit": 1, "hits2": 1, "exact_mult": 1, "jw_max_a": 1, "jw_max_b": 1, "configur": [2, 4, 7], "config": 2, "file": 2, "top": 2, "level": 2, "data": [2, 11, 12], "sourc": [2, 5], "filter": 2, "substitut": [2, 11], "select": [2, 3], "potenti": [2, 12], "match": [2, 7], "univers": 2, "block": 2, "pipelin": [2, 9], "gener": [2, 9, 12], "train": [2, 7, 12], "model": [2, 7, 8, 12], "bigram": 3, "arrai": 3, "union": 3, "soundex": 3, "welcom": 4, "hlink": [4, 10], "": 4, "document": 4, "api": 4, "instal": 5, "requir": 5, "from": 5, "pypi": 5, "introduct": 6, "overview": [6, 7], "link": [7, 10, 12], "task": [7, 10], "preprocess": 7, "step": [7, 10], "relat": 7, "section": 7, "explor": [7, 12], "report": 7, "random_forest": 8, "probit": 8, "logistic_regress": 8, "decision_tre": 8, "gradient_boosted_tre": 8, "interact": [9, 10], "bucket": 9, "run": 10, "us": 10, "librari": 10, "mode": 10, "start": 10, "program": 10, "exampl": [10, 12], "workflow": [10, 12], "1": 11, "tabl": 11, "regex": 11, "word": 11, "replac": 11, "export": 12, "after": 12, "reus": 12, "differ": 12, "year": 12, "ml": 12, "list": 12, "fals": 12, "posit": 12, "neg": 12, "fp": 12, "fn": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 60}, "alltitles": {"Column Mappings": [[0, "column-mappings"], [2, "column-mappings"]], "Basic Usage": [[0, "basic-usage"]], "Advanced Usage": [[0, "advanced-usage"]], "Transforms": [[0, "transforms"]], "add_to_a": [[0, "add-to-a"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "lowercase_strip": [[0, "lowercase-strip"]], "rationalize_name_words": [[0, "rationalize-name-words"]], 
"remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_punctuation": [[0, "remove-punctuation"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_suffixes": [[0, "remove-suffixes"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_prefixes": [[0, "remove-prefixes"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "split": [[0, "split"]], "array_index": [[0, "array-index"]], "mapping": [[0, "mapping"]], "substring": [[0, "substring"]], "divide_by_int": [[0, "divide-by-int"]], "when_value": [[0, "when-value"]], "get_floor": [[0, "get-floor"]], "Comparison types, transform add-ons, aggregate features, and household aggregate features": [[1, "comparison-types-transform-add-ons-aggregate-features-and-household-aggregate-features"]], "Comparison types": [[1, "comparison-types"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "equals": [[1, "equals"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "not_equals": [[1, "not-equals"]], "equals_as_int": [[1, "equals-as-int"]], "all_equals": [[1, "all-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "and": [[1, "and"]], "times": [[1, "times"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "any_equals": [[1, "any-equals"]], "either_are_1": [[1, "either-are-1"]], "either_are_0": [[1, "either-are-0"]], "second_gen_imm": [[1, "second-gen-imm"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "extra_children": [[1, "extra-children"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "sum": [[1, "sum"]], "length_b": [[1, 
"length-b"]], "abs_diff": [[1, "abs-diff"]], "b_minus_a": [[1, "b-minus-a"]], "geo_distance": [[1, "geo-distance"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "present_both_years": [[1, "present-both-years"]], "neither_are_null": [[1, "neither-are-null"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "sql_condition": [[1, "sql-condition"], [3, "sql-condition"]], "Feature add-ons": [[1, "feature-add-ons"]], "alias": [[1, "alias"]], "power": [[1, "power"], [3, "power"]], "threshold": [[1, "threshold"]], "lower_threshold": [[1, "lower-threshold"]], "upper_threshold": [[1, "upper-threshold"]], "gt_threshold": [[1, "gt-threshold"]], "btwn_threshold": [[1, "btwn-threshold"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "Aggregate Features": [[1, "aggregate-features"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "exact_mult": [[1, "exact-mult"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "Configuration": [[2, "configuration"]], "Basic Config File": [[2, "basic-config-file"]], "Advanced Config File": [[2, "advanced-config-file"]], "Top level configs": [[2, "top-level-configs"]], "Data sources": [[2, "data-sources"]], "Filter": [[2, "filter"]], "Substitution Columns": [[2, "substitution-columns"]], "Feature Selections": [[2, "feature-selections"]], "Potential Matches Universe": [[2, "potential-matches-universe"]], "Blocking": [[2, "blocking"]], "Comparisons": [[2, "comparisons"]], "Household Comparisons": [[2, "household-comparisons"]], "Comparison Features": [[2, "comparison-features"]], "Pipeline-generated Features": [[2, "pipeline-generated-features"]], "Training and models": [[2, "training-and-models"]], 
"Household training and models": [[2, "household-training-and-models"]], "Feature Selection transforms": [[3, "feature-selection-transforms"]], "bigrams": [[3, "bigrams"]], "array": [[3, "array"]], "union": [[3, "union"]], "soundex": [[3, "soundex"]], "Welcome to hlink\u2019s documentation!": [[4, "welcome-to-hlink-s-documentation"]], "Configuration API": [[4, "configuration-api"], [4, null]], "Installation": [[5, "installation"]], "Requirements": [[5, "requirements"]], "Installing from PyPI": [[5, "installing-from-pypi"]], "Installing from source": [[5, "installing-from-source"]], "Introduction": [[6, "introduction"]], "Overview": [[6, "overview"], [7, "overview"], [7, "id1"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"]], "Link Tasks": [[7, "link-tasks"]], "Preprocessing": [[7, "preprocessing"]], "Task steps": [[7, "task-steps"], [7, "id2"], [7, "id5"], [7, "id8"], [7, "id11"], [7, "id14"]], "Related Configuration Sections": [[7, "related-configuration-sections"], [7, "id3"], [7, "id6"], [7, "id9"], [7, "id12"], [7, "id15"]], "Training and Household Training": [[7, "training-and-household-training"]], "Matching": [[7, "matching"]], "Household Matching": [[7, "household-matching"]], "Model Exploration and Household Model Exploration": [[7, "model-exploration-and-household-model-exploration"]], "Reporting": [[7, "reporting"]], "Models": [[8, "models"]], "random_forest": [[8, "random-forest"]], "probit": [[8, "probit"]], "logistic_regression": [[8, "logistic-regression"]], "decision_tree": [[8, "decision-tree"]], "gradient_boosted_trees": [[8, "gradient-boosted-trees"]], "Pipeline generated features": [[9, "pipeline-generated-features"]], "Transformer types": [[9, "transformer-types"]], "interaction": [[9, "interaction"]], "bucketizer": [[9, "bucketizer"]], "Running hlink": [[10, "running-hlink"]], "Using hlink as a Library": [[10, "using-hlink-as-a-library"]], "Interactive Mode": [[10, "interactive-mode"]], "Starting the program": [[10, 
"starting-the-program"]], "Running Linking Tasks and Steps": [[10, "running-linking-tasks-and-steps"]], "Example interactive mode workflow": [[10, "example-interactive-mode-workflow"]], "Substitutions": [[11, "substitutions"]], "1:1 substitution by data table": [[11, "substitution-by-data-table"]], "Substitution by regex word replace": [[11, "substitution-by-regex-word-replace"]], "Advanced Workflow Examples": [[12, "advanced-workflow-examples"]], "Export training data after generating features to reuse in different linking years": [[12, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Example training data export with generated ML features": [[12, "example-training-data-export-with-generated-ml-features"]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[12, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Example model exploration and FP/FN export workflow": [[12, "example-model-exploration-and-fp-fn-export-workflow"]]}, "indexentries": {}})
\ No newline at end of file
diff --git a/docs/substitutions.html b/docs/substitutions.html
index 95656ef..57e0bf2 100644
--- a/docs/substitutions.html
+++ b/docs/substitutions.html
@@ -123,7 +123,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul class="current">
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/use_examples.html b/docs/use_examples.html
index 03d9f29..8ee72b1 100644
--- a/docs/use_examples.html
+++ b/docs/use_examples.html
@@ -191,7 +191,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mapping_transforms.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/sphinx-docs/column_mappings.md b/sphinx-docs/column_mappings.md
new file mode 100755
index 0000000..96c2e93
--- /dev/null
+++ b/sphinx-docs/column_mappings.md
@@ -0,0 +1,362 @@
+# Column Mappings
+
+## Basic Usage
+
+Each column mapping reads a column from the input datasets into hlink. It has a
+`column_name` attribute which specifies the name of the input column to read in
+from both datasets. Optionally, it may have an `alias` attribute which gives a
+new name to use for the column in hlink.
+
+Column mappings support some *transforms* which make changes to the data as they
+are read in. These changes support data cleaning and harmonization. The available
+column mapping transforms are listed below in the [transforms](#transforms) section.
+
+## Advanced Usage
+
+By default, the input column must have the same name in both input datasets.
+With the `override_column_a` and `override_column_b` attributes, you can
+specify a different name for either dataset A or dataset B. When you do this,
+the `transforms` attribute applies only to the non-override dataset. You can also
+provide an `override_transforms` attribute which applies only to the override
+dataset.
+
+## Transforms
+
+Each section below describes a column mapping transform type. Each transform
+operates on a single input column and outputs a single output column. More than
+one transform may be applied to a column. Transforms apply in the order that
+they are listed in the `transforms` list, so the output of one transform may
+be the input of another. Input and output column types are listed in the format
+"Maps input column type → output column type". The letters T and U represent
+arbitrary column types.
+
+Each transform requires a `type` attribute, which must be one of the names
+listed below. Some transforms may use additional attributes. These vary by
+type, and additional information appears for each type of transform in its
+section below.
+
+Some transforms are suffixed by "a" or "b". These suffixes mean that the
+transforms apply to columns from only one of the two datasets to be linked
+(dataset A or dataset B). Most transforms operate on both dataset A and dataset
+B independently.
+
+For example, if you have two datasets taken 10 years apart, you may want to
+standardize the `age` variable so that it is comparable between the two
+datasets. To do this, you could create a new `age_at_dataset_b` variable by
+reading in the `age` variable from each dataset and then adding 10 to the
+variable from dataset A with the `add_to_a` transform.
+
+```
+[[column_mappings]]
+alias = "age_at_dataset_b"
+column_name = "age"
+transforms = [
+    {type = "add_to_a", value = 10}
+]
+```
+
+As another example, suppose that both datasets record each person's first name
+as a string. In dataset A the variable is called `namefrst` and is entirely
+lowercase, but in dataset B it is called `first_name` and is entirely uppercase.
+You could read these two columns into a `namefrst` column in hlink and apply
+a lowercase transform to only dataset B with the following configuration section.
+
+```
+[[column_mappings]]
+alias = "namefrst"
+column_name = "namefrst"
+# Read from column first_name in dataset B
+override_column_b = "first_name"
+# Apply these transforms only to dataset B
+override_transforms = [
+    {type = "lowercase_strip"}
+]
+```
+
+
+
+### add_to_a
+
+Add the given `value` to a column from dataset A.
+
+Maps numerical → numerical.
+
+```
+transforms = [{type = "add_to_a", value = 11}]
+```
+
+### concat_to_a
+
+Concatenate the string `value` to the end of a column in dataset A.
+
+Maps string → string.
+
+```
+transforms = [{type = "concat_to_a", value = " "}]
+```
+
+
+### concat_to_b
+
+Concatenate the string `value` to the end of a column in dataset B.
+
+Maps string → string.
+
+```
+transforms = [{type = "concat_to_b", value = " "}]
+```
+
+
+### lowercase_strip
+
+Used in name cleaning. Convert alphabetical characters to lower-case and strip white
+space characters from the start and end of the strings in the column.
+
+Maps string → string.
+
+```
+transforms = [{type = "lowercase_strip"}]
+```
+
+### rationalize_name_words
+
+Used in name cleaning. Replace the characters `?`, `*`, and `-` with spaces. Since
+people's names in raw census data can contain these characters, replacing these characters
+can lead to better matching.
+
+Maps string → string.
+
+```
+transforms = [{type = "rationalize_name_words"}]
+```
+
+
+### remove_qmark_hyphen
+
+Used in name cleaning. Remove the characters `?` and `-` from strings in the column.
+
+Maps string → string.
+
+```
+transforms = [{type = "remove_qmark_hyphen"}]
+```
+
+### remove_punctuation
+
+Remove most punctuation from strings in the column. This transform removes these characters:
+`? - \ / " ' : , . [ ] { }`.
+
+Maps string → string.
+
+```
+transforms = [{type = "remove_punctuation"}]
+```
+
+### replace_apostrophe
+
+Used in name cleaning. Replace each apostrophe `'` with a space.
+
+Maps string → string.
+
+```
+transforms = [{type = "replace_apostrophe"}]
+```
+
+
+### remove_alternate_names
+
+Used in name cleaning. If a string in the column contains the string ` or ` ("or" surrounded by spaces),
+then remove the ` or ` and all following characters.
+
+Maps string → string.
+
+```
+transforms = [{type = "remove_alternate_names"}]
+```
+
+### remove_suffixes
+
+Used in name cleaning. Given a list of suffixes, remove them from the strings in the column.
+
+Maps string → string.
+
+```
+transforms = [
+    {
+        type = "remove_suffixes",
+        values = ["jr", "sr", "ii", "iii"]
+    }
+]
+```
+
+### remove_stop_words
+
+Used in name cleaning. Remove the given stop words, such as the last words of street names.
+
+Maps string → string.
+
+```
+transforms = [
+    {
+        type = "remove_stop_words",
+        values = ['avenue', 'blvd', 'circle', 'court', 'road', 'street']
+    }
+]
+```
+
+### remove_prefixes
+
+Used in name cleaning. Remove prefixes like "Ms.", "Mr.", or "Mrs." from names.
+
+Maps string → string.
+
+```
+# In some census data, "ah" is a prefix from Chinese names.
+transforms = [{type = "remove_prefixes", values = ["ah"]}]
+```
+
+### condense_strip_whitespace
+
+Used in name cleaning. Take white space that may be more than one character or contain
+non-space characters and replace it with a single space.
+
+Maps string → string.
+
+```
+transforms = [{type = "condense_strip_whitespace"}]
+```
+
+### remove_one_letter_names
+
+Used in name cleaning. If a name is a single character, remove it and leave the white space behind.
+
+Maps string → string.
+
+```
+transforms = [{type = "remove_one_letter_names"}]
+```
+
+### split
+
+Split the column value on space characters.
+
+Maps string → array of string.
+
+```
+[[column_mappings]]
+alias = "namefrst_split"
+column_name = "namefrst_clean"
+transforms = [{type = "split"}]
+```
+
+### array_index
+
+If the column contains an array, select the element at the given position.
+
+This can be used as the input to another transform. In the example below, the first transform selects the second (index 1) item from the "namefrst_split" column, which contains a set of names split on white space. Then the `substring` transform with values 0, 1 is applied, which gives the first initial of the person's probable middle name.
+
+Maps array of T → T.
+
+```
+[[column_mappings]]
+alias = "namefrst_mid_init"
+column_name = "namefrst_split"
+transforms = [
+    {type = "array_index", value = 1},
+    {type = "substring", values = [0, 1]}
+]
+```
+
+### mapping
+
+Map single or multiple values to a single output value, otherwise known as a "recoding."
+
+Maps T → U.
+
+```
+[[column_mappings]]
+column_name = "birthyr"
+alias = "clean_birthyr"
+transforms = [
+    {
+        type = "mapping",
+        values = [
+            {"from"=[9999,1999], "to" = ""},
+            {"from" = -9998, "to" = 9999}
+        ]
+    }
+]
+```
+
+### substring
+
+Replace a column with a substring of the data in the column.
+
+Maps string → string.
+
+```
+transforms = [
+    {type = "substring", values = [0, 1]}
+]
+```
+
+### divide_by_int
+
+Divide data in a column by an integer value. It may leave a non-integer result.
+
+For instance, the following example takes the birthplace variable and converts it
+from the detailed version to the general version. The two least significant digits
+are detailed birthplace information; to make the more general version, we simply drop
+them by dividing by 100 and rounding down to the nearest whole number (floor function).
+
+Maps numerical → numerical.
+
+```
+[[column_mappings]]
+column_name = "bpl"
+alias = "bpl_root"
+transforms = [
+    {type = "divide_by_int", value = 100},
+    {type = "get_floor"}
+]
+```
+
+
+### when_value
+
+Apply conditional logic to replacement of values in a column. Works like the SQL `if()` or `case()` expressions in the SQL `select` clause.
+When the value of a column is `value`, replace it with `if_value`. Otherwise, replace it with `else_value`.
+
+The following example replaces all "race" IPUMS codes with 0 (white) or 1 (non-white). An IPUMS code of 100 is the "white" race category.
+
+Maps T → U.
+
+```
+column_name = "race"
+transforms = [
+    {type = "when_value", value = 100, if_value = 0, else_value = 1}
+]
+```
+
+
+### get_floor
+
+Round down to the nearest whole number.
+
+This example produces the general version of the IPUMS "relate" variable. The variable
+is coded such that detailed categories fall between the hundreds (for instance, 300 is child of
+household head, 301 is simply 'child', 302 is adopted child, and 303 is step-child).
+The general categories are usually all that's needed (1 == household head, 2 == spouse,
+3 == child, 4 == child-in-law, 5 == parent, 6 == parent-in-law, 7 == sibling, 12 == not related to head).
+
+Maps numerical → numerical.
+
+```
+[[column_mappings]]
+alias = "relate_div_100"
+column_name = "relate"
+transforms = [
+    {type = "divide_by_int", value = 100},
+    {type = "get_floor"}
+]
+```
diff --git a/sphinx-docs/config.md b/sphinx-docs/config.md
index 8747c52..b02dc2e 100644
--- a/sphinx-docs/config.md
+++ b/sphinx-docs/config.md
@@ -1,20 +1,20 @@
 # Configuration
 1. [Basic Example Config File](#basic-config-file)
 2. [Advanced Example Config File](#advanced-config-file)
-3. [Top level configs](#top-level-configs)
-4. [Data sources](#data-sources)
+3. [Top-Level Configs](#top-level-configs)
+4. [Data Sources](#data-sources)
 5. [Filter](#filter)
-6. [Column mappings](#column-mappings)
-7. [Substitution columns](#substitution-columns)
-8. [Feature selections](#feature-selections)
-9. [Potential matches universe](#potential-matches-universe)
+6. [Column Mappings](#column-mappings)
+7. [Substitution Columns](#substitution-columns)
+8. [Feature Selections](#feature-selections)
+9. [Potential Matches Universe](#potential-matches-universe)
 10. [Blocking](#blocking)
 11. [Comparisons](#comparisons)
-12. [Household comparisons](#household-comparisons)
-13. [Comparison features](#comparison-features)
-14. [Pipeline-generated features](#pipeline-generated-features)
-15. [Training and models](#training-and-models)
-16. [Household training and models](#household-training-and-models)
+12. [Household Comparisons](#household-comparisons)
+13. [Comparison Features](#comparison-features)
+14. [Pipeline-Generated Features](#pipeline-generated-features)
+15. [Training and Models](#training-and-models)
+16. [Household Training and Models](#household-training-and-models)
 
 ## Basic Config File
 
@@ -438,16 +438,34 @@ datasource = "b"
 ```
 
 
-## [Column Mappings](column_mapping_transforms)
+## [Column Mappings](column_mappings)
 
 * Header name: `column_mappings`
-* Description: Base column mappings and transformations to extract from your input datasets.
+* Description: Base column mappings and transformations to extract from your
+  input datasets. Each column mapping requires a `column_name` which tells it
+  which input column to read from. Optionally you may provide an `alias` for
+  the column and `transforms` to modify it as it is read in. There are some additional
+  attributes listed below that are meant for advanced usage. These are described
+  in more detail on the [column mappings](column_mappings) page.
 * Required: True
 * Type: List
 * Attributes:
-  * `alias` -- Type: `string`. Optional; if not specified the new column name defaults to `column_name`. New name of column.
-  * `column_name` -- Type: `string`. Name of column in input data. Used as the name of the output column if `alias` is not specified.
-  * `transforms` -- Type: `List`. Optional. A list of transforms to apply, in order, to the input data. See the [column mapping transforms](column_mapping_transforms) section for more information.
+  * `column_name` -- Type: `string`. The name of the column in the input data.
+  * `alias` -- Type: `string`. Optional. The new name of the column to use
+    in hlink. By default, this is the same as `column_name`.
+  * `transforms` -- Type: `List`. Optional. A list of transforms to apply, in
+    order, to the input data. See the [column mapping transforms](column_mappings.html#transforms)
+    section for more information.
+  * `set_value_column_a` -- Type: `Any`. Optional. Set all records for dataset
+    A to the given literal value.
+  * `set_value_column_b` -- Type: `Any`. Optional. Set all records for dataset
+    B to the given literal value.
+  * `override_column_a` -- Type: `string`. Optional. Read from this column in dataset A
+    instead of the column specified with `column_name`.
+  * `override_column_b` -- Type: `string`. Optional. Read from this column in dataset B
+    instead of the column specified with `column_name`.
+  * `override_transforms` -- Type: `List`. Optional. Transforms to apply to the override
+    column specified with `override_column_a` or `override_column_b`.
 
 ```
 [[column_mappings]]
diff --git a/sphinx-docs/index.rst b/sphinx-docs/index.rst
index 1f903fd..2c9a76e 100644
--- a/sphinx-docs/index.rst
+++ b/sphinx-docs/index.rst
@@ -23,7 +23,7 @@ Configuration API
    :maxdepth: 2
    :caption: Configuration API
 
-   Column Mapping <column_mapping_transforms.md>
+   Column Mapping <column_mappings.md>
    Comparison Types <comparison_types.md>
    Feature Selection <feature_selection_transforms.md>
    Pipeline Features <pipeline_features.md>

From 19a5f743746f5797fd7b7cd1c4973b1ed2ecb365 Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Mon, 27 Nov 2023 19:53:14 +0000
Subject: [PATCH 6/6] [#118] Fix a link and a header in the new column_mappings
 docs

---
 docs/_sources/index.rst.txt     | 2 +-
 docs/_sources/link_tasks.md.txt | 2 +-
 docs/genindex.html              | 2 +-
 docs/index.html                 | 4 ++--
 docs/link_tasks.html            | 4 ++--
 docs/search.html                | 2 +-
 docs/searchindex.js             | 2 +-
 sphinx-docs/index.rst           | 2 +-
 sphinx-docs/link_tasks.md       | 2 +-
 9 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt
index 2c9a76e..efdb528 100644
--- a/docs/_sources/index.rst.txt
+++ b/docs/_sources/index.rst.txt
@@ -23,7 +23,7 @@ Configuration API
    :maxdepth: 2
    :caption: Configuration API
 
-   Column Mapping <column_mappings.md>
+   Column Mappings <column_mappings.md>
    Comparison Types <comparison_types.md>
    Feature Selection <feature_selection_transforms.md>
    Pipeline Features <pipeline_features.md>
diff --git a/docs/_sources/link_tasks.md.txt b/docs/_sources/link_tasks.md.txt
index 34e27ba..dc201b7 100644
--- a/docs/_sources/link_tasks.md.txt
+++ b/docs/_sources/link_tasks.md.txt
@@ -17,7 +17,7 @@ datasets.
 ### Related Configuration Sections
 
 * The [`datasource_a` and `datasource_b`](config.html#data-sources) sections specify where to find the input data.
-* [```column_mappings```](column_mapping_transforms.html#column-mapping-transforms),
+* [`column_mappings`](column_mappings.html#column-mappings),
 [`feature_selections`](feature_selection_transforms.html#feature-selection-transforms),
 and [`substitution_columns`](substitutions.html#substitutions) may all be used to define transformations on the input data.
 * The [`filter`](config.html#filter) section may be used to filter some records out of the input data
diff --git a/docs/genindex.html b/docs/genindex.html
index 2705398..9db198d 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -62,7 +62,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/index.html b/docs/index.html
index 8c14a90..845bf82 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -90,7 +90,7 @@ <h1>Configuration API<a class="headerlink" href="#configuration-api" title="Link
 <div class="toctree-wrapper compound">
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a><ul>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="column_mappings.html#basic-usage">Basic Usage</a></li>
 <li class="toctree-l2"><a class="reference internal" href="column_mappings.html#advanced-usage">Advanced Usage</a></li>
 <li class="toctree-l2"><a class="reference internal" href="column_mappings.html#transforms">Transforms</a></li>
@@ -160,7 +160,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/link_tasks.html b/docs/link_tasks.html
index d5a2d72..c501d66 100644
--- a/docs/link_tasks.html
+++ b/docs/link_tasks.html
@@ -53,7 +53,7 @@ <h3>Task steps<a class="headerlink" href="#task-steps" title="Link to this headi
 <h3>Related Configuration Sections<a class="headerlink" href="#related-configuration-sections" title="Link to this heading">¶</a></h3>
 <ul class="simple">
 <li><p>The <a class="reference external" href="config.html#data-sources"><code class="docutils literal notranslate"><span class="pre">datasource_a</span></code> and <code class="docutils literal notranslate"><span class="pre">datasource_b</span></code></a> sections specify where to find the input data.</p></li>
-<li><p><a class="reference external" href="column_mapping_transforms.html#column-mapping-transforms"><code class="docutils literal notranslate"><span class="pre">column_mappings</span></code></a>,
+<li><p><a class="reference external" href="column_mappings.html#column-mappings"><code class="docutils literal notranslate"><span class="pre">column_mappings</span></code></a>,
 <a class="reference external" href="feature_selection_transforms.html#feature-selection-transforms"><code class="docutils literal notranslate"><span class="pre">feature_selections</span></code></a>,
 and <a class="reference external" href="substitutions.html#substitutions"><code class="docutils literal notranslate"><span class="pre">substitution_columns</span></code></a> may all be used to define transformations on the input data.</p></li>
 <li><p>The <a class="reference external" href="config.html#filter"><code class="docutils literal notranslate"><span class="pre">filter</span></code></a> section may be used to filter some records out of the input data
@@ -255,7 +255,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/search.html b/docs/search.html
index f9b4670..e225593 100644
--- a/docs/search.html
+++ b/docs/search.html
@@ -91,7 +91,7 @@ <h3>Navigation</h3>
 </ul>
 <p class="caption" role="heading"><span class="caption-text">Configuration API</span></p>
 <ul>
-<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mapping</a></li>
+<li class="toctree-l1"><a class="reference internal" href="column_mappings.html">Column Mappings</a></li>
 <li class="toctree-l1"><a class="reference internal" href="comparison_types.html">Comparison Types</a></li>
 <li class="toctree-l1"><a class="reference internal" href="feature_selection_transforms.html">Feature Selection</a></li>
 <li class="toctree-l1"><a class="reference internal" href="pipeline_features.html">Pipeline Features</a></li>
diff --git a/docs/searchindex.js b/docs/searchindex.js
index 7fd47bc..8c946ef 100644
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["column_mappings", "comparison_types", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "filenames": ["column_mappings.md", "comparison_types.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "titles": ["Column Mappings", "Comparison types, transform add-ons, aggregate features, and household aggregate features", "Configuration", "Feature Selection transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "terms": {"each": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "read": [0, 1, 2, 7, 10], "from": [0, 1, 2, 4, 6, 7, 8, 9, 10, 12], "input": [0, 1, 2, 3, 6, 7, 10, 11], "dataset": [0, 1, 2, 6, 7, 10, 12], "hlink": [0, 1, 2, 5, 6, 7, 12], "It": [0, 1, 2, 6, 10, 12], "ha": [0, 1, 2, 6, 10, 12], "column_nam": [0, 1, 2, 11], "attribut": [0, 1, 2, 3, 7, 8, 9, 10, 11], "which": [0, 1, 2, 3, 6, 7, 9, 10, 12], "specifi": [0, 1, 2, 6, 7, 9, 10, 11], "name": [0, 1, 2, 10, 11], "both": [0, 1, 2, 7, 12], "option": [0, 1, 2, 3, 6, 7, 8, 10, 12], "mai": [0, 2, 6, 7, 10], "have": [0, 1, 2, 5, 6, 7, 8, 10, 12], "an": [0, 1, 2, 3, 6, 8, 10], "alia": [0, 2, 7], "give": [0, 2], "new": [0, 2, 12], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "support": [0, 7, 8], "some": [0, 1, 2, 3, 6, 7, 10], "make": [0, 1, 2, 5, 12], "chang": [0, 1, 2, 5, 10, 12], "data": [0, 1, 4, 6, 7, 10], "thei": [0, 1, 2, 7, 10], "ar": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12], "These": [0, 1, 2, 3, 6, 7, 8, 9], "clean": [0, 6], "harmon": 0, "The": [0, 1, 2, 3, 5, 7, 8, 9, 10, 12], "avail": [0, 1, 2, 3, 5, 7, 8, 9, 12], "list": [0, 1, 2, 3, 4, 8, 
10, 11], "below": [0, 1, 2, 3, 8, 9, 10], "section": [0, 1, 2, 12], "By": [0, 2, 10], "default": [0, 1, 2, 7, 8, 10], "must": [0, 1, 2, 8, 9, 11], "same": [0, 1, 2, 6, 7, 10], "With": [0, 9], "override_column_a": [0, 2, 3], "override_column_b": [0, 2, 3], "you": [0, 1, 2, 5, 10, 11, 12], "can": [0, 1, 2, 5, 6, 7, 8, 10, 12], "differ": [0, 1, 2, 4, 6, 7], "either": [0, 1, 2, 6, 11], "A": [0, 1, 2, 9, 10], "b": [0, 1, 2, 10], "when": [0, 1, 2, 3, 7, 12], "do": [0, 1, 3, 10, 12], "thi": [0, 1, 2, 5, 6, 7, 9, 10, 12], "appli": [0, 2, 3, 7, 12], "onli": [0, 1, 2, 7, 12], "non": 0, "overrid": [0, 2], "also": [0, 1, 2, 5, 6, 7, 9, 10, 12], "provid": [0, 2, 3, 6, 7, 9, 10], "override_transform": [0, 2], "describ": [0, 2, 10], "type": [0, 2, 3, 4, 7, 8, 10, 11, 12], "oper": [0, 2], "singl": [0, 2, 10, 12], "output": [0, 1, 2, 3, 6, 7, 10, 12], "more": [0, 1, 2, 9, 10, 12], "than": [0, 1, 2, 8], "one": [0, 1, 2, 7], "order": [0, 2, 7], "so": [0, 1, 2, 5, 12], "anoth": [0, 1, 3, 7], "format": 0, "letter": 0, "t": [0, 1, 2, 3, 12], "u": 0, "repres": [0, 1, 2, 3, 9, 10], "arbitrari": 0, "requir": [0, 1, 2, 3, 4, 7, 9, 10, 11], "addit": [0, 1, 2, 3, 5, 6, 10], "vari": [0, 2], "inform": [0, 1, 2, 10], "appear": [0, 1], "its": [0, 1, 6, 10], "suffix": 0, "mean": [0, 2], "two": [0, 1, 2, 3, 6, 7, 9, 10, 12], "link": [0, 1, 2, 4, 6, 8], "most": [0, 1, 7, 10], "independ": [0, 2], "For": [0, 1, 2, 7, 10, 12], "exampl": [0, 1, 2], "taken": [0, 1], "10": [0, 2, 5, 12], "year": [0, 1, 2, 3, 4], "apart": 0, "want": [0, 1, 2, 10, 12], "standard": [0, 1, 11], "ag": [0, 1, 2, 3], "variabl": [0, 1, 2, 12], "i": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12], "compar": [0, 1, 2, 6, 7], "between": [0, 1, 2, 6, 7, 10, 12], "To": [0, 1, 5, 7, 10], "could": [0, 2], "creat": [0, 2, 6, 7, 9, 10, 11, 12], "age_at_dataset_b": 0, "ad": [0, 1, 2], "column_map": [0, 2, 7], "valu": [0, 1, 2, 8, 9, 10, 11], "As": 0, "suppos": 0, "record": [0, 1, 2, 6, 7], "person": [0, 1, 6], "": [0, 1, 2, 6, 7, 10, 11], "first": 
[0, 1, 2, 5, 7, 10, 11], "string": [0, 1, 2, 3, 7, 8, 10, 11], "In": [0, 1, 6, 10, 12], "call": 0, "namefrst": [0, 1, 2], "entir": 0, "lowercas": 0, "first_nam": 0, "uppercas": 0, "follow": [0, 1, 6, 10, 11, 12], "configur": [0, 1, 6, 10, 12], "add": [0, 4], "_": [0, 1, 2, 3, 4, 8, 9, 10], "given": [0, 1, 2, 3, 8, 12], "numer": [0, 1], "11": [0, 2, 5, 9], "concat": 0, "concaten": [0, 1], "end": [0, 1, 2, 3, 11], "strip": [0, 7], "convert": [0, 1, 2], "alphabet": 0, "charact": 0, "lower": [0, 1], "case": [0, 1, 2, 3, 6], "white": 0, "space": [0, 2, 3, 11], "start": [0, 11], "ration": 0, "word": [0, 4], "replac": [0, 1, 4], "sinc": [0, 2], "peopl": [0, 1, 6, 10], "raw": [0, 2, 7, 10], "censu": [0, 7, 12], "contain": [0, 1, 11], "lead": 0, "better": [0, 6], "match": [0, 1, 4, 6, 10, 11, 12], "remov": 0, "qmark": 0, "hyphen": 0, "punctuat": 0, "apostroph": 0, "altern": [0, 2], "If": [0, 1, 2, 3, 7, 8, 10, 11], "surround": 0, "all": [0, 1, 2, 3, 7, 8, 9, 10], "them": [0, 1, 2, 7], "jr": [0, 2], "sr": [0, 2], "ii": [0, 2], "iii": [0, 2], "stop": 0, "last": [0, 1, 7, 9], "street": [0, 1], "avenu": [0, 11], "blvd": 0, "circl": 0, "court": 0, "road": 0, "prefix": 0, "like": [0, 2, 7, 10], "m": [0, 1], "mr": 0, "ah": 0, "chines": 0, "condens": 0, "whitespac": [0, 7], "take": [0, 1, 2, 3, 7, 10], "leav": 0, "behind": 0, "arrai": [0, 2, 4, 8, 9], "namefrst_split": [0, 2], "namefrst_clean": [0, 2], "index": [0, 5], "select": [0, 1, 4, 6, 10, 12], "element": 0, "posit": [0, 1, 2, 4, 6], "second": [0, 1, 2, 11], "1": [0, 1, 2, 4, 7, 8, 9, 10, 12], "item": 0, "set": [0, 1, 2, 3, 6, 7, 10, 12], "Then": [0, 5], "0": [0, 1, 2, 7, 8, 9, 10, 12], "initi": [0, 1, 10], "probabl": [0, 2, 8], "middl": [0, 1], "namefrst_mid_init": [0, 1], "multipl": [0, 1, 2, 10], "otherwis": [0, 1, 9, 12], "known": 0, "recod": 0, "birthyr": [0, 2], "clean_birthyr": [0, 2, 3], "9999": [0, 2, 9], "1999": [0, 2], "9998": 0, "divid": 0, "int": [0, 1, 2, 3, 8], "integ": [0, 1, 2, 9], "result": [0, 1, 6, 9, 10, 
12], "instanc": [0, 8], "birthplac": 0, "detail": [0, 2, 10], "version": [0, 5, 12], "gener": [0, 1, 4, 6, 7, 10], "least": [0, 1], "signific": 0, "digit": 0, "we": [0, 1, 10, 12], "simpli": [0, 2], "drop": [0, 2, 10], "100": [0, 2, 12], "round": [0, 2], "lowest": 0, "whole": [0, 6], "number": [0, 1, 2, 7, 8, 10], "floor": 0, "function": [0, 1, 2, 6, 10], "bpl": [0, 1, 2], "bpl_root": 0, "condit": [0, 1, 2, 3, 4, 7], "logic": 0, "work": [0, 1, 2, 5, 7, 10, 12], "sql": [0, 1, 2, 3, 4, 7, 10], "express": [0, 1, 2], "claus": [0, 1], "if_valu": 0, "else_valu": 0, "race": [0, 1, 2, 9, 12], "ipum": [0, 6], "code": [0, 1, 2, 5], "categori": [0, 8], "get": [0, 1, 10], "down": [0, 6, 12], "nearest": 0, "produc": [0, 10], "relat": [0, 1, 2], "hundr": 0, "300": 0, "child": [0, 8], "household": [0, 4, 6, 8, 10, 12], "head": 0, "301": 0, "302": 0, "adopt": 0, "303": 0, "step": [0, 1, 2, 6], "usual": [0, 7, 12], "need": [0, 1, 2, 7, 10, 12], "2": [0, 1, 2, 3, 7, 8, 11, 12], "spous": 0, "3": [0, 1, 2, 5, 7, 8, 9, 12], "4": [0, 1, 8], "law": 0, "5": [0, 1, 2, 8, 9, 10, 12], "parent": [0, 1, 11], "6": [0, 2, 8, 9, 12], "7": [0, 1, 2, 8, 12], "sibl": 0, "12": [0, 5], "relate_div_100": [0, 1, 2], "page": [1, 2, 10], "comparison_featur": [1, 2, 7], "along": 1, "header": [1, 2, 3, 9, 11], "context": [1, 3, 9], "relatematch": [1, 2], "comparison_typ": [1, 2], "categor": [1, 2, 8, 9], "true": [1, 2, 3, 7, 9, 11, 12], "maximum": [1, 8], "jaro": [1, 9], "winkler": [1, 9], "find": [1, 7, 12], "greatest": 1, "among": 1, "cartesian": 1, "product": [1, 6, 12], "column": [1, 3, 4, 7, 9, 10, 11, 12], "namelast": [1, 2], "would": [1, 2, 12], "return": [1, 3, 8, 10], "four": 1, "namefrst_a": 1, "namefrst_b": 1, "namelast_b": 1, "namelast_a": 1, "maximum_jw": 1, "score": [1, 2, 7, 9], "namefrst_jw": [1, 2, 12], "geograph": 1, "filter": [1, 4, 7, 11], "major": [1, 10], "locat": [1, 2, 10], "befor": [1, 2, 3, 5, 7], "boundari": 1, "zero": 1, "jw_street": 1, "enum_dist": 1, "max": [1, 8, 10], 
"member": [1, 7], "neighborhood": 1, "surnam": 1, "related_individual_max_jw": 1, "namefrst_rel": 1, "assert": [1, 10], "NOT": 1, "distinct": 1, "f1": 1, "evalu": [1, 2, 6, 7, 8], "ani": [1, 2, 5, 8], "potenti": [1, 4, 7], "mismatch": 1, "queri": [1, 2], "fi": 1, "OR": 1, "mi0": 1, "mi1": 1, "THEN": 1, "els": [1, 2, 3], "first_init_col": 1, "namefrst_init": 1, "mid_init_col": 1, "namefrst_mid_init_2": 1, "f2": 1, "empti": 1, "null": [1, 2, 3], "AND": [1, 2], "individu": [1, 2, 7, 12], "mainli": 1, "caution": [1, 9], "flag": [1, 9, 10, 12], "f": [1, 10], "sp": 1, "m_caution": [1, 2, 9, 12], "mbpl": 1, "mother_birthyr": 1, "stepmom": 1, "momloc": 1, "comp_a": [1, 2], "comp_b": [1, 2], "comp_c": 1, "parent_step_chang": 1, "comp_d": 1, "check": [1, 10], "sign": 1, "boolean": [1, 2, 3, 11, 12], "form": [1, 7, 11], "cast": 1, "col": 1, "namelast_equal_as_int": 1, "namelast_clean": [1, 2, 3], "whether": [1, 2, 11], "join": [1, 11], "across": 1, "being": [1, 7], "exact": [1, 2], "namefrst_unstd": [1, 2], "present": [1, 2, 9], "nonzero": 1, "primarili": [1, 7], "indic": [1, 12], "kind": 1, "incompar": 1, "akin": 1, "miss": [1, 10], "see": [1, 2, 5, 10, 12], "univers": [1, 4, 7], "similar": 1, "fbpl_nomatch": 1, "fbpl": 1, "allow": [1, 2, 7, 12], "up": [1, 2, 10, 11], "sub": 1, "object": [1, 2, 6, 10], "document": [1, 8, 10, 12], "sp_caution": [1, 2, 12], "spouse_bpl": 1, "spouse_birthyr": 1, "durmarr": [1, 2], "new_marr": [1, 2], "street_jw": [1, 2, 12], "counti": 1, "statefip": [1, 2], "9": 1, "multipli": 1, "togeth": [1, 2], "after": [1, 2, 4, 8, 10], "float": [1, 2, 8], "comp": 1, "c": 1, "sploc": 1, "012": 1, "fals": [1, 2, 3, 4, 6, 10], "d": 1, "under": [1, 2], "specif": [1, 2, 10], "circumst": 1, "should": [1, 2, 8, 9, 10], "mid_init_match": 1, "either_1": 1, "nativ": 1, "either_0": 1, "gen": 1, "imm": [1, 2, 12], "immigr": 1, "look": [1, 10, 11], "foreign": 1, "born": 1, "sgen": [1, 2, 12], "rel": [1, 2, 12], "scala": 1, "determin": [1, 7], "greater": [1, 5], 
"jw_threshold": 1, "less": [1, 2], "age_threshold": 1, "sex": [1, 2, 11], "sampl": 1, "related_individual_row": 1, "unrel": 1, "depend": [1, 2, 5, 12], "name_col": 1, "birthyr_col": 1, "namefrst_related_row": 1, "replaced_birthyr": [1, 2, 3], "extra": 1, "children": 1, "who": 1, "base": [1, 2, 7], "expect": 1, "count": [1, 10, 12], "suspect": [1, 6], "relate_col": 1, "histid_col": 1, "id": [1, 2], "birth": 1, "year_b": 1, "wa": [1, 12], "minimum": [1, 8], "accept": [1, 2, 12], "consid": [1, 8], "histid": [1, 2, 12], "1910": [1, 2, 12], "8": [1, 2, 5, 10], "rate": 1, "calcul": [1, 12], "percentag": 1, "seen": 1, "neighbor": 1, "meet": 1, "95": 1, "nbor": [1, 2, 12], "namelast_neighbor": 1, "05": [1, 2], "namelast_popularity_sum": 1, "namelast_popular": 1, "length": [1, 2, 9], "size": 1, "ab": 1, "diff": 1, "absolut": 1, "invalid": [1, 8], "instead": [1, 2, 5, 7], "marriag": 1, "durat": 1, "99": [1, 2], "placehold": 1, "unknown": 1, "exclud": 1, "those": [1, 2], "consider": 1, "byrdiff": [1, 2, 12], "mardurmatch": [1, 2], "14": 1, "minu": [1, 2], "subtract": 1, "geo": 1, "distanc": [1, 8], "lookup": 1, "tabl": [1, 2, 4, 7, 10, 12], "core": [1, 7, 10, 12], "dist_tabl": 1, "py": [1, 2], "There": [1, 2, 3, 7], "sever": [1, 6], "wai": [1, 5, 10], "file": [1, 4, 6, 7, 10, 11, 12], "kei": [1, 7, 10], "key_count": 1, "secondari": 1, "serv": 1, "back": 1, "primari": [1, 6], "doe": [1, 7, 12], "particularli": 1, "state": [1, 6], "much": [1, 7], "fewer": [1, 8], "combin": [1, 2, 3, 7], "thu": 1, "risk": 1, "fill": 1, "aren": 1, "ex": 1, "just": [1, 2, 10, 12], "even": 1, "though": 1, "distances_fil": 1, "path": [1, 2, 10, 11, 12], "table_nam": 1, "what": [1, 2, 10, 12], "onc": [1, 10], "loc_a": 1, "where": [1, 7, 10, 12], "come": 1, "loc_b": 1, "distance_col": 1, "source_column_a": 1, "sourc": [1, 4, 7, 10, 12], "source_column_b": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b_0": 1, "loc_b_1": 1, "secondary_key_count": 1, "backup": 1, "secondary_table_nam": 1, 
"secondary_distances_fil": 1, "secondary_source_column": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_distance_col": 1, "state_dist": 1, "state_distance_lookup": 1, "county_state_dist": 1, "csv": [1, 2, 7, 10, 11, 12], "statecode1": 1, "statecode2": 1, "dist": 1, "county_dist": [1, 2, 12], "county_distance_lookup": 1, "county_1900_1910_distances_km": 1, "from_icpsrctyi": 1, "to_icpsrctyi": 1, "from_statefip": 1, "to_statefip": 1, "distance_km": 1, "state_1900_1910_distances_km": 1, "fetch": 1, "neither": 1, "nor": 1, "mpre": 1, "m_namefrst": 1, "accord": 1, "niu": 1, "other": [1, 2, 12], "mfbplmatch": 1, "multi": 1, "search": 1, "special": 1, "simplifi": 1, "particular": [1, 2], "constraint": 1, "num_col": 1, "whose": 1, "templat": 1, "n": [1, 8, 9], "per": [1, 2, 8, 9, 10], "current": [1, 2, 10], "respect": [1, 7], "jw_col_templ": 1, "jw": 1, "pair": [1, 12], "equal_and_not_null_templ": 1, "final": [1, 2, 12], "comput": [1, 3, 7], "_namefrst": 1, "_bpl": 1, "_sex": 1, "25": 1, "nvl": 1, "sm_namefrst": 1, "sn_namefrst": 1, "sm_bpl": 1, "sn_bpl": 1, "sm_sex": 1, "sn_sex": 1, "pass": [1, 2, 7, 8], "flexibl": 1, "user": [1, 10], "write": [1, 10, 12], "own": [1, 2], "favor": 1, "reason": 1, "good": 1, "fallback": 1, "defin": [1, 7, 8, 9, 10], "spark": [1, 2, 5, 8, 9, 10, 12], "builtin": 1, "argument": [1, 10, 12], "namelast_jw_max": 1, "namelast1": 1, "namelast2": 1, "namelast3": 1, "abov": [1, 5], "extend": 1, "beyond": 1, "top": [1, 4], "level": [1, 4, 10], "everi": 1, "jw_f": [1, 2, 12], "father_namefrst": 1, "rais": [1, 3], "exponenti": 1, "squar": 1, "county_distance_squar": [1, 2, 12], "county_a": 1, "county_b": 1, "upper": 1, "gt": 1, "btwn": 1, "addl": 1, "var": [1, 2], "program": [1, 2, 7, 12], "report": [1, 4, 6, 10], "addl_var": 1, "check_val_expr": 1, "else_v": 1, "volumn": 1, "datasourc": [1, 2, 10], "yrimmig": 1, "immyear_diff": [1, 2, 9, 12], "includ": [1, 2, 7, 9, 10], "train": [1, 4, 6, 8, 10], "independent_var": [1, 2, 12], "config": [1, 
4, 7, 10, 12], "id_column": [1, 2], "_a": 1, "mult": 1, "exist": [1, 2, 10], "within": [1, 2, 6, 10, 11], "hh_train": [1, 2, 7, 10, 12], "hh": 1, "highest": [1, 2], "against": [1, 11], "ten": [1, 2], "tell": 2, "how": [2, 7], "descript": [2, 8, 10], "refer": 2, "here": [2, 7, 10, 12], "tutori": [2, 10], "script": [2, 6, 10], "discuss": 2, "readm": 2, "note": 2, "written": [2, 6], "toml": [2, 6, 10], "abl": 2, "json": [2, 10], "datasource_a": [2, 7], "datasource_b": [2, 7], "transform": [2, 4, 6, 7], "lowercase_strip": 2, "add_to_a": 2, "age_2": 2, "derived_from": 2, "expand_length": 2, "explod": [2, 7], "jaro_winkl": 2, "namelast_jw": [2, 12], "threshold": [2, 8, 12], "feature_nam": 2, "79": 2, "84": 2, "complex": 2, "machin": [2, 6, 7, 10, 12], "learn": [2, 6, 7, 10, 12], "probabilist": [2, 6], "drop_data_from_scored_match": 2, "us1900": 2, "us1900m_usa": 2, "p": 2, "parquet": [2, 7], "us1910": 2, "us1910m_usa": 2, "training_data_subset": 2, "serialp": 2, "rationalize_name_word": 2, "remove_qmark_hyphen": 2, "replace_apostroph": 2, "remove_suffix": 2, "remove_alternate_nam": 2, "condense_strip_whitespac": 2, "split": [2, 3, 7, 8, 9, 12], "namefrst_std": [2, 11], "array_index": 2, "bpl_orig": 2, "divide_by_int": 2, "get_floor": 2, "statefip_h": 2, "output_typ": 2, "substitution_column": [2, 7, 11], "join_column": [2, 11], "join_valu": [2, 11], "substitution_fil": [2, 11], "name_std": [2, 11], "male": [2, 11], "femal": [2, 11], "feature_select": [2, 3, 7], "input_column": [2, 3, 9], "output_column": [2, 3, 9], "sql_condit": 2, "namelast_bigram": 2, "bigram": [2, 4], "bpl_clean": 2, "bpl_str": 2, "washington": 2, "bpl2_str": 2, "53": 2, "region": [2, 12], "attach_vari": 2, "region_dict": 2, "col_to_join_on": 2, "col_to_add": 2, "null_fil": 2, "col_typ": 2, "potential_matches_univers": [2, 7], "birthyr_3": 2, "namefrst_std_jw": [2, 12], "75": [2, 8, 12], "comparis": 2, "post": [2, 7], "hh_comparison": [2, 7], "threshold_expr": 2, "fetch_a": 2, "sex_equ": 2, "equal": 
[2, 11], "relate_a": [2, 9], "pipeline_featur": [2, 7, 9], "sex_region_interact": 2, "transformer_typ": [2, 9], "interact": [2, 4, 7, 12], "relatetyp": [2, 9], "bucket": [2, 7], "hit": [2, 10, 12], "scale_data": [2, 12], "training_data": [2, 10], "dependent_var": [2, 12], "score_with_model": [2, 12], "use_training_data_featur": [2, 7, 12], "split_by_id_a": [2, 12], "decis": [2, 4, 8, 12], "drop_duplicate_with_threshold_ratio": [2, 12], "n_training_iter": [2, 7, 12], "output_suspicious_td": [2, 12], "param_grid": [2, 12], "model_paramet": [2, 7, 8, 12], "random_forest": [2, 12], "maxdepth": [2, 8, 12], "numtre": [2, 8, 12], "005": 2, "threshold_ratio": [2, 8, 12], "logistic_regress": [2, 12], "50": [2, 12], "65": 2, "80": 2, "chosen_model": [2, 8, 12], "prediction_col": 2, "predict": [2, 12], "hh_col": 2, "hh_training_data_1900_1910": 2, "probit": [2, 4], "go": [2, 10], "your": [2, 5, 7, 10, 12], "uniqu": 2, "identifi": [2, 6, 12], "full": [2, 7, 12], "short": 2, "alphanumer": 2, "convert_ints_to_long": 2, "automat": [2, 5, 7], "long": [2, 11], "especi": 2, "assum": 2, "schema": 2, "sometim": 2, "term": 2, "bigint": 2, "thing": 2, "my_fil": 2, "subset": [2, 11], "limit": 2, "extract": 2, "modifi": 2, "meant": 2, "usag": [2, 4, 10], "set_value_column_a": [2, 3], "liter": 2, "set_value_column_b": [2, 3], "iv": 2, "v": 2, "vi": 2, "vii": 2, "viii": 2, "namelast_clean_bigram": [2, 3], "fed": [2, 7], "prep": 2, "df": [2, 10], "men": 2, "newli": 2, "attempt": 2, "duplic": [2, 8], "row": 2, "conjuct": 2, "Will": 2, "conjunct": 2, "rang": [2, 9], "original_valu": 2, "plu": 2, "1870": 2, "expand": 2, "1867": 2, "1868": 2, "1869": 2, "1871": 2, "1872": 2, "1873": 2, "kept": 2, "keep": 2, "appropri": 2, "treat": [2, 9], "import": [2, 7, 10, 12], "dure": [2, 7], "hot": 2, "encod": [2, 3], "vector": [2, 9], "stage": 2, "well": 2, "upper_threshold": 2, "cannot": 2, "robust": 2, "ml": [2, 4, 8, 9], "typic": [2, 7], "leverag": 2, "api": [2, 6, 9], "piplin": 2, "regionf": 2, 
"sex_regionf_interact": 2, "immyear_caut": [2, 9], "myriad": 2, "explor": [2, 4, 6, 10], "part": [2, 7], "task": [2, 4, 6, 8, 12], "drop_duplicate_a": 2, "time": [2, 7, 10], "out": [2, 7, 12], "best": [2, 7], "smallest": 2, "possibl": 2, "ratio": [2, 8], "beta": [2, 8], "test": [2, 7, 12], "model_explor": [2, 10, 12], "hyper": [2, 6, 12], "paramet": [2, 6, 7, 8, 10, 12], "eval": 2, "skip": [2, 7], "apply_model": 2, "run_all_step": [2, 10, 12], "command": [2, 6, 10, 12], "try": 2, "creation": 2, "iter": 2, "scale": 2, "error": [2, 9], "1900": [2, 12], "about": [2, 10, 12], "1930": [2, 12], "1940": [2, 12], "fail": 2, "were": 2, "sure": [2, 5, 10], "scratch": 2, "although": 2, "know": 2, "haven": 2, "save": [2, 7, 12], "small": 2, "amount": 2, "process": [2, 6, 10], "repeatedli": 2, "help": [2, 7, 10], "neg": [2, 4, 6], "area": 2, "coverag": 2, "increas": [2, 9], "represent": [2, 7], "ensur": 2, "group": [2, 7], "a304bt": 2, "three": [2, 7], "b200": 2, "c201": 2, "d425": 2, "perform": [2, 6, 7, 11], "feature_import": [2, 7, 12], "coeffici": [2, 7], "enabl": [2, 7, 10], "srace": [2, 9, 12], "race_interacted_srac": [2, 9, 12], "hits2": [2, 12], "exact_mult": [2, 12], "ncount": [2, 3, 12], "ncount2": [2, 3, 12], "f_interacted_jw_f": [2, 12], "f_caution": [2, 12], "f_pre": [2, 12], "fbplmatch": [2, 12], "m_interacted_jw_m": [2, 9, 12], "jw_m": [2, 9, 12], "m_pre": [2, 9, 12], "mbplmatch": [2, 12], "sp_interacted_jw_sp": [2, 12], "jw_sp": [2, 12], "sp_pre": [2, 12], "mi": [2, 12], "fsoundex": [2, 12], "lsoundex": [2, 12], "oth": [2, 12], "imm_interacted_immyear_caut": [2, 12], "1900_1910_training_data_20191023": 2, "jw_max_a": 2, "jw_max_b": 2, "f1_match": 2, "f2_match": 2, "byrdifcat": 2, "racematch": 2, "bplmatch": 2, "imm_interacted_bplmatch": 2, "sexmatch": 2, "relatetype_interacted_relatematch": 2, "checkpoint": 3, "no_first_pad": 3, "don": 3, "prepend": 3, "namefrst_unstd_bigram": 3, "namelast_frst_bigram": 3, "namelast_clean_soundex": 3, "input_col": 3, 
"output_col": 3, "expon": 3, "introduct": 4, "overview": 4, "instal": 4, "pypi": 4, "preprocess": [4, 6, 10, 12], "model": [4, 6, 10], "run": [4, 5, 6, 7, 12], "librari": [4, 6], "mode": [4, 5, 12], "advanc": 4, "workflow": 4, "export": [4, 7, 10], "featur": [4, 6, 7, 8, 10], "reus": 4, "basic": 4, "map": [4, 7, 9], "substitut": [4, 7], "block": [4, 7], "comparison": [4, 7], "pipelin": 4, "ons": 4, "aggreg": 4, "union": 4, "soundex": 4, "power": 4, "regex": 4, "random": [4, 8], "forest": [4, 8], "logist": [4, 8], "regress": [4, 8], "tree": [4, 8], "gradient": [4, 8], "boost": [4, 8], "system": 5, "python": [5, 6, 10], "java": 5, "integr": 5, "apach": 5, "via": [5, 6], "pyspark": [5, 8, 9, 10], "packag": 5, "org": 5, "latest": 5, "pip": 5, "easiest": [5, 10], "through": [5, 7, 9, 10], "instruct": [5, 10], "But": 5, "clone": 5, "github": 5, "repositori": 5, "root": 5, "project": 5, "directori": [5, 10, 12], "develop": [5, 6], "e": 5, "dev": 5, "edit": 5, "made": 5, "built": 5, "tool": [5, 6], "line": [6, 10], "share": 6, "characterist": [6, 7], "correspond": [6, 7], "real": 6, "world": 6, "determinist": [6, 7], "rule": [6, 7], "algorithm": [6, 7], "At": [6, 7], "been": 6, "unit": 6, "census": 6, "hierarch": [6, 10], "structur": 6, "nest": 6, "howev": [6, 12], "tailor": 6, "ignor": 6, "common": [6, 7, 12], "highli": [6, 7], "languag": 6, "further": [6, 12], "broken": 6, "smaller": 6, "sequenc": 6, "linkrun": [6, 10], "prepar": [6, 7, 10], "research": 6, "experi": 6, "understand": 6, "tune": [6, 12], "relationship": 6, "varieti": 7, "normal": 7, "abbrevi": [7, 11], "regist": [7, 10], "datafram": [7, 10, 12], "request": 7, "classif": [7, 8], "metadata": 7, "introspect": 7, "ingest": 7, "inspect": 7, "mani": [7, 10], "aspect": [7, 10], "extens": 7, "longest": 7, "definit": 7, "reduc": 7, "drastic": 7, "improv": 7, "runtim": 7, "separ": 7, "total": 7, "potential_match": [7, 10], "satisfi": 7, "elig": 7, "reshap": 7, "thought": 7, "ahead": 7, "chosen": 7, "experiment": [7, 
10], "focus": 7, "demograph": 7, "moment": 7, "veri": [7, 12], "anyon": 7, "percent": 7, "remain": 7, "popul": 7, "pull": 7, "fix": 7, "width": 7, "crosswalk": 7, "construct": 7, "alpha": 8, "hyperparamet": [8, 12], "de": 8, "param": [8, 12], "label": 8, "doc": [8, 9], "commonli": 8, "explan": 8, "randomforestclassifi": 8, "depth": 8, "20": 8, "featuresubsetstrategi": 8, "node": 8, "auto": 8, "onethird": 8, "sqrt": 8, "log2": 8, "15": 8, "generalizedlinearregress": 8, "famili": 8, "binomi": 8, "85": [8, 10], "logisticregress": 8, "decisiontreeclassifi": 8, "mininstancespernod": 8, "caus": 8, "left": 8, "right": [8, 10], "discard": 8, "maxbin": 8, "bin": 8, "discret": 8, "continu": [8, 9, 12], "gbtclassifi": 8, "mother": 9, "point": [9, 12], "x": [9, 10], "y": 9, "hold": 9, "except": 9, "strictli": 9, "inf": 9, "explicitli": 9, "cover": 9, "doubl": 9, "outsid": 9, "job": 10, "high": 10, "class": 10, "handl": 10, "main": 10, "complet": 10, "access": [10, 12], "link_run": 10, "factori": 10, "sparkfactori": 10, "load_config": 10, "load_conf_fil": 10, "sparksess": 10, "now": 10, "let": 10, "load": 10, "our": 10, "my_conf": 10, "lr": 10, "prep_step": 10, "get_step": 10, "enumer": 10, "print": 10, "input_table_nam": 10, "output_table_nam": 10, "run_step": 10, "get_tabl": 10, "matches_df": 10, "hh_model_explor": 10, "method": [10, 12], "interfac": 10, "easili": 10, "conveni": 10, "adjust": 10, "set_loc": 10, "set_num_cor": 10, "set_executor_memori": 10, "5g": 10, "ll": 10, "dictionari": 10, "often": 10, "modul": 10, "pleas": 10, "reproduc": 10, "consol": 10, "cpu": 10, "h": 10, "executor_memori": [10, 12], "execute_task": 10, "execute_command": 10, "conf": [10, 12], "show": 10, "messag": 10, "exit": 10, "memori": 10, "executor": 10, "begin": 10, "execut": 10, "seri": 10, "excute_command": 10, "filepath": 10, "sai": 10, "fullcount_1870_1880": 10, "pattern": 10, "full_count_1870_1880": 10, "prompt": 10, "enter": 10, "text": 10, "unstabl": 10, "topic": 10, "analyz": [10, 12], 
"set_preexisting_t": 10, "x_persist": 10, "borrow_t": 10, "get_task": 10, "set_print_sql": 10, "x_sql": 10, "x_sqlf": 10, "ipython": 10, "showf": 10, "x_summari": 10, "desc": 10, "x_crosswalk": 10, "x_tab": 10, "q": [10, 12], "x_hh_tfam": 10, "x_tfam": 10, "drop_al": 10, "reload": 10, "x_hh_tfam_2a": 10, "x_tfam_raw": 10, "drop_all_prc": 10, "x_hh_tfam_2b": 10, "x_union": 10, "drop_all_temp": 10, "x_load": 10, "get_set": 10, "set_link_task": 10, "x_parquet_from_csv": 10, "organ": 10, "hierarchi": 10, "five": 10, "hh_match": 10, "someth": 10, "choic": 10, "preexist": 10, "prepped_df_a": 10, "prepped_df_b": 10, "raw_df_b": 10, "raw_df_a": 10, "training_featur": [10, 12], "scored_potential_match": 10, "potential_matches_prep": 10, "exploded_df_b": 10, "exploded_df_a": 10, "predicted_match": 10, "hh_training_featur": [10, 12], "hh_training_data": 10, "hh_predicted_match": 10, "hh_scored_potential_match": 10, "hh_potential_match": 10, "hh_blocked_match": 10, "hh_potential_matchs_prep": 10, "model_eval_training_vector": 10, "model_eval_training_data": 10, "model_eval_repeat_fp": 10, "model_eval_training_featur": 10, "model_eval_training_result": 10, "model_eval_repeat_fn": 10, "hh_model_eval_training_vector": 10, "hh_model_eval_repeat_fp": 10, "hh_model_eval_repeat_fn": 10, "hh_model_eval_training_result": 10, "hh_model_eval_training_featur": 10, "hh_model_eval_training_data": 10, "persist": 10, "hidden": 10, "intermedi": 10, "yet": 10, "databas": 10, "tablenam": 10, "istemporari": 10, "task_nam": 10, "num": 10, "finish": 10, "put": [10, 12], "launch": [10, 12], "my": [10, 12], "subhead": 11, "suppli": 11, "regex_word_replac": 11, "variant": 11, "av": 11, "7th": 11, "swap": 11, "still": 11, "anywher": 11, "proceed": 11, "street_unstd": 11, "dir": 11, "substitutions_street_abbrev": 11, "span": 12, "1920": 12, "deriv": 12, "necessari": 12, "scenario": 12, "copi": 12, "use_potential_matches_featur": 12, "full_count_1900_1910": 12, "50g": 12, "ask": 12, "arg": 12, "partit": 
12, "training_data_1900_1910_hlink_featur": 12, "might": 12, "shut": 12, "framework": 12, "etc": 12, "relev": 12, "matrix": 12, "implement": 12, "regular": 12, "training_data_1900_1910": 12, "weren": 12, "ident": 12, "manual": 12, "updat": 12, "isn": 12, "analysi": 12, "training_result": 12, "hh_training_result": 12, "1900_1910_training_result": 12, "repeat_fp": 12, "repeat_fn": 12, "hh_repeat_fp": 12, "hh_repeat_fn": 12, "1900_1910_potential_fp": 12, "1900_1910_potential_fn": 12, "prefer": 12, "ve": 12}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"column": [0, 2], "map": [0, 2], "basic": [0, 2], "usag": 0, "advanc": [0, 2, 12], "transform": [0, 1, 3, 9], "add_to_a": 0, "concat_to_a": 0, "concat_to_b": 0, "lowercase_strip": 0, "rationalize_name_word": 0, "remove_qmark_hyphen": 0, "remove_punctu": 0, "replace_apostroph": 0, "remove_alternate_nam": 0, "remove_suffix": 0, "remove_stop_word": 0, "remove_prefix": 0, "condense_strip_whitespac": 0, "remove_one_letter_nam": 0, "split": 0, "array_index": 0, "substr": 0, "divide_by_int": 0, "when_valu": 0, "get_floor": 0, "comparison": [1, 2], "type": [1, 9], "add": 1, "ons": 1, "aggreg": 1, "featur": [1, 2, 3, 9, 12], "household": [1, 2, 7], "maximum_jaro_winkl": 1, "jaro_winkl": 1, "jaro_winkler_street": 1, "max_jaro_winkl": 1, "equal": 1, "f1_match": 1, "f2_match": 1, "not_equ": 1, "equals_as_int": 1, "all_equ": 1, "not_zero_and_not_equ": 1, "time": 1, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "any_equ": 1, "either_are_1": 1, "either_are_0": 1, "second_gen_imm": 1, "rel_jaro_winkl": 1, "extra_children": 1, "jaro_winkler_r": 1, "sum": 1, "length_b": 1, "abs_diff": 1, "b_minus_a": 1, "geo_dist": 1, "fetch_a": 1, "fetch_b": 1, "present_both_year": 1, "neither_are_nul": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_and_equal_categorical_in_univers": 1, "multi_jaro_winkler_search": 1, "sql_condit": [1, 3], "alia": 1, "power": [1, 
3], "threshold": 1, "lower_threshold": 1, "upper_threshold": 1, "gt_threshold": 1, "btwn_threshold": 1, "look_at_addl_var": 1, "hit": 1, "hits2": 1, "exact_mult": 1, "jw_max_a": 1, "jw_max_b": 1, "configur": [2, 4, 7], "config": 2, "file": 2, "top": 2, "level": 2, "data": [2, 11, 12], "sourc": [2, 5], "filter": 2, "substitut": [2, 11], "select": [2, 3], "potenti": [2, 12], "match": [2, 7], "univers": 2, "block": 2, "pipelin": [2, 9], "gener": [2, 9, 12], "train": [2, 7, 12], "model": [2, 7, 8, 12], "bigram": 3, "arrai": 3, "union": 3, "soundex": 3, "welcom": 4, "hlink": [4, 10], "": 4, "document": 4, "api": 4, "instal": 5, "requir": 5, "from": 5, "pypi": 5, "introduct": 6, "overview": [6, 7], "link": [7, 10, 12], "task": [7, 10], "preprocess": 7, "step": [7, 10], "relat": 7, "section": 7, "explor": [7, 12], "report": 7, "random_forest": 8, "probit": 8, "logistic_regress": 8, "decision_tre": 8, "gradient_boosted_tre": 8, "interact": [9, 10], "bucket": 9, "run": 10, "us": 10, "librari": 10, "mode": 10, "start": 10, "program": 10, "exampl": [10, 12], "workflow": [10, 12], "1": 11, "tabl": 11, "regex": 11, "word": 11, "replac": 11, "export": 12, "after": 12, "reus": 12, "differ": 12, "year": 12, "ml": 12, "list": 12, "fals": 12, "posit": 12, "neg": 12, "fp": 12, "fn": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 60}, "alltitles": {"Column Mappings": [[0, "column-mappings"], [2, "column-mappings"]], "Basic Usage": [[0, "basic-usage"]], "Advanced Usage": [[0, "advanced-usage"]], "Transforms": [[0, "transforms"]], "add_to_a": [[0, "add-to-a"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "lowercase_strip": [[0, "lowercase-strip"]], "rationalize_name_words": [[0, "rationalize-name-words"]], 
"remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_punctuation": [[0, "remove-punctuation"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_suffixes": [[0, "remove-suffixes"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_prefixes": [[0, "remove-prefixes"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "split": [[0, "split"]], "array_index": [[0, "array-index"]], "mapping": [[0, "mapping"]], "substring": [[0, "substring"]], "divide_by_int": [[0, "divide-by-int"]], "when_value": [[0, "when-value"]], "get_floor": [[0, "get-floor"]], "Comparison types, transform add-ons, aggregate features, and household aggregate features": [[1, "comparison-types-transform-add-ons-aggregate-features-and-household-aggregate-features"]], "Comparison types": [[1, "comparison-types"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "equals": [[1, "equals"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "not_equals": [[1, "not-equals"]], "equals_as_int": [[1, "equals-as-int"]], "all_equals": [[1, "all-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "and": [[1, "and"]], "times": [[1, "times"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "any_equals": [[1, "any-equals"]], "either_are_1": [[1, "either-are-1"]], "either_are_0": [[1, "either-are-0"]], "second_gen_imm": [[1, "second-gen-imm"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "extra_children": [[1, "extra-children"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "sum": [[1, "sum"]], "length_b": [[1, 
"length-b"]], "abs_diff": [[1, "abs-diff"]], "b_minus_a": [[1, "b-minus-a"]], "geo_distance": [[1, "geo-distance"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "present_both_years": [[1, "present-both-years"]], "neither_are_null": [[1, "neither-are-null"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "sql_condition": [[1, "sql-condition"], [3, "sql-condition"]], "Feature add-ons": [[1, "feature-add-ons"]], "alias": [[1, "alias"]], "power": [[1, "power"], [3, "power"]], "threshold": [[1, "threshold"]], "lower_threshold": [[1, "lower-threshold"]], "upper_threshold": [[1, "upper-threshold"]], "gt_threshold": [[1, "gt-threshold"]], "btwn_threshold": [[1, "btwn-threshold"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "Aggregate Features": [[1, "aggregate-features"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "exact_mult": [[1, "exact-mult"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "Configuration": [[2, "configuration"]], "Basic Config File": [[2, "basic-config-file"]], "Advanced Config File": [[2, "advanced-config-file"]], "Top level configs": [[2, "top-level-configs"]], "Data sources": [[2, "data-sources"]], "Filter": [[2, "filter"]], "Substitution Columns": [[2, "substitution-columns"]], "Feature Selections": [[2, "feature-selections"]], "Potential Matches Universe": [[2, "potential-matches-universe"]], "Blocking": [[2, "blocking"]], "Comparisons": [[2, "comparisons"]], "Household Comparisons": [[2, "household-comparisons"]], "Comparison Features": [[2, "comparison-features"]], "Pipeline-generated Features": [[2, "pipeline-generated-features"]], "Training and models": [[2, "training-and-models"]], 
"Household training and models": [[2, "household-training-and-models"]], "Feature Selection transforms": [[3, "feature-selection-transforms"]], "bigrams": [[3, "bigrams"]], "array": [[3, "array"]], "union": [[3, "union"]], "soundex": [[3, "soundex"]], "Welcome to hlink\u2019s documentation!": [[4, "welcome-to-hlink-s-documentation"]], "Configuration API": [[4, "configuration-api"], [4, null]], "Installation": [[5, "installation"]], "Requirements": [[5, "requirements"]], "Installing from PyPI": [[5, "installing-from-pypi"]], "Installing from source": [[5, "installing-from-source"]], "Introduction": [[6, "introduction"]], "Overview": [[6, "overview"], [7, "overview"], [7, "id1"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"]], "Link Tasks": [[7, "link-tasks"]], "Preprocessing": [[7, "preprocessing"]], "Task steps": [[7, "task-steps"], [7, "id2"], [7, "id5"], [7, "id8"], [7, "id11"], [7, "id14"]], "Related Configuration Sections": [[7, "related-configuration-sections"], [7, "id3"], [7, "id6"], [7, "id9"], [7, "id12"], [7, "id15"]], "Training and Household Training": [[7, "training-and-household-training"]], "Matching": [[7, "matching"]], "Household Matching": [[7, "household-matching"]], "Model Exploration and Household Model Exploration": [[7, "model-exploration-and-household-model-exploration"]], "Reporting": [[7, "reporting"]], "Models": [[8, "models"]], "random_forest": [[8, "random-forest"]], "probit": [[8, "probit"]], "logistic_regression": [[8, "logistic-regression"]], "decision_tree": [[8, "decision-tree"]], "gradient_boosted_trees": [[8, "gradient-boosted-trees"]], "Pipeline generated features": [[9, "pipeline-generated-features"]], "Transformer types": [[9, "transformer-types"]], "interaction": [[9, "interaction"]], "bucketizer": [[9, "bucketizer"]], "Running hlink": [[10, "running-hlink"]], "Using hlink as a Library": [[10, "using-hlink-as-a-library"]], "Interactive Mode": [[10, "interactive-mode"]], "Starting the program": [[10, 
"starting-the-program"]], "Running Linking Tasks and Steps": [[10, "running-linking-tasks-and-steps"]], "Example interactive mode workflow": [[10, "example-interactive-mode-workflow"]], "Substitutions": [[11, "substitutions"]], "1:1 substitution by data table": [[11, "substitution-by-data-table"]], "Substitution by regex word replace": [[11, "substitution-by-regex-word-replace"]], "Advanced Workflow Examples": [[12, "advanced-workflow-examples"]], "Export training data after generating features to reuse in different linking years": [[12, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Example training data export with generated ML features": [[12, "example-training-data-export-with-generated-ml-features"]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[12, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Example model exploration and FP/FN export workflow": [[12, "example-model-exploration-and-fp-fn-export-workflow"]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["column_mappings", "comparison_types", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "filenames": ["column_mappings.md", "comparison_types.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "titles": ["Column Mappings", "Comparison types, transform add-ons, aggregate features, and household aggregate features", "Configuration", "Feature Selection transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "terms": {"each": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], "read": [0, 1, 2, 7, 10], "from": [0, 1, 2, 4, 6, 7, 8, 9, 10, 12], "input": [0, 1, 2, 3, 6, 7, 10, 11], "dataset": [0, 1, 2, 6, 7, 10, 12], "hlink": [0, 1, 2, 5, 6, 7, 12], "It": [0, 1, 2, 6, 10, 12], "ha": [0, 1, 2, 6, 10, 12], "column_nam": [0, 1, 2, 11], "attribut": [0, 1, 2, 3, 7, 8, 9, 10, 11], "which": [0, 1, 2, 3, 6, 7, 9, 10, 12], "specifi": [0, 1, 2, 6, 7, 9, 10, 11], "name": [0, 1, 2, 10, 11], "both": [0, 1, 2, 7, 12], "option": [0, 1, 2, 3, 6, 7, 8, 10, 12], "mai": [0, 2, 6, 7, 10], "have": [0, 1, 2, 5, 6, 7, 8, 10, 12], "an": [0, 1, 2, 3, 6, 8, 10], "alia": [0, 2, 7], "give": [0, 2], "new": [0, 2, 12], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12], "support": [0, 7, 8], "some": [0, 1, 2, 3, 6, 7, 10], "make": [0, 1, 2, 5, 12], "chang": [0, 1, 2, 5, 10, 12], "data": [0, 1, 4, 6, 7, 10], "thei": [0, 1, 2, 7, 10], "ar": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12], "These": [0, 1, 2, 3, 6, 7, 8, 9], "clean": [0, 6], "harmon": 0, "The": [0, 1, 2, 3, 5, 7, 8, 9, 10, 12], "avail": [0, 1, 2, 3, 5, 7, 8, 9, 12], "list": [0, 1, 2, 3, 4, 8, 
10, 11], "below": [0, 1, 2, 3, 8, 9, 10], "section": [0, 1, 2, 12], "By": [0, 2, 10], "default": [0, 1, 2, 7, 8, 10], "must": [0, 1, 2, 8, 9, 11], "same": [0, 1, 2, 6, 7, 10], "With": [0, 9], "override_column_a": [0, 2, 3], "override_column_b": [0, 2, 3], "you": [0, 1, 2, 5, 10, 11, 12], "can": [0, 1, 2, 5, 6, 7, 8, 10, 12], "differ": [0, 1, 2, 4, 6, 7], "either": [0, 1, 2, 6, 11], "A": [0, 1, 2, 9, 10], "b": [0, 1, 2, 10], "when": [0, 1, 2, 3, 7, 12], "do": [0, 1, 3, 10, 12], "thi": [0, 1, 2, 5, 6, 7, 9, 10, 12], "appli": [0, 2, 3, 7, 12], "onli": [0, 1, 2, 7, 12], "non": 0, "overrid": [0, 2], "also": [0, 1, 2, 5, 6, 7, 9, 10, 12], "provid": [0, 2, 3, 6, 7, 9, 10], "override_transform": [0, 2], "describ": [0, 2, 10], "type": [0, 2, 3, 4, 7, 8, 10, 11, 12], "oper": [0, 2], "singl": [0, 2, 10, 12], "output": [0, 1, 2, 3, 6, 7, 10, 12], "more": [0, 1, 2, 9, 10, 12], "than": [0, 1, 2, 8], "one": [0, 1, 2, 7], "order": [0, 2, 7], "so": [0, 1, 2, 5, 12], "anoth": [0, 1, 3, 7], "format": 0, "letter": 0, "t": [0, 1, 2, 3, 12], "u": 0, "repres": [0, 1, 2, 3, 9, 10], "arbitrari": 0, "requir": [0, 1, 2, 3, 4, 7, 9, 10, 11], "addit": [0, 1, 2, 3, 5, 6, 10], "vari": [0, 2], "inform": [0, 1, 2, 10], "appear": [0, 1], "its": [0, 1, 6, 10], "suffix": 0, "mean": [0, 2], "two": [0, 1, 2, 3, 6, 7, 9, 10, 12], "link": [0, 1, 2, 4, 6, 8], "most": [0, 1, 7, 10], "independ": [0, 2], "For": [0, 1, 2, 7, 10, 12], "exampl": [0, 1, 2], "taken": [0, 1], "10": [0, 2, 5, 12], "year": [0, 1, 2, 3, 4], "apart": 0, "want": [0, 1, 2, 10, 12], "standard": [0, 1, 11], "ag": [0, 1, 2, 3], "variabl": [0, 1, 2, 12], "i": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 12], "compar": [0, 1, 2, 6, 7], "between": [0, 1, 2, 6, 7, 10, 12], "To": [0, 1, 5, 7, 10], "could": [0, 2], "creat": [0, 2, 6, 7, 9, 10, 11, 12], "age_at_dataset_b": 0, "ad": [0, 1, 2], "column_map": [0, 2, 7], "valu": [0, 1, 2, 8, 9, 10, 11], "As": 0, "suppos": 0, "record": [0, 1, 2, 6, 7], "person": [0, 1, 6], "": [0, 1, 2, 6, 7, 10, 11], "first": 
[0, 1, 2, 5, 7, 10, 11], "string": [0, 1, 2, 3, 7, 8, 10, 11], "In": [0, 1, 6, 10, 12], "call": 0, "namefrst": [0, 1, 2], "entir": 0, "lowercas": 0, "first_nam": 0, "uppercas": 0, "follow": [0, 1, 6, 10, 11, 12], "configur": [0, 1, 6, 10, 12], "add": [0, 4], "_": [0, 1, 2, 3, 4, 8, 9, 10], "given": [0, 1, 2, 3, 8, 12], "numer": [0, 1], "11": [0, 2, 5, 9], "concat": 0, "concaten": [0, 1], "end": [0, 1, 2, 3, 11], "strip": [0, 7], "convert": [0, 1, 2], "alphabet": 0, "charact": 0, "lower": [0, 1], "case": [0, 1, 2, 3, 6], "white": 0, "space": [0, 2, 3, 11], "start": [0, 11], "ration": 0, "word": [0, 4], "replac": [0, 1, 4], "sinc": [0, 2], "peopl": [0, 1, 6, 10], "raw": [0, 2, 7, 10], "censu": [0, 7, 12], "contain": [0, 1, 11], "lead": 0, "better": [0, 6], "match": [0, 1, 4, 6, 10, 11, 12], "remov": 0, "qmark": 0, "hyphen": 0, "punctuat": 0, "apostroph": 0, "altern": [0, 2], "If": [0, 1, 2, 3, 7, 8, 10, 11], "surround": 0, "all": [0, 1, 2, 3, 7, 8, 9, 10], "them": [0, 1, 2, 7], "jr": [0, 2], "sr": [0, 2], "ii": [0, 2], "iii": [0, 2], "stop": 0, "last": [0, 1, 7, 9], "street": [0, 1], "avenu": [0, 11], "blvd": 0, "circl": 0, "court": 0, "road": 0, "prefix": 0, "like": [0, 2, 7, 10], "m": [0, 1], "mr": 0, "ah": 0, "chines": 0, "condens": 0, "whitespac": [0, 7], "take": [0, 1, 2, 3, 7, 10], "leav": 0, "behind": 0, "arrai": [0, 2, 4, 8, 9], "namefrst_split": [0, 2], "namefrst_clean": [0, 2], "index": [0, 5], "select": [0, 1, 4, 6, 10, 12], "element": 0, "posit": [0, 1, 2, 4, 6], "second": [0, 1, 2, 11], "1": [0, 1, 2, 4, 7, 8, 9, 10, 12], "item": 0, "set": [0, 1, 2, 3, 6, 7, 10, 12], "Then": [0, 5], "0": [0, 1, 2, 7, 8, 9, 10, 12], "initi": [0, 1, 10], "probabl": [0, 2, 8], "middl": [0, 1], "namefrst_mid_init": [0, 1], "multipl": [0, 1, 2, 10], "otherwis": [0, 1, 9, 12], "known": 0, "recod": 0, "birthyr": [0, 2], "clean_birthyr": [0, 2, 3], "9999": [0, 2, 9], "1999": [0, 2], "9998": 0, "divid": 0, "int": [0, 1, 2, 3, 8], "integ": [0, 1, 2, 9], "result": [0, 1, 6, 9, 10, 
12], "instanc": [0, 8], "birthplac": 0, "detail": [0, 2, 10], "version": [0, 5, 12], "gener": [0, 1, 4, 6, 7, 10], "least": [0, 1], "signific": 0, "digit": 0, "we": [0, 1, 10, 12], "simpli": [0, 2], "drop": [0, 2, 10], "100": [0, 2, 12], "round": [0, 2], "lowest": 0, "whole": [0, 6], "number": [0, 1, 2, 7, 8, 10], "floor": 0, "function": [0, 1, 2, 6, 10], "bpl": [0, 1, 2], "bpl_root": 0, "condit": [0, 1, 2, 3, 4, 7], "logic": 0, "work": [0, 1, 2, 5, 7, 10, 12], "sql": [0, 1, 2, 3, 4, 7, 10], "express": [0, 1, 2], "claus": [0, 1], "if_valu": 0, "else_valu": 0, "race": [0, 1, 2, 9, 12], "ipum": [0, 6], "code": [0, 1, 2, 5], "categori": [0, 8], "get": [0, 1, 10], "down": [0, 6, 12], "nearest": 0, "produc": [0, 10], "relat": [0, 1, 2], "hundr": 0, "300": 0, "child": [0, 8], "household": [0, 4, 6, 8, 10, 12], "head": 0, "301": 0, "302": 0, "adopt": 0, "303": 0, "step": [0, 1, 2, 6], "usual": [0, 7, 12], "need": [0, 1, 2, 7, 10, 12], "2": [0, 1, 2, 3, 7, 8, 11, 12], "spous": 0, "3": [0, 1, 2, 5, 7, 8, 9, 12], "4": [0, 1, 8], "law": 0, "5": [0, 1, 2, 8, 9, 10, 12], "parent": [0, 1, 11], "6": [0, 2, 8, 9, 12], "7": [0, 1, 2, 8, 12], "sibl": 0, "12": [0, 5], "relate_div_100": [0, 1, 2], "page": [1, 2, 10], "comparison_featur": [1, 2, 7], "along": 1, "header": [1, 2, 3, 9, 11], "context": [1, 3, 9], "relatematch": [1, 2], "comparison_typ": [1, 2], "categor": [1, 2, 8, 9], "true": [1, 2, 3, 7, 9, 11, 12], "maximum": [1, 8], "jaro": [1, 9], "winkler": [1, 9], "find": [1, 7, 12], "greatest": 1, "among": 1, "cartesian": 1, "product": [1, 6, 12], "column": [1, 3, 4, 7, 9, 10, 11, 12], "namelast": [1, 2], "would": [1, 2, 12], "return": [1, 3, 8, 10], "four": 1, "namefrst_a": 1, "namefrst_b": 1, "namelast_b": 1, "namelast_a": 1, "maximum_jw": 1, "score": [1, 2, 7, 9], "namefrst_jw": [1, 2, 12], "geograph": 1, "filter": [1, 4, 7, 11], "major": [1, 10], "locat": [1, 2, 10], "befor": [1, 2, 3, 5, 7], "boundari": 1, "zero": 1, "jw_street": 1, "enum_dist": 1, "max": [1, 8, 10], 
"member": [1, 7], "neighborhood": 1, "surnam": 1, "related_individual_max_jw": 1, "namefrst_rel": 1, "assert": [1, 10], "NOT": 1, "distinct": 1, "f1": 1, "evalu": [1, 2, 6, 7, 8], "ani": [1, 2, 5, 8], "potenti": [1, 4, 7], "mismatch": 1, "queri": [1, 2], "fi": 1, "OR": 1, "mi0": 1, "mi1": 1, "THEN": 1, "els": [1, 2, 3], "first_init_col": 1, "namefrst_init": 1, "mid_init_col": 1, "namefrst_mid_init_2": 1, "f2": 1, "empti": 1, "null": [1, 2, 3], "AND": [1, 2], "individu": [1, 2, 7, 12], "mainli": 1, "caution": [1, 9], "flag": [1, 9, 10, 12], "f": [1, 10], "sp": 1, "m_caution": [1, 2, 9, 12], "mbpl": 1, "mother_birthyr": 1, "stepmom": 1, "momloc": 1, "comp_a": [1, 2], "comp_b": [1, 2], "comp_c": 1, "parent_step_chang": 1, "comp_d": 1, "check": [1, 10], "sign": 1, "boolean": [1, 2, 3, 11, 12], "form": [1, 7, 11], "cast": 1, "col": 1, "namelast_equal_as_int": 1, "namelast_clean": [1, 2, 3], "whether": [1, 2, 11], "join": [1, 11], "across": 1, "being": [1, 7], "exact": [1, 2], "namefrst_unstd": [1, 2], "present": [1, 2, 9], "nonzero": 1, "primarili": [1, 7], "indic": [1, 12], "kind": 1, "incompar": 1, "akin": 1, "miss": [1, 10], "see": [1, 2, 5, 10, 12], "univers": [1, 4, 7], "similar": 1, "fbpl_nomatch": 1, "fbpl": 1, "allow": [1, 2, 7, 12], "up": [1, 2, 10, 11], "sub": 1, "object": [1, 2, 6, 10], "document": [1, 8, 10, 12], "sp_caution": [1, 2, 12], "spouse_bpl": 1, "spouse_birthyr": 1, "durmarr": [1, 2], "new_marr": [1, 2], "street_jw": [1, 2, 12], "counti": 1, "statefip": [1, 2], "9": 1, "multipli": 1, "togeth": [1, 2], "after": [1, 2, 4, 8, 10], "float": [1, 2, 8], "comp": 1, "c": 1, "sploc": 1, "012": 1, "fals": [1, 2, 3, 4, 6, 10], "d": 1, "under": [1, 2], "specif": [1, 2, 10], "circumst": 1, "should": [1, 2, 8, 9, 10], "mid_init_match": 1, "either_1": 1, "nativ": 1, "either_0": 1, "gen": 1, "imm": [1, 2, 12], "immigr": 1, "look": [1, 10, 11], "foreign": 1, "born": 1, "sgen": [1, 2, 12], "rel": [1, 2, 12], "scala": 1, "determin": [1, 7], "greater": [1, 5], 
"jw_threshold": 1, "less": [1, 2], "age_threshold": 1, "sex": [1, 2, 11], "sampl": 1, "related_individual_row": 1, "unrel": 1, "depend": [1, 2, 5, 12], "name_col": 1, "birthyr_col": 1, "namefrst_related_row": 1, "replaced_birthyr": [1, 2, 3], "extra": 1, "children": 1, "who": 1, "base": [1, 2, 7], "expect": 1, "count": [1, 10, 12], "suspect": [1, 6], "relate_col": 1, "histid_col": 1, "id": [1, 2], "birth": 1, "year_b": 1, "wa": [1, 12], "minimum": [1, 8], "accept": [1, 2, 12], "consid": [1, 8], "histid": [1, 2, 12], "1910": [1, 2, 12], "8": [1, 2, 5, 10], "rate": 1, "calcul": [1, 12], "percentag": 1, "seen": 1, "neighbor": 1, "meet": 1, "95": 1, "nbor": [1, 2, 12], "namelast_neighbor": 1, "05": [1, 2], "namelast_popularity_sum": 1, "namelast_popular": 1, "length": [1, 2, 9], "size": 1, "ab": 1, "diff": 1, "absolut": 1, "invalid": [1, 8], "instead": [1, 2, 5, 7], "marriag": 1, "durat": 1, "99": [1, 2], "placehold": 1, "unknown": 1, "exclud": 1, "those": [1, 2], "consider": 1, "byrdiff": [1, 2, 12], "mardurmatch": [1, 2], "14": 1, "minu": [1, 2], "subtract": 1, "geo": 1, "distanc": [1, 8], "lookup": 1, "tabl": [1, 2, 4, 7, 10, 12], "core": [1, 7, 10, 12], "dist_tabl": 1, "py": [1, 2], "There": [1, 2, 3, 7], "sever": [1, 6], "wai": [1, 5, 10], "file": [1, 4, 6, 7, 10, 11, 12], "kei": [1, 7, 10], "key_count": 1, "secondari": 1, "serv": 1, "back": 1, "primari": [1, 6], "doe": [1, 7, 12], "particularli": 1, "state": [1, 6], "much": [1, 7], "fewer": [1, 8], "combin": [1, 2, 3, 7], "thu": 1, "risk": 1, "fill": 1, "aren": 1, "ex": 1, "just": [1, 2, 10, 12], "even": 1, "though": 1, "distances_fil": 1, "path": [1, 2, 10, 11, 12], "table_nam": 1, "what": [1, 2, 10, 12], "onc": [1, 10], "loc_a": 1, "where": [1, 7, 10, 12], "come": 1, "loc_b": 1, "distance_col": 1, "source_column_a": 1, "sourc": [1, 4, 7, 10, 12], "source_column_b": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b_0": 1, "loc_b_1": 1, "secondary_key_count": 1, "backup": 1, "secondary_table_nam": 1, 
"secondary_distances_fil": 1, "secondary_source_column": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_distance_col": 1, "state_dist": 1, "state_distance_lookup": 1, "county_state_dist": 1, "csv": [1, 2, 7, 10, 11, 12], "statecode1": 1, "statecode2": 1, "dist": 1, "county_dist": [1, 2, 12], "county_distance_lookup": 1, "county_1900_1910_distances_km": 1, "from_icpsrctyi": 1, "to_icpsrctyi": 1, "from_statefip": 1, "to_statefip": 1, "distance_km": 1, "state_1900_1910_distances_km": 1, "fetch": 1, "neither": 1, "nor": 1, "mpre": 1, "m_namefrst": 1, "accord": 1, "niu": 1, "other": [1, 2, 12], "mfbplmatch": 1, "multi": 1, "search": 1, "special": 1, "simplifi": 1, "particular": [1, 2], "constraint": 1, "num_col": 1, "whose": 1, "templat": 1, "n": [1, 8, 9], "per": [1, 2, 8, 9, 10], "current": [1, 2, 10], "respect": [1, 7], "jw_col_templ": 1, "jw": 1, "pair": [1, 12], "equal_and_not_null_templ": 1, "final": [1, 2, 12], "comput": [1, 3, 7], "_namefrst": 1, "_bpl": 1, "_sex": 1, "25": 1, "nvl": 1, "sm_namefrst": 1, "sn_namefrst": 1, "sm_bpl": 1, "sn_bpl": 1, "sm_sex": 1, "sn_sex": 1, "pass": [1, 2, 7, 8], "flexibl": 1, "user": [1, 10], "write": [1, 10, 12], "own": [1, 2], "favor": 1, "reason": 1, "good": 1, "fallback": 1, "defin": [1, 7, 8, 9, 10], "spark": [1, 2, 5, 8, 9, 10, 12], "builtin": 1, "argument": [1, 10, 12], "namelast_jw_max": 1, "namelast1": 1, "namelast2": 1, "namelast3": 1, "abov": [1, 5], "extend": 1, "beyond": 1, "top": [1, 4], "level": [1, 4, 10], "everi": 1, "jw_f": [1, 2, 12], "father_namefrst": 1, "rais": [1, 3], "exponenti": 1, "squar": 1, "county_distance_squar": [1, 2, 12], "county_a": 1, "county_b": 1, "upper": 1, "gt": 1, "btwn": 1, "addl": 1, "var": [1, 2], "program": [1, 2, 7, 12], "report": [1, 4, 6, 10], "addl_var": 1, "check_val_expr": 1, "else_v": 1, "volumn": 1, "datasourc": [1, 2, 10], "yrimmig": 1, "immyear_diff": [1, 2, 9, 12], "includ": [1, 2, 7, 9, 10], "train": [1, 4, 6, 8, 10], "independent_var": [1, 2, 12], "config": [1, 
4, 7, 10, 12], "id_column": [1, 2], "_a": 1, "mult": 1, "exist": [1, 2, 10], "within": [1, 2, 6, 10, 11], "hh_train": [1, 2, 7, 10, 12], "hh": 1, "highest": [1, 2], "against": [1, 11], "ten": [1, 2], "tell": 2, "how": [2, 7], "descript": [2, 8, 10], "refer": 2, "here": [2, 7, 10, 12], "tutori": [2, 10], "script": [2, 6, 10], "discuss": 2, "readm": 2, "note": 2, "written": [2, 6], "toml": [2, 6, 10], "abl": 2, "json": [2, 10], "datasource_a": [2, 7], "datasource_b": [2, 7], "transform": [2, 4, 6, 7], "lowercase_strip": 2, "add_to_a": 2, "age_2": 2, "derived_from": 2, "expand_length": 2, "explod": [2, 7], "jaro_winkl": 2, "namelast_jw": [2, 12], "threshold": [2, 8, 12], "feature_nam": 2, "79": 2, "84": 2, "complex": 2, "machin": [2, 6, 7, 10, 12], "learn": [2, 6, 7, 10, 12], "probabilist": [2, 6], "drop_data_from_scored_match": 2, "us1900": 2, "us1900m_usa": 2, "p": 2, "parquet": [2, 7], "us1910": 2, "us1910m_usa": 2, "training_data_subset": 2, "serialp": 2, "rationalize_name_word": 2, "remove_qmark_hyphen": 2, "replace_apostroph": 2, "remove_suffix": 2, "remove_alternate_nam": 2, "condense_strip_whitespac": 2, "split": [2, 3, 7, 8, 9, 12], "namefrst_std": [2, 11], "array_index": 2, "bpl_orig": 2, "divide_by_int": 2, "get_floor": 2, "statefip_h": 2, "output_typ": 2, "substitution_column": [2, 7, 11], "join_column": [2, 11], "join_valu": [2, 11], "substitution_fil": [2, 11], "name_std": [2, 11], "male": [2, 11], "femal": [2, 11], "feature_select": [2, 3, 7], "input_column": [2, 3, 9], "output_column": [2, 3, 9], "sql_condit": 2, "namelast_bigram": 2, "bigram": [2, 4], "bpl_clean": 2, "bpl_str": 2, "washington": 2, "bpl2_str": 2, "53": 2, "region": [2, 12], "attach_vari": 2, "region_dict": 2, "col_to_join_on": 2, "col_to_add": 2, "null_fil": 2, "col_typ": 2, "potential_matches_univers": [2, 7], "birthyr_3": 2, "namefrst_std_jw": [2, 12], "75": [2, 8, 12], "comparis": 2, "post": [2, 7], "hh_comparison": [2, 7], "threshold_expr": 2, "fetch_a": 2, "sex_equ": 2, "equal": 
[2, 11], "relate_a": [2, 9], "pipeline_featur": [2, 7, 9], "sex_region_interact": 2, "transformer_typ": [2, 9], "interact": [2, 4, 7, 12], "relatetyp": [2, 9], "bucket": [2, 7], "hit": [2, 10, 12], "scale_data": [2, 12], "training_data": [2, 10], "dependent_var": [2, 12], "score_with_model": [2, 12], "use_training_data_featur": [2, 7, 12], "split_by_id_a": [2, 12], "decis": [2, 4, 8, 12], "drop_duplicate_with_threshold_ratio": [2, 12], "n_training_iter": [2, 7, 12], "output_suspicious_td": [2, 12], "param_grid": [2, 12], "model_paramet": [2, 7, 8, 12], "random_forest": [2, 12], "maxdepth": [2, 8, 12], "numtre": [2, 8, 12], "005": 2, "threshold_ratio": [2, 8, 12], "logistic_regress": [2, 12], "50": [2, 12], "65": 2, "80": 2, "chosen_model": [2, 8, 12], "prediction_col": 2, "predict": [2, 12], "hh_col": 2, "hh_training_data_1900_1910": 2, "probit": [2, 4], "go": [2, 10], "your": [2, 5, 7, 10, 12], "uniqu": 2, "identifi": [2, 6, 12], "full": [2, 7, 12], "short": 2, "alphanumer": 2, "convert_ints_to_long": 2, "automat": [2, 5, 7], "long": [2, 11], "especi": 2, "assum": 2, "schema": 2, "sometim": 2, "term": 2, "bigint": 2, "thing": 2, "my_fil": 2, "subset": [2, 11], "limit": 2, "extract": 2, "modifi": 2, "meant": 2, "usag": [2, 4, 10], "set_value_column_a": [2, 3], "liter": 2, "set_value_column_b": [2, 3], "iv": 2, "v": 2, "vi": 2, "vii": 2, "viii": 2, "namelast_clean_bigram": [2, 3], "fed": [2, 7], "prep": 2, "df": [2, 10], "men": 2, "newli": 2, "attempt": 2, "duplic": [2, 8], "row": 2, "conjuct": 2, "Will": 2, "conjunct": 2, "rang": [2, 9], "original_valu": 2, "plu": 2, "1870": 2, "expand": 2, "1867": 2, "1868": 2, "1869": 2, "1871": 2, "1872": 2, "1873": 2, "kept": 2, "keep": 2, "appropri": 2, "treat": [2, 9], "import": [2, 7, 10, 12], "dure": [2, 7], "hot": 2, "encod": [2, 3], "vector": [2, 9], "stage": 2, "well": 2, "upper_threshold": 2, "cannot": 2, "robust": 2, "ml": [2, 4, 8, 9], "typic": [2, 7], "leverag": 2, "api": [2, 6, 9], "piplin": 2, "regionf": 2, 
"sex_regionf_interact": 2, "immyear_caut": [2, 9], "myriad": 2, "explor": [2, 4, 6, 10], "part": [2, 7], "task": [2, 4, 6, 8, 12], "drop_duplicate_a": 2, "time": [2, 7, 10], "out": [2, 7, 12], "best": [2, 7], "smallest": 2, "possibl": 2, "ratio": [2, 8], "beta": [2, 8], "test": [2, 7, 12], "model_explor": [2, 10, 12], "hyper": [2, 6, 12], "paramet": [2, 6, 7, 8, 10, 12], "eval": 2, "skip": [2, 7], "apply_model": 2, "run_all_step": [2, 10, 12], "command": [2, 6, 10, 12], "try": 2, "creation": 2, "iter": 2, "scale": 2, "error": [2, 9], "1900": [2, 12], "about": [2, 10, 12], "1930": [2, 12], "1940": [2, 12], "fail": 2, "were": 2, "sure": [2, 5, 10], "scratch": 2, "although": 2, "know": 2, "haven": 2, "save": [2, 7, 12], "small": 2, "amount": 2, "process": [2, 6, 10], "repeatedli": 2, "help": [2, 7, 10], "neg": [2, 4, 6], "area": 2, "coverag": 2, "increas": [2, 9], "represent": [2, 7], "ensur": 2, "group": [2, 7], "a304bt": 2, "three": [2, 7], "b200": 2, "c201": 2, "d425": 2, "perform": [2, 6, 7, 11], "feature_import": [2, 7, 12], "coeffici": [2, 7], "enabl": [2, 7, 10], "srace": [2, 9, 12], "race_interacted_srac": [2, 9, 12], "hits2": [2, 12], "exact_mult": [2, 12], "ncount": [2, 3, 12], "ncount2": [2, 3, 12], "f_interacted_jw_f": [2, 12], "f_caution": [2, 12], "f_pre": [2, 12], "fbplmatch": [2, 12], "m_interacted_jw_m": [2, 9, 12], "jw_m": [2, 9, 12], "m_pre": [2, 9, 12], "mbplmatch": [2, 12], "sp_interacted_jw_sp": [2, 12], "jw_sp": [2, 12], "sp_pre": [2, 12], "mi": [2, 12], "fsoundex": [2, 12], "lsoundex": [2, 12], "oth": [2, 12], "imm_interacted_immyear_caut": [2, 12], "1900_1910_training_data_20191023": 2, "jw_max_a": 2, "jw_max_b": 2, "f1_match": 2, "f2_match": 2, "byrdifcat": 2, "racematch": 2, "bplmatch": 2, "imm_interacted_bplmatch": 2, "sexmatch": 2, "relatetype_interacted_relatematch": 2, "checkpoint": 3, "no_first_pad": 3, "don": 3, "prepend": 3, "namefrst_unstd_bigram": 3, "namelast_frst_bigram": 3, "namelast_clean_soundex": 3, "input_col": 3, 
"output_col": 3, "expon": 3, "introduct": 4, "overview": 4, "instal": 4, "pypi": 4, "preprocess": [4, 6, 10, 12], "model": [4, 6, 10], "run": [4, 5, 6, 7, 12], "librari": [4, 6], "mode": [4, 5, 12], "advanc": 4, "workflow": 4, "export": [4, 7, 10], "featur": [4, 6, 7, 8, 10], "reus": 4, "basic": 4, "map": [4, 7, 9], "substitut": [4, 7], "block": [4, 7], "comparison": [4, 7], "pipelin": 4, "ons": 4, "aggreg": 4, "union": 4, "soundex": 4, "power": 4, "regex": 4, "random": [4, 8], "forest": [4, 8], "logist": [4, 8], "regress": [4, 8], "tree": [4, 8], "gradient": [4, 8], "boost": [4, 8], "system": 5, "python": [5, 6, 10], "java": 5, "integr": 5, "apach": 5, "via": [5, 6], "pyspark": [5, 8, 9, 10], "packag": 5, "org": 5, "latest": 5, "pip": 5, "easiest": [5, 10], "through": [5, 7, 9, 10], "instruct": [5, 10], "But": 5, "clone": 5, "github": 5, "repositori": 5, "root": 5, "project": 5, "directori": [5, 10, 12], "develop": [5, 6], "e": 5, "dev": 5, "edit": 5, "made": 5, "built": 5, "tool": [5, 6], "line": [6, 10], "share": 6, "characterist": [6, 7], "correspond": [6, 7], "real": 6, "world": 6, "determinist": [6, 7], "rule": [6, 7], "algorithm": [6, 7], "At": [6, 7], "been": 6, "unit": 6, "census": 6, "hierarch": [6, 10], "structur": 6, "nest": 6, "howev": [6, 12], "tailor": 6, "ignor": 6, "common": [6, 7, 12], "highli": [6, 7], "languag": 6, "further": [6, 12], "broken": 6, "smaller": 6, "sequenc": 6, "linkrun": [6, 10], "prepar": [6, 7, 10], "research": 6, "experi": 6, "understand": 6, "tune": [6, 12], "relationship": 6, "varieti": 7, "normal": 7, "abbrevi": [7, 11], "regist": [7, 10], "datafram": [7, 10, 12], "request": 7, "classif": [7, 8], "metadata": 7, "introspect": 7, "ingest": 7, "inspect": 7, "mani": [7, 10], "aspect": [7, 10], "extens": 7, "longest": 7, "definit": 7, "reduc": 7, "drastic": 7, "improv": 7, "runtim": 7, "separ": 7, "total": 7, "potential_match": [7, 10], "satisfi": 7, "elig": 7, "reshap": 7, "thought": 7, "ahead": 7, "chosen": 7, "experiment": [7, 
10], "focus": 7, "demograph": 7, "moment": 7, "veri": [7, 12], "anyon": 7, "percent": 7, "remain": 7, "popul": 7, "pull": 7, "fix": 7, "width": 7, "crosswalk": 7, "construct": 7, "alpha": 8, "hyperparamet": [8, 12], "de": 8, "param": [8, 12], "label": 8, "doc": [8, 9], "commonli": 8, "explan": 8, "randomforestclassifi": 8, "depth": 8, "20": 8, "featuresubsetstrategi": 8, "node": 8, "auto": 8, "onethird": 8, "sqrt": 8, "log2": 8, "15": 8, "generalizedlinearregress": 8, "famili": 8, "binomi": 8, "85": [8, 10], "logisticregress": 8, "decisiontreeclassifi": 8, "mininstancespernod": 8, "caus": 8, "left": 8, "right": [8, 10], "discard": 8, "maxbin": 8, "bin": 8, "discret": 8, "continu": [8, 9, 12], "gbtclassifi": 8, "mother": 9, "point": [9, 12], "x": [9, 10], "y": 9, "hold": 9, "except": 9, "strictli": 9, "inf": 9, "explicitli": 9, "cover": 9, "doubl": 9, "outsid": 9, "job": 10, "high": 10, "class": 10, "handl": 10, "main": 10, "complet": 10, "access": [10, 12], "link_run": 10, "factori": 10, "sparkfactori": 10, "load_config": 10, "load_conf_fil": 10, "sparksess": 10, "now": 10, "let": 10, "load": 10, "our": 10, "my_conf": 10, "lr": 10, "prep_step": 10, "get_step": 10, "enumer": 10, "print": 10, "input_table_nam": 10, "output_table_nam": 10, "run_step": 10, "get_tabl": 10, "matches_df": 10, "hh_model_explor": 10, "method": [10, 12], "interfac": 10, "easili": 10, "conveni": 10, "adjust": 10, "set_loc": 10, "set_num_cor": 10, "set_executor_memori": 10, "5g": 10, "ll": 10, "dictionari": 10, "often": 10, "modul": 10, "pleas": 10, "reproduc": 10, "consol": 10, "cpu": 10, "h": 10, "executor_memori": [10, 12], "execute_task": 10, "execute_command": 10, "conf": [10, 12], "show": 10, "messag": 10, "exit": 10, "memori": 10, "executor": 10, "begin": 10, "execut": 10, "seri": 10, "excute_command": 10, "filepath": 10, "sai": 10, "fullcount_1870_1880": 10, "pattern": 10, "full_count_1870_1880": 10, "prompt": 10, "enter": 10, "text": 10, "unstabl": 10, "topic": 10, "analyz": [10, 12], 
"set_preexisting_t": 10, "x_persist": 10, "borrow_t": 10, "get_task": 10, "set_print_sql": 10, "x_sql": 10, "x_sqlf": 10, "ipython": 10, "showf": 10, "x_summari": 10, "desc": 10, "x_crosswalk": 10, "x_tab": 10, "q": [10, 12], "x_hh_tfam": 10, "x_tfam": 10, "drop_al": 10, "reload": 10, "x_hh_tfam_2a": 10, "x_tfam_raw": 10, "drop_all_prc": 10, "x_hh_tfam_2b": 10, "x_union": 10, "drop_all_temp": 10, "x_load": 10, "get_set": 10, "set_link_task": 10, "x_parquet_from_csv": 10, "organ": 10, "hierarchi": 10, "five": 10, "hh_match": 10, "someth": 10, "choic": 10, "preexist": 10, "prepped_df_a": 10, "prepped_df_b": 10, "raw_df_b": 10, "raw_df_a": 10, "training_featur": [10, 12], "scored_potential_match": 10, "potential_matches_prep": 10, "exploded_df_b": 10, "exploded_df_a": 10, "predicted_match": 10, "hh_training_featur": [10, 12], "hh_training_data": 10, "hh_predicted_match": 10, "hh_scored_potential_match": 10, "hh_potential_match": 10, "hh_blocked_match": 10, "hh_potential_matchs_prep": 10, "model_eval_training_vector": 10, "model_eval_training_data": 10, "model_eval_repeat_fp": 10, "model_eval_training_featur": 10, "model_eval_training_result": 10, "model_eval_repeat_fn": 10, "hh_model_eval_training_vector": 10, "hh_model_eval_repeat_fp": 10, "hh_model_eval_repeat_fn": 10, "hh_model_eval_training_result": 10, "hh_model_eval_training_featur": 10, "hh_model_eval_training_data": 10, "persist": 10, "hidden": 10, "intermedi": 10, "yet": 10, "databas": 10, "tablenam": 10, "istemporari": 10, "task_nam": 10, "num": 10, "finish": 10, "put": [10, 12], "launch": [10, 12], "my": [10, 12], "subhead": 11, "suppli": 11, "regex_word_replac": 11, "variant": 11, "av": 11, "7th": 11, "swap": 11, "still": 11, "anywher": 11, "proceed": 11, "street_unstd": 11, "dir": 11, "substitutions_street_abbrev": 11, "span": 12, "1920": 12, "deriv": 12, "necessari": 12, "scenario": 12, "copi": 12, "use_potential_matches_featur": 12, "full_count_1900_1910": 12, "50g": 12, "ask": 12, "arg": 12, "partit": 
12, "training_data_1900_1910_hlink_featur": 12, "might": 12, "shut": 12, "framework": 12, "etc": 12, "relev": 12, "matrix": 12, "implement": 12, "regular": 12, "training_data_1900_1910": 12, "weren": 12, "ident": 12, "manual": 12, "updat": 12, "isn": 12, "analysi": 12, "training_result": 12, "hh_training_result": 12, "1900_1910_training_result": 12, "repeat_fp": 12, "repeat_fn": 12, "hh_repeat_fp": 12, "hh_repeat_fn": 12, "1900_1910_potential_fp": 12, "1900_1910_potential_fn": 12, "prefer": 12, "ve": 12}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"column": [0, 2], "map": [0, 2], "basic": [0, 2], "usag": 0, "advanc": [0, 2, 12], "transform": [0, 1, 3, 9], "add_to_a": 0, "concat_to_a": 0, "concat_to_b": 0, "lowercase_strip": 0, "rationalize_name_word": 0, "remove_qmark_hyphen": 0, "remove_punctu": 0, "replace_apostroph": 0, "remove_alternate_nam": 0, "remove_suffix": 0, "remove_stop_word": 0, "remove_prefix": 0, "condense_strip_whitespac": 0, "remove_one_letter_nam": 0, "split": 0, "array_index": 0, "substr": 0, "divide_by_int": 0, "when_valu": 0, "get_floor": 0, "comparison": [1, 2], "type": [1, 9], "add": 1, "ons": 1, "aggreg": 1, "featur": [1, 2, 3, 9, 12], "household": [1, 2, 7], "maximum_jaro_winkl": 1, "jaro_winkl": 1, "jaro_winkler_street": 1, "max_jaro_winkl": 1, "equal": 1, "f1_match": 1, "f2_match": 1, "not_equ": 1, "equals_as_int": 1, "all_equ": 1, "not_zero_and_not_equ": 1, "time": 1, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "any_equ": 1, "either_are_1": 1, "either_are_0": 1, "second_gen_imm": 1, "rel_jaro_winkl": 1, "extra_children": 1, "jaro_winkler_r": 1, "sum": 1, "length_b": 1, "abs_diff": 1, "b_minus_a": 1, "geo_dist": 1, "fetch_a": 1, "fetch_b": 1, "present_both_year": 1, "neither_are_nul": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_and_equal_categorical_in_univers": 1, "multi_jaro_winkler_search": 1, "sql_condit": [1, 3], "alia": 1, "power": [1, 
3], "threshold": 1, "lower_threshold": 1, "upper_threshold": 1, "gt_threshold": 1, "btwn_threshold": 1, "look_at_addl_var": 1, "hit": 1, "hits2": 1, "exact_mult": 1, "jw_max_a": 1, "jw_max_b": 1, "configur": [2, 4, 7], "config": 2, "file": 2, "top": 2, "level": 2, "data": [2, 11, 12], "sourc": [2, 5], "filter": 2, "substitut": [2, 11], "select": [2, 3], "potenti": [2, 12], "match": [2, 7], "univers": 2, "block": 2, "pipelin": [2, 9], "gener": [2, 9, 12], "train": [2, 7, 12], "model": [2, 7, 8, 12], "bigram": 3, "arrai": 3, "union": 3, "soundex": 3, "welcom": 4, "hlink": [4, 10], "": 4, "document": 4, "api": 4, "instal": 5, "requir": 5, "from": 5, "pypi": 5, "introduct": 6, "overview": [6, 7], "link": [7, 10, 12], "task": [7, 10], "preprocess": 7, "step": [7, 10], "relat": 7, "section": 7, "explor": [7, 12], "report": 7, "random_forest": 8, "probit": 8, "logistic_regress": 8, "decision_tre": 8, "gradient_boosted_tre": 8, "interact": [9, 10], "bucket": 9, "run": 10, "us": 10, "librari": 10, "mode": 10, "start": 10, "program": 10, "exampl": [10, 12], "workflow": [10, 12], "1": 11, "tabl": 11, "regex": 11, "word": 11, "replac": 11, "export": 12, "after": 12, "reus": 12, "differ": 12, "year": 12, "ml": 12, "list": 12, "fals": 12, "posit": 12, "neg": 12, "fp": 12, "fn": 12}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 60}, "alltitles": {"Column Mappings": [[0, "column-mappings"], [2, "column-mappings"]], "Basic Usage": [[0, "basic-usage"]], "Advanced Usage": [[0, "advanced-usage"]], "Transforms": [[0, "transforms"]], "add_to_a": [[0, "add-to-a"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "lowercase_strip": [[0, "lowercase-strip"]], "rationalize_name_words": [[0, "rationalize-name-words"]], 
"remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_punctuation": [[0, "remove-punctuation"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_suffixes": [[0, "remove-suffixes"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_prefixes": [[0, "remove-prefixes"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "split": [[0, "split"]], "array_index": [[0, "array-index"]], "mapping": [[0, "mapping"]], "substring": [[0, "substring"]], "divide_by_int": [[0, "divide-by-int"]], "when_value": [[0, "when-value"]], "get_floor": [[0, "get-floor"]], "Comparison types, transform add-ons, aggregate features, and household aggregate features": [[1, "comparison-types-transform-add-ons-aggregate-features-and-household-aggregate-features"]], "Comparison types": [[1, "comparison-types"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "equals": [[1, "equals"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "not_equals": [[1, "not-equals"]], "equals_as_int": [[1, "equals-as-int"]], "all_equals": [[1, "all-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "and": [[1, "and"]], "times": [[1, "times"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "any_equals": [[1, "any-equals"]], "either_are_1": [[1, "either-are-1"]], "either_are_0": [[1, "either-are-0"]], "second_gen_imm": [[1, "second-gen-imm"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "extra_children": [[1, "extra-children"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "sum": [[1, "sum"]], "length_b": [[1, 
"length-b"]], "abs_diff": [[1, "abs-diff"]], "b_minus_a": [[1, "b-minus-a"]], "geo_distance": [[1, "geo-distance"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "present_both_years": [[1, "present-both-years"]], "neither_are_null": [[1, "neither-are-null"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "sql_condition": [[1, "sql-condition"], [3, "sql-condition"]], "Feature add-ons": [[1, "feature-add-ons"]], "alias": [[1, "alias"]], "power": [[1, "power"], [3, "power"]], "threshold": [[1, "threshold"]], "lower_threshold": [[1, "lower-threshold"]], "upper_threshold": [[1, "upper-threshold"]], "gt_threshold": [[1, "gt-threshold"]], "btwn_threshold": [[1, "btwn-threshold"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "Aggregate Features": [[1, "aggregate-features"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "exact_mult": [[1, "exact-mult"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "Configuration": [[2, "configuration"]], "Basic Config File": [[2, "basic-config-file"]], "Advanced Config File": [[2, "advanced-config-file"]], "Top level configs": [[2, "top-level-configs"]], "Data sources": [[2, "data-sources"]], "Filter": [[2, "filter"]], "Substitution Columns": [[2, "substitution-columns"]], "Feature Selections": [[2, "feature-selections"]], "Potential Matches Universe": [[2, "potential-matches-universe"]], "Blocking": [[2, "blocking"]], "Comparisons": [[2, "comparisons"]], "Household Comparisons": [[2, "household-comparisons"]], "Comparison Features": [[2, "comparison-features"]], "Pipeline-generated Features": [[2, "pipeline-generated-features"]], "Training and models": [[2, "training-and-models"]], 
"Household training and models": [[2, "household-training-and-models"]], "Feature Selection transforms": [[3, "feature-selection-transforms"]], "bigrams": [[3, "bigrams"]], "array": [[3, "array"]], "union": [[3, "union"]], "soundex": [[3, "soundex"]], "Installation": [[5, "installation"]], "Requirements": [[5, "requirements"]], "Installing from PyPI": [[5, "installing-from-pypi"]], "Installing from source": [[5, "installing-from-source"]], "Introduction": [[6, "introduction"]], "Overview": [[6, "overview"], [7, "overview"], [7, "id1"], [7, "id4"], [7, "id7"], [7, "id10"], [7, "id13"]], "Models": [[8, "models"]], "random_forest": [[8, "random-forest"]], "probit": [[8, "probit"]], "logistic_regression": [[8, "logistic-regression"]], "decision_tree": [[8, "decision-tree"]], "gradient_boosted_trees": [[8, "gradient-boosted-trees"]], "Pipeline generated features": [[9, "pipeline-generated-features"]], "Transformer types": [[9, "transformer-types"]], "interaction": [[9, "interaction"]], "bucketizer": [[9, "bucketizer"]], "Running hlink": [[10, "running-hlink"]], "Using hlink as a Library": [[10, "using-hlink-as-a-library"]], "Interactive Mode": [[10, "interactive-mode"]], "Starting the program": [[10, "starting-the-program"]], "Running Linking Tasks and Steps": [[10, "running-linking-tasks-and-steps"]], "Example interactive mode workflow": [[10, "example-interactive-mode-workflow"]], "Substitutions": [[11, "substitutions"]], "1:1 substitution by data table": [[11, "substitution-by-data-table"]], "Substitution by regex word replace": [[11, "substitution-by-regex-word-replace"]], "Advanced Workflow Examples": [[12, "advanced-workflow-examples"]], "Export training data after generating features to reuse in different linking years": [[12, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Example training data export with generated ML features": [[12, "example-training-data-export-with-generated-ml-features"]], "ML model exploration and 
export of lists of potential false positives/negatives in training data": [[12, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Example model exploration and FP/FN export workflow": [[12, "example-model-exploration-and-fp-fn-export-workflow"]], "Welcome to hlink\u2019s documentation!": [[4, "welcome-to-hlink-s-documentation"]], "Configuration API": [[4, "configuration-api"], [4, null]], "Link Tasks": [[7, "link-tasks"]], "Preprocessing": [[7, "preprocessing"]], "Task steps": [[7, "task-steps"], [7, "id2"], [7, "id5"], [7, "id8"], [7, "id11"], [7, "id14"]], "Related Configuration Sections": [[7, "related-configuration-sections"], [7, "id3"], [7, "id6"], [7, "id9"], [7, "id12"], [7, "id15"]], "Training and Household Training": [[7, "training-and-household-training"]], "Matching": [[7, "matching"]], "Household Matching": [[7, "household-matching"]], "Model Exploration and Household Model Exploration": [[7, "model-exploration-and-household-model-exploration"]], "Reporting": [[7, "reporting"]]}, "indexentries": {}})
\ No newline at end of file
diff --git a/sphinx-docs/index.rst b/sphinx-docs/index.rst
index 2c9a76e..efdb528 100644
--- a/sphinx-docs/index.rst
+++ b/sphinx-docs/index.rst
@@ -23,7 +23,7 @@ Configuration API
    :maxdepth: 2
    :caption: Configuration API
 
-   Column Mapping <column_mappings.md>
+   Column Mappings <column_mappings.md>
    Comparison Types <comparison_types.md>
    Feature Selection <feature_selection_transforms.md>
    Pipeline Features <pipeline_features.md>
diff --git a/sphinx-docs/link_tasks.md b/sphinx-docs/link_tasks.md
index 34e27ba..dc201b7 100644
--- a/sphinx-docs/link_tasks.md
+++ b/sphinx-docs/link_tasks.md
@@ -17,7 +17,7 @@ datasets.
 ### Related Configuration Sections
 
 * The [`datasource_a` and `datasource_b`](config.html#data-sources) sections specify where to find the input data.
-* [```column_mappings```](column_mapping_transforms.html#column-mapping-transforms),
+* [`column_mappings`](column_mappings.html#column-mappings),
 [`feature_selections`](feature_selection_transforms.html#feature-selection-transforms),
 and [`substitution_columns`](substitutions.html#substitutions) may all be used to define transformations on the input data.
 * The [`filter`](config.html#filter) section may be used to filter some records out of the input data