diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 4e42786..47021fc 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -1,6 +1,13 @@
 name: HLink Docker CI
 
-on: [pull_request, push]
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
 
 env:
   HLINK_TAG: hlink:githubactions
@@ -28,9 +35,6 @@ jobs:
     - name: Check formatting with black
       run: docker run $HLINK_TAG-${{ matrix.python_version}} black --check .
       
-    - name: Lint with flake8
-      run: docker run $HLINK_TAG-${{ matrix.python_version}} flake8 --count .
-      
     - name: Test
       run: docker run $HLINK_TAG-${{ matrix.python_version}} pytest
     
diff --git a/doc/developer.md b/doc/developer.md
index 1893703..3d82d96 100644
--- a/doc/developer.md
+++ b/doc/developer.md
@@ -19,9 +19,23 @@ To set up a copy of this project for development,
 
 ## Running Tests
 
-To run the project's test suite, run `pytest` in the root project directory. Running all of the tests
-can take a while, depending on your computer's hardware and setup. To run a subset of tests that test some but not
-all of the core features, try `pytest -m quickcheck`. These tests should run much more quickly.
+To run the project's test suite, run `pytest` in the root project directory.
+Running all of the tests can take a while, depending on your computer's
+hardware and setup. If you are working on a particular bug or feature, there
+are several good ways to run only the tests that interest you.
+See the pytest documentation on
+[specifying which tests to run](https://docs.pytest.org/en/latest/how-to/usage.html#specifying-which-tests-to-run).
+
+In particular, the `-k` option is helpful for running only the tests whose
+names match a given expression, like this:
+
+```
+pytest -k "lightgbm or xgboost"
+```
+
+The GitHub Actions workflow runs all of the tests on each push or pull request
+to the main branch, and it can also be triggered manually. It runs the tests on
+several versions of Python and in several different Python environments.
 
 ## Building the Scala Jar
 
diff --git a/docs/_sources/substitutions.md.txt b/docs/_sources/substitutions.md.txt
index 93c9947..eb76c45 100644
--- a/docs/_sources/substitutions.md.txt
+++ b/docs/_sources/substitutions.md.txt
@@ -9,7 +9,11 @@ You must supply a substitution file and either specify `regex_word_replace=true`
 
 ## 1:1 substitution by data table
 
-Performs a 1:1 replacement on a filtered subset of the data table.  If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file.  Used to replace variant name forms with standardized name forms, filtering on sex.
+Performs a 1:1 replacement on a filtered subset of the data table.  If the
+input column data equals a value in the second column of the substitution file,
+it is replaced with the data in the first column of the substitution file.
+Used to replace variant name forms with standardized name forms, filtering on
+a column like sex, which may affect common names.
 
 * Attributes:
   * `join_column` -- Type: `string`.  Column to filter input data on.
diff --git a/docs/_static/alabaster.css b/docs/_static/alabaster.css
index e3174bf..7e75bf8 100644
--- a/docs/_static/alabaster.css
+++ b/docs/_static/alabaster.css
@@ -1,5 +1,3 @@
-@import url("basic.css");
-
 /* -- page layout ----------------------------------------------------------- */
 
 body {
@@ -160,8 +158,8 @@ div.sphinxsidebar input {
     font-size: 1em;
 }
 
-div.sphinxsidebar #searchbox input[type="text"] {
-    width: 160px;
+div.sphinxsidebar #searchbox {
+    margin: 1em 0;
 }
 
 div.sphinxsidebar .search > div {
@@ -263,10 +261,6 @@ div.admonition p.last {
     margin-bottom: 0;
 }
 
-div.highlight {
-    background-color: #fff;
-}
-
 dt:target, .highlight {
     background: #FAF3E8;
 }
@@ -454,7 +448,7 @@ ul, ol {
 }
 
 pre {
-    background: #EEE;
+    background: unset;
     padding: 7px 30px;
     margin: 15px 0px;
     line-height: 1.3em;
@@ -485,15 +479,15 @@ a.reference {
     border-bottom: 1px dotted #004B6B;
 }
 
+a.reference:hover {
+    border-bottom: 1px solid #6D4100;
+}
+
 /* Don't put an underline on images */
 a.image-reference, a.image-reference:hover {
     border-bottom: none;
 }
 
-a.reference:hover {
-    border-bottom: 1px solid #6D4100;
-}
-
 a.footnote-reference {
     text-decoration: none;
     font-size: 0.7em;
@@ -509,68 +503,7 @@ a:hover tt, a:hover code {
     background: #EEE;
 }
 
-
-@media screen and (max-width: 870px) {
-
-    div.sphinxsidebar {
-    	display: none;
-    }
-
-    div.document {
-       width: 100%;
-
-    }
-
-    div.documentwrapper {
-    	margin-left: 0;
-    	margin-top: 0;
-    	margin-right: 0;
-    	margin-bottom: 0;
-    }
-
-    div.bodywrapper {
-    	margin-top: 0;
-    	margin-right: 0;
-    	margin-bottom: 0;
-    	margin-left: 0;
-    }
-
-    ul {
-    	margin-left: 0;
-    }
-
-	li > ul {
-        /* Matches the 30px from the "ul, ol" selector above */
-		margin-left: 30px;
-	}
-
-    .document {
-    	width: auto;
-    }
-
-    .footer {
-    	width: auto;
-    }
-
-    .bodywrapper {
-    	margin: 0;
-    }
-
-    .footer {
-    	width: auto;
-    }
-
-    .github {
-        display: none;
-    }
-
-
-
-}
-
-
-
-@media screen and (max-width: 875px) {
+@media screen and (max-width: 940px) {
 
     body {
         margin: 0;
@@ -580,12 +513,16 @@ a:hover tt, a:hover code {
     div.documentwrapper {
         float: none;
         background: #fff;
+        margin-left: 0;
+        margin-top: 0;
+        margin-right: 0;
+        margin-bottom: 0;
     }
 
     div.sphinxsidebar {
         display: block;
         float: none;
-        width: 102.5%;
+        width: unset;
         margin: 50px -30px -20px -30px;
         padding: 10px 20px;
         background: #333;
@@ -620,8 +557,14 @@ a:hover tt, a:hover code {
 
     div.body {
         min-height: 0;
+        min-width: auto; /* fixes width on small screens, breaks .hll */
         padding: 0;
     }
+    
+    .hll {
+        /* "fixes" the breakage */
+        width: max-content;
+    }
 
     .rtd_doc_footer {
         display: none;
@@ -635,13 +578,18 @@ a:hover tt, a:hover code {
         width: auto;
     }
 
-    .footer {
-        width: auto;
-    }
-
     .github {
         display: none;
     }
+
+    ul {
+        margin-left: 0;
+    }
+
+    li > ul {
+       /* Matches the 30px from the "ul, ol" selector above */
+        margin-left: 30px;
+    }
 }
 
 
@@ -705,4 +653,11 @@ nav#breadcrumbs li+li:before {
     div.related {
         display: none;
     }
+}
+
+img.github  {
+    position: absolute;
+    top: 0;
+    border: 0;
+    right: 0;
 }
\ No newline at end of file
diff --git a/docs/_static/github-banner.svg b/docs/_static/github-banner.svg
new file mode 100644
index 0000000..c47d9dc
--- /dev/null
+++ b/docs/_static/github-banner.svg
@@ -0,0 +1,5 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="80" height="80" viewBox="0 0 250 250" fill="#fff">
+    <path d="M0 0l115 115h15l12 27 108 108V0z" fill="#151513"/>
+    <path d="M128 109c-15-9-9-19-9-19 3-7 2-11 2-11-1-7 3-2 3-2 4 5 2 11 2 11-3 10 5 15 9 16"/>
+    <path d="M115 115s4 2 5 0l14-14c3-2 6-3 8-3-8-11-15-24 2-41 5-5 10-7 16-7 1-2 3-7 12-11 0 0 5 3 7 16 4 2 8 5 12 9s7 8 9 12c14 3 17 7 17 7-4 8-9 11-11 11 0 6-2 11-7 16-16 16-30 10-41 2 0 3-1 7-5 11l-12 11c-1 1 1 5 1 5z"/>
+</svg>
diff --git a/docs/column_mappings.html b/docs/column_mappings.html
index 623d185..c191199 100644
--- a/docs/column_mappings.html
+++ b/docs/column_mappings.html
@@ -7,7 +7,8 @@
 
     <title>Column Mappings &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -369,7 +370,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -403,16 +413,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -430,7 +430,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/column_mappings.md.txt"
diff --git a/docs/comparison_features.html b/docs/comparison_features.html
index ebbfbda..d251a28 100644
--- a/docs/comparison_features.html
+++ b/docs/comparison_features.html
@@ -7,7 +7,8 @@
 
     <title>Comparison Features &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -1267,7 +1268,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -1302,16 +1312,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -1329,7 +1329,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/comparison_features.md.txt"
diff --git a/docs/comparisons.html b/docs/comparisons.html
index 1f214d4..244760e 100644
--- a/docs/comparisons.html
+++ b/docs/comparisons.html
@@ -7,7 +7,8 @@
 
     <title>Comparisons &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -164,7 +165,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -198,16 +208,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -225,7 +225,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/comparisons.md.txt"
diff --git a/docs/config.html b/docs/config.html
index 84e7b11..07bb31c 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -7,7 +7,8 @@
 
     <title>Configuration &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -912,7 +913,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul class="current">
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -959,16 +969,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -986,7 +986,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/config.md.txt"
diff --git a/docs/feature_selection_transforms.html b/docs/feature_selection_transforms.html
index fe86f9a..4a58650 100644
--- a/docs/feature_selection_transforms.html
+++ b/docs/feature_selection_transforms.html
@@ -7,7 +7,8 @@
 
     <title>Feature Selection Transforms &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -184,7 +185,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -221,16 +231,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -248,7 +248,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/feature_selection_transforms.md.txt"
diff --git a/docs/genindex.html b/docs/genindex.html
index e3f4867..3de05ad 100644
--- a/docs/genindex.html
+++ b/docs/genindex.html
@@ -6,7 +6,8 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Index &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -52,7 +53,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -79,16 +89,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -106,7 +106,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
     </div>
 
diff --git a/docs/index.html b/docs/index.html
index fa2a68c..4d8f405 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -7,7 +7,8 @@
 
     <title>Welcome to hlink’s documentation! &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -156,7 +157,16 @@ <h1 class="logo"><a href="#">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -184,16 +194,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -211,7 +211,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/index.rst.txt"
diff --git a/docs/installation.html b/docs/installation.html
index 1d241e5..6cd06c5 100644
--- a/docs/installation.html
+++ b/docs/installation.html
@@ -7,7 +7,8 @@
 
     <title>Installation &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -75,7 +76,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul class="current">
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Installation</a><ul>
@@ -109,16 +119,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -136,7 +136,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/installation.md.txt"
diff --git a/docs/introduction.html b/docs/introduction.html
index 749729b..bf6781f 100644
--- a/docs/introduction.html
+++ b/docs/introduction.html
@@ -7,7 +7,8 @@
 
     <title>Introduction &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -89,7 +90,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul class="current">
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Introduction</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="#overview">Overview</a></li>
@@ -121,16 +131,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -148,7 +148,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/introduction.md.txt"
diff --git a/docs/link_tasks.html b/docs/link_tasks.html
index a739318..7a7ab55 100644
--- a/docs/link_tasks.html
+++ b/docs/link_tasks.html
@@ -7,7 +7,8 @@
 
     <title>Link Tasks &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -237,7 +238,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul class="current">
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -274,16 +284,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -301,7 +301,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/link_tasks.md.txt"
diff --git a/docs/models.html b/docs/models.html
index 96bf7de..9bfce4b 100644
--- a/docs/models.html
+++ b/docs/models.html
@@ -7,7 +7,8 @@
 
     <title>Models &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -150,7 +151,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -185,16 +195,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -212,7 +212,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/models.md.txt"
diff --git a/docs/pipeline_features.html b/docs/pipeline_features.html
index d4a50f8..931dac7 100644
--- a/docs/pipeline_features.html
+++ b/docs/pipeline_features.html
@@ -7,7 +7,8 @@
 
     <title>Pipeline generated features &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -99,7 +100,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -131,16 +141,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -158,7 +158,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/pipeline_features.md.txt"
diff --git a/docs/running_the_program.html b/docs/running_the_program.html
index 5c066c4..085b800 100644
--- a/docs/running_the_program.html
+++ b/docs/running_the_program.html
@@ -7,7 +7,8 @@
 
     <title>Running hlink &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -285,7 +286,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul class="current">
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -318,16 +328,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -345,7 +345,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/running_the_program.md.txt"
diff --git a/docs/search.html b/docs/search.html
index 6e582ed..30fa50f 100644
--- a/docs/search.html
+++ b/docs/search.html
@@ -6,7 +6,8 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Search &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
@@ -123,7 +124,7 @@ <h3>Related Topics</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
     </div>
 
diff --git a/docs/searchindex.js b/docs/searchindex.js
index 149159e..01a8b75 100644
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"1:1 substitution by data table": [[12, "substitution-by-data-table"]], "Advanced Config File": [[3, "advanced-config-file"]], "Advanced Usage": [[0, "advanced-usage"]], "Advanced Workflow Examples": [[13, null]], "Aggregate Features": [[1, "aggregate-features"]], "Basic Config File": [[3, "basic-config-file"]], "Basic Usage": [[0, "basic-usage"]], "Blocking": [[3, "blocking"]], "Column Mappings": [[0, null], [3, "column-mappings"]], "Comparison Features": [[1, null], [3, "comparison-features"]], "Comparison Types": [[1, "comparison-types"], [2, "comparison-types"]], "Comparisons": [[2, null], [3, "comparisons"]], "Configuration": [[3, null]], "Configuration API": [[5, "configuration-api"], [5, null]], "Data sources": [[3, "data-sources"]], "Defining Multiple Comparisons": [[2, "defining-multiple-comparisons"]], "Example interactive mode workflow": [[11, "example-interactive-mode-workflow"]], "Example model exploration and FP/FN export workflow": [[13, "example-model-exploration-and-fp-fn-export-workflow"]], "Example training data export with generated ML features": [[13, "example-training-data-export-with-generated-ml-features"]], "Export training data after generating features to reuse in different linking years": [[13, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Feature Selection Transforms": [[4, null]], "Feature Selections": [[3, "feature-selections"]], "Feature add-ons": [[1, "feature-add-ons"]], "Filter": [[3, "filter"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "Household Comparisons": [[3, "household-comparisons"]], "Household Matching": [[8, "household-matching"]], "Household training and models": [[3, "household-training-and-models"]], "Installation": [[6, null]], "Installing from PyPI": [[6, "installing-from-pypi"]], "Installing from source": [[6, "installing-from-source"]], "Interactive Mode": [[11, "interactive-mode"]], "Introduction": [[7, 
null]], "Link Tasks": [[8, null]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[13, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Matching": [[8, "matching"]], "Model Exploration and Household Model Exploration": [[8, "model-exploration-and-household-model-exploration"]], "Models": [[9, null]], "Multiple Comparisons": [[3, "multiple-comparisons"]], "Overview": [[2, "overview"], [7, "overview"], [8, "overview"], [8, "id1"], [8, "id4"], [8, "id7"], [8, "id10"], [8, "id13"]], "Pipeline generated features": [[10, null]], "Pipeline-generated Features": [[3, "pipeline-generated-features"]], "Potential Matches Universe": [[3, "potential-matches-universe"]], "Preprocessing": [[8, "preprocessing"]], "Related Configuration Sections": [[8, "related-configuration-sections"], [8, "id3"], [8, "id6"], [8, "id9"], [8, "id12"], [8, "id15"]], "Reporting": [[8, "reporting"]], "Requirements": [[6, "requirements"]], "Running Linking Tasks and Steps": [[11, "running-linking-tasks-and-steps"]], "Running hlink": [[11, null]], "Single Comparison": [[3, "single-comparison"]], "Starting the program": [[11, "starting-the-program"]], "Substitution Columns": [[3, "substitution-columns"]], "Substitution by regex word replace": [[12, "substitution-by-regex-word-replace"]], "Substitutions": [[12, null]], "Task steps": [[8, "task-steps"], [8, "id2"], [8, "id5"], [8, "id8"], [8, "id11"], [8, "id14"]], "Top level configs": [[3, "top-level-configs"]], "Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, 
"all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], "bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": [[1, "neither-are-null"]], 
"not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, "soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["column_mappings.md", 
"comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": 13, "1900_1910_potential_fp": 13, "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 11, 13], "It": [0, 1, 2, 3, 7, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], 
"accord": 1, "across": 1, "ad": [0, 1, 2, 3], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 11], "addl": 1, "addl_var": 1, "adjust": 11, "adopt": 0, "advanc": 5, "affect": 4, "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8, 13], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 10, 11, 13], "altern": [0, 3], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": 3, "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 9, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, 
"bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": 1, "c201": 3, "calcul": [1, 13], "call": 0, "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": 11, "classif": [8, 9], "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 13], "commonli": 9, "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, "convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": [7, 8], "could": [0, 
2, 3], "count": [1, 11, 13], "counti": [0, 1], "county_1900_1910_distances_km": 1, "county_a": 1, "county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": 3, "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 9, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 9, 11], "detail": [0, 3, 11], "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": 2, "directori": [6, 11, 13], "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, "either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 
1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 11], "encod": [3, 4], "end": [0, 1, 3, 4, 12], "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 10], "especi": 3, "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explan": 9, "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": 1, "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 5, 7, 11], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": 4, "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], "filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "foreign": 1, "forest": [5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 9, 10, 11, 13], 
"from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 9, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [3, 8, 11], "here": [2, 3, 8, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": 13, "hh_repeat_fp": 13, "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": 11, "highest": [1, 3], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": [9, 13], "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, "ignor": 7, "ii": [0, 3], 
"iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": 13, "implicitli": 2, "import": [3, 8, 11, 13], "improv": 8, "includ": [1, 3, 8, 10, 11], "incompar": 1, "increas": [3, 10], "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": 5, "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": 6, "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], "jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "lead": 0, "learn": [1, 2, 3, 7, 8, 11, 13], "least": [0, 1], "leav": 0, "left": 9, "length": [1, 3, 10], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "librari": [5, 7], "like": [0, 2, 3, 8, 11], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 5, 9, 11, 12], "liter": 3, "ll": 11, "load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, 
"locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 11], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 5, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 11, 13], "model_paramet": [3, 8, 9, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, "my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], 
"namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], "namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 11, 13], "neg": [3, 5, 7], "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": 8, "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 13], "ons": 5, "oper": [0, 1, 2, 3], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 13], "otherwis": [0, 1, 10, 13], "our": 11, "out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [3, 13], "output_table_nam": 11, "output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], "override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": 6, "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": [9, 13], 
"param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 12], "persist": 11, "person": [0, 1, 7], "pip": 6, "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 3, 5, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], "raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8], "recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, 
"regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": 13, "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": 13, "repeat_fp": 13, "repeatedli": 3, "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": [3, 8], "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 9, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": 1, "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 12], "secondari": 1, "secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 13], "see": [1, 3, 6, 11, 13], "seen": 1, "select": [0, 1, 5, 7, 11, 13], "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, 
"set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 13], "some": [0, 1, 2, 3, 4, 7, 8, 11], "someth": 11, "sometim": 3, "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "special": 1, "specif": [1, 3, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, "state_dist": 1, "state_distance_lookup": 1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": 0, "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 12], "strip": [0, 8], "structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 
9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "syntax": 2, "system": 6, "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 10, 11], "thu": 1, "time": [0, 3, 8, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 10, 12, 13], "try": 3, "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": 11, "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, "us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, 
"varieti": 8, "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8, 13], "version": [0, 6, 13], "vi": 3, "via": [6, 7], "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": 1, "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, "x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 11, 12, 13], "your": [2, 3, 4, 6, 8, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "any_equ": 1, "api": 5, "arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], 
"data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": 13, "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": 13, "fp": 13, "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "link": [8, 11, 13], "list": 13, "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": 13, "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": 13, "potenti": [3, 13], "power": [1, 4], "preprocess": 8, "present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, "remove_prefix": 0, "remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": 
[8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "year": 13}})
\ No newline at end of file
+Search.setIndex({"alltitles": {"1:1 substitution by data table": [[12, "substitution-by-data-table"]], "Advanced Config File": [[3, "advanced-config-file"]], "Advanced Usage": [[0, "advanced-usage"]], "Advanced Workflow Examples": [[13, null]], "Aggregate Features": [[1, "aggregate-features"]], "Basic Config File": [[3, "basic-config-file"]], "Basic Usage": [[0, "basic-usage"]], "Blocking": [[3, "blocking"]], "Column Mappings": [[0, null], [3, "column-mappings"]], "Comparison Features": [[1, null], [3, "comparison-features"]], "Comparison Types": [[1, "comparison-types"], [2, "comparison-types"]], "Comparisons": [[2, null], [3, "comparisons"]], "Configuration": [[3, null]], "Configuration API": [[5, "configuration-api"], [5, null]], "Data sources": [[3, "data-sources"]], "Defining Multiple Comparisons": [[2, "defining-multiple-comparisons"]], "Example interactive mode workflow": [[11, "example-interactive-mode-workflow"]], "Example model exploration and FP/FN export workflow": [[13, "example-model-exploration-and-fp-fn-export-workflow"]], "Example training data export with generated ML features": [[13, "example-training-data-export-with-generated-ml-features"]], "Export training data after generating features to reuse in different linking years": [[13, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Feature Selection Transforms": [[4, null]], "Feature Selections": [[3, "feature-selections"]], "Feature add-ons": [[1, "feature-add-ons"]], "Filter": [[3, "filter"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "Household Comparisons": [[3, "household-comparisons"]], "Household Matching": [[8, "household-matching"]], "Household training and models": [[3, "household-training-and-models"]], "Installation": [[6, null]], "Installing from PyPI": [[6, "installing-from-pypi"]], "Installing from source": [[6, "installing-from-source"]], "Interactive Mode": [[11, "interactive-mode"]], "Introduction": [[7, 
null]], "Link Tasks": [[8, null]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[13, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Matching": [[8, "matching"]], "Model Exploration and Household Model Exploration": [[8, "model-exploration-and-household-model-exploration"]], "Models": [[9, null]], "Multiple Comparisons": [[3, "multiple-comparisons"]], "Overview": [[2, "overview"], [7, "overview"], [8, "overview"], [8, "id1"], [8, "id4"], [8, "id7"], [8, "id10"], [8, "id13"]], "Pipeline generated features": [[10, null]], "Pipeline-generated Features": [[3, "pipeline-generated-features"]], "Potential Matches Universe": [[3, "potential-matches-universe"]], "Preprocessing": [[8, "preprocessing"]], "Related Configuration Sections": [[8, "related-configuration-sections"], [8, "id3"], [8, "id6"], [8, "id9"], [8, "id12"], [8, "id15"]], "Reporting": [[8, "reporting"]], "Requirements": [[6, "requirements"]], "Running Linking Tasks and Steps": [[11, "running-linking-tasks-and-steps"]], "Running hlink": [[11, null]], "Single Comparison": [[3, "single-comparison"]], "Starting the program": [[11, "starting-the-program"]], "Substitution Columns": [[3, "substitution-columns"]], "Substitution by regex word replace": [[12, "substitution-by-regex-word-replace"]], "Substitutions": [[12, null]], "Task steps": [[8, "task-steps"], [8, "id2"], [8, "id5"], [8, "id8"], [8, "id11"], [8, "id14"]], "Top level configs": [[3, "top-level-configs"]], "Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, 
"all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], "bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": [[1, "neither-are-null"]], 
"not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, "soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["column_mappings.md", 
"comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": 13, "1900_1910_potential_fp": 13, "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 11, 13], "It": [0, 1, 2, 3, 7, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], 
"accord": 1, "across": 1, "ad": [0, 1, 2, 3], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 11], "addl": 1, "addl_var": 1, "adjust": 11, "adopt": 0, "advanc": 5, "affect": [4, 12], "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8, 13], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 10, 11, 13], "altern": [0, 3], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": 3, "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 9, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, 
"bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": 1, "c201": 3, "calcul": [1, 13], "call": 0, "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": 11, "classif": [8, 9], "classifi": [], "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 12, 13], "commonli": 9, "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, "convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": 
[7, 8], "could": [0, 2, 3], "count": [1, 11, 13], "counterpart": [], "counti": [0, 1], "county_1900_1910_distances_km": 1, "county_a": 1, "county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": 3, "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 9, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 9, 11], "detail": [0, 3, 11], "detect": [], "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": 2, "directori": [6, 11, 13], "disabl": [], "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, 
"either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 11], "encod": [3, 4], "encount": [], "end": [0, 1, 3, 4, 12], "enorm": [], "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 10], "especi": 3, "eta": [], "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explan": 9, "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": 1, "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 5, 7, 11], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": 4, "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], "filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "force_row_wis": [], "foreign": 1, "forest": 
[5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 9, 10, 11, 13], "from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gamma": [], "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 9, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [3, 8, 11], "here": [2, 3, 8, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": 13, "hh_repeat_fp": 13, "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": 11, "highest": [1, 3], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": [9, 13], "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 
11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, "ignor": 7, "ii": [0, 3], "iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": 13, "implicitli": 2, "import": [3, 8, 11, 13], "improv": 8, "includ": [1, 3, 8, 10, 11], "incompar": 1, "increas": [3, 10], "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": 5, "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": 6, "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], "jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "larger": [], "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "le": [], "lead": 0, "learn": [1, 2, 3, 7, 8, 11, 13], "learningr": [], "least": [0, 1], "leav": 0, "left": 9, "length": [1, 3, 10], "leq": [], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "libomp": [], "librari": [5, 7], "lightgbm": [], "lightgbmclassifi": [], "like": [0, 2, 3, 8, 11, 12], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 
7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 5, 9, 11, 12], "liter": 3, "ll": 11, "load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, "locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 11, 12], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "max_depth": [], "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 5, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 11, 13], "model_paramet": [3, 8, 9, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 
11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, "my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], "namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], "namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 11, 13], "neg": [3, 5, 7], "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": 8, "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 13], "ons": 5, "oper": [0, 1, 2, 3], "opt": [], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 13], "otherwis": [0, 1, 10, 13], "our": 11, "out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [3, 13], "output_table_nam": 11, 
"output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], "override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": 6, "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": [9, 13], "param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "passthrougharg": [], "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 12], "persist": 11, "person": [0, 1, 7], "pip": 6, "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 3, 5, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 
9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], "raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8], "recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, "regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": 13, "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": 13, "repeat_fp": 13, "repeatedli": 3, "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": [3, 8], "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 9, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": 1, "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 12], "secondari": 1, "secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 13], "see": 
[1, 3, 6, 11, 13], "seen": 1, "select": [0, 1, 5, 7, 11, 13], "send": [], "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, "set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 13], "some": [0, 1, 2, 3, 4, 7, 8, 11], "someth": 11, "sometim": 3, "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "sparkxgbclassifi": [], "special": 1, "specif": [1, 3, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, "state_dist": 1, "state_distance_lookup": 1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": 0, "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 
12], "strip": [0, 8], "structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "synaps": [], "syntax": 2, "system": 6, "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 10, 11], "thu": 1, "time": [0, 3, 8, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 10, 12, 13], "try": 3, "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "underli": [], "understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": 11, "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, 
"us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, "varieti": 8, "variou": [], "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8, 13], "version": [0, 6, 13], "vi": 3, "via": [6, 7], "view": [], "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 10, 11, 12, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": 1, "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "without": [], "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, "x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "xgboost": [], "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 11, 12, 13], "your": [2, 3, 4, 6, 8, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "any_equ": 1, "api": 5, 
"arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], "data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": 13, "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": 13, "fp": 13, "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "lightgbm": [], "link": [8, 11, 13], "list": 13, "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": 13, "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": 13, "potenti": [3, 13], "power": [1, 4], "preprocess": 8, "present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, "remove_prefix": 0, 
"remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": [8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "xgboost": [], "year": 13}})
\ No newline at end of file
diff --git a/docs/substitutions.html b/docs/substitutions.html
index 2d12001..2103aaf 100644
--- a/docs/substitutions.html
+++ b/docs/substitutions.html
@@ -7,7 +7,8 @@
 
     <title>Substitutions &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -47,7 +48,11 @@ <h1>Substitutions<a class="headerlink" href="#substitutions" title="Link to this
 <p>You must supply a substitution file and either specify <code class="docutils literal notranslate"><span class="pre">regex_word_replace=true</span></code> or supply a join value.</p>
 <section id="substitution-by-data-table">
 <h2>1:1 substitution by data table<a class="headerlink" href="#substitution-by-data-table" title="Link to this heading">¶</a></h2>
-<p>Performs a 1:1 replacement on a filtered subset of the data table.  If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file.  Used to replace variant name forms with standardized name forms, filtering on sex.</p>
+<p>Performs a 1:1 replacement on a filtered subset of the data table.  If the
+input column data equals a value in the second column of the substitution file,
+it is replaced with the data in the first column of the substitution file.
+Used to replace variant name forms with standardized name forms, filtering on
+a column like sex which may affect common names.</p>
 <ul class="simple">
 <li><p>Attributes:</p>
 <ul>
@@ -113,7 +118,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul>
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -146,16 +160,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -173,7 +177,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/substitutions.md.txt"
diff --git a/docs/use_examples.html b/docs/use_examples.html
index 84fbe95..94e3c6a 100644
--- a/docs/use_examples.html
+++ b/docs/use_examples.html
@@ -7,7 +7,8 @@
 
     <title>Advanced Workflow Examples &#8212; hlink 3.7.0 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
-    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
+    <link rel="stylesheet" type="text/css" href="_static/basic.css?v=686e5160" />
+    <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=27fed22d" />
     <script src="_static/documentation_options.js?v=229cbe3b"></script>
     <script src="_static/doctools.js?v=9bcbadda"></script>
     <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
@@ -177,7 +178,16 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 
 
 
-<h3>Navigation</h3>
+
+<search id="searchbox" style="display: none" role="search">
+    <div class="searchformwrapper">
+    <form class="search" action="search.html" method="get">
+      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false" placeholder="Search"/>
+      <input type="submit" value="Go" />
+    </form>
+    </div>
+</search>
+<script>document.getElementById('searchbox').style.display = "block"</script><h3>Navigation</h3>
 <ul class="current">
 <li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
 <li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
@@ -210,16 +220,6 @@ <h3>Related Topics</h3>
   </ul></li>
 </ul>
 </div>
-<search id="searchbox" style="display: none" role="search">
-  <h3 id="searchlabel">Quick search</h3>
-    <div class="searchformwrapper">
-    <form class="search" action="search.html" method="get">
-      <input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
-      <input type="submit" value="Go" />
-    </form>
-    </div>
-</search>
-<script>document.getElementById('searchbox').style.display = "block"</script>
 
 
 
@@ -237,7 +237,7 @@ <h3 id="searchlabel">Quick search</h3>
       
       |
       Powered by <a href="https://www.sphinx-doc.org/">Sphinx 8.1.3</a>
-      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 0.7.16</a>
+      &amp; <a href="https://alabaster.readthedocs.io">Alabaster 1.0.0</a>
       
       |
       <a href="_sources/use_examples.md.txt"
diff --git a/hlink/linking/core/substitutions.py b/hlink/linking/core/substitutions.py
index f2e6478..e5edc5e 100644
--- a/hlink/linking/core/substitutions.py
+++ b/hlink/linking/core/substitutions.py
@@ -4,10 +4,18 @@
 #   https://github.com/ipums/hlink
 
 from collections import namedtuple
+from typing import Any
+
+from pyspark import SparkContext
+from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import concat_ws, lit, regexp_replace, split, when
 
 
-def generate_substitutions(spark, df_selected, substitution_columns):
+def generate_substitutions(
+    spark: SparkSession,
+    df_selected: DataFrame,
+    substitution_columns: list[dict[str, Any]],
+) -> DataFrame:
     for substitution_column in substitution_columns:
         column_name = substitution_column["column_name"]
         for substitution in substitution_column["substitutions"]:
@@ -29,7 +37,7 @@ def generate_substitutions(spark, df_selected, substitution_columns):
     return df_selected
 
 
-def _load_substitutions(file_name):
+def _load_substitutions(file_name: str) -> tuple[list[str], list[str]]:
     """Reads in the substitution file and returns a 2-tuple representing it.
 
     Parameters
@@ -51,7 +59,9 @@ def _load_substitutions(file_name):
     return (sub_froms, sub_tos)
 
 
-def _apply_substitution(df, column_name, substitution, sc):
+def _apply_substitution(
+    df: DataFrame, column_name: str, substitution: dict[str, Any], sc: SparkContext
+) -> DataFrame:
     """Returns a new df with the values in the column column_name replaced using substitutions defined in substitution_file."""
     substitution_file = substitution["substitution_file"]
     join_value = substitution["join_value"]
@@ -81,7 +91,9 @@ def _apply_substitution(df, column_name, substitution, sc):
     return df_sub.select(df_sub_selects)
 
 
-def _apply_regex_substitution(df, column_name, substitution, sc):
+def _apply_regex_substitution(
+    df: DataFrame, column_name: str, substitution: dict[str, Any], sc: SparkContext
+) -> DataFrame:
     """Returns a new df with the values in the column column_name replaced using substitutions defined in substitution_file."""
 
     substitution_file = substitution["substitution_file"]
diff --git a/hlink/tests/config_loader_test.py b/hlink/tests/config_loader_test.py
index 140d34f..4fd4827 100644
--- a/hlink/tests/config_loader_test.py
+++ b/hlink/tests/config_loader_test.py
@@ -5,24 +5,20 @@
 
 from hlink.configs.load_config import load_conf_file
 import os.path
-import pytest
 
 
-@pytest.mark.quickcheck
 def test_load_conf_file_json(conf_dir_path):
     conf_file = os.path.join(conf_dir_path, "test")
     conf = load_conf_file(conf_file)
     assert conf["id_column"] == "id"
 
 
-@pytest.mark.quickcheck
 def test_load_conf_file_toml(conf_dir_path):
     conf_file = os.path.join(conf_dir_path, "test1")
     conf = load_conf_file(conf_file)
     assert conf["id_column"] == "id-toml"
 
 
-@pytest.mark.quickcheck
 def test_load_conf_file_json2(conf_dir_path):
     conf_file = os.path.join(conf_dir_path, "test_conf_flag_run")
     conf = load_conf_file(conf_file)
diff --git a/hlink/tests/core/pipeline_test.py b/hlink/tests/core/pipeline_test.py
index 5c4846a..fac43a6 100644
--- a/hlink/tests/core/pipeline_test.py
+++ b/hlink/tests/core/pipeline_test.py
@@ -1,8 +1,6 @@
-import pytest
 import hlink.linking.core.pipeline as pipeline_core
 
 
-@pytest.mark.quickcheck
 def test_categorical_comparison_features():
     """Catches a bug where comparison features marked as categorical = false
     were still included as categorical. See Issue #81.
diff --git a/hlink/tests/core/substitutions_test.py b/hlink/tests/core/substitutions_test.py
new file mode 100644
index 0000000..043d70c
--- /dev/null
+++ b/hlink/tests/core/substitutions_test.py
@@ -0,0 +1,63 @@
+# This file is part of the ISRDI's hlink.
+# For copyright and licensing information, see the NOTICE and LICENSE files
+# in this project's top-level directory, and also on-line at:
+#   https://github.com/ipums/hlink
+
+from pathlib import Path
+
+from pyspark.sql import Row, SparkSession
+
+from hlink.linking.core.substitutions import generate_substitutions, _load_substitutions
+
+
+def test_load_substitutions(tmp_path: Path) -> None:
+    file_contents = """a,b
+    to this,from this"""
+
+    tmp_file = tmp_path / "substitutions.csv"
+    tmp_file.write_text(file_contents)  # row 2 keeps its leading indent
+    sub_froms, sub_tos = _load_substitutions(str(tmp_file))
+
+    assert sub_froms == ["b", "from this"]  # 2nd column; first row is data too
+    assert sub_tos == ["a", "to this"]  # 1st column; leading spaces gone
+
+
+def test_generate_substitutions(spark: SparkSession, tmp_path: Path) -> None:
+    tmp_file = tmp_path / "substitutions.csv"
+    tmp_file.write_text(  # each row is "replacement,original"
+        """rose,rosie
+        sophia,sophy
+        sophia,sofia
+        amanda,mandy
+        jane,jean"""
+    )
+
+    df = spark.createDataFrame(
+        [("agnes", 2), ("mandy", 2), ("sophy", 2), ("rosie", 2), ("jean", 1)],
+        schema=["first_name", "sex"],
+    )
+
+    substitution_columns = [
+        {
+            "column_name": "first_name",
+            "substitutions": [
+                {
+                    "join_column": "sex",  # column checked against join_value
+                    "join_value": 2,
+                    "substitution_file": str(tmp_file),
+                }
+            ],
+        }
+    ]
+
+    subbed_df = generate_substitutions(spark, df, substitution_columns)
+    rows = subbed_df.select("first_name", "sex").collect()  # assumes order is kept
+
+    assert rows == [
+        Row(first_name="agnes", sex=2),  # not in the file, left as-is
+        Row(first_name="amanda", sex=2),  # was "mandy"
+        Row(first_name="sophia", sex=2),  # was "sophy"
+        Row(first_name="rose", sex=2),  # was "rosie"
+        # Note that this name is not substituted because we join on sex=2
+        Row(first_name="jean", sex=1),
+    ]
diff --git a/hlink/tests/main_loop_test.py b/hlink/tests/main_loop_test.py
index 8d16325..d232249 100755
--- a/hlink/tests/main_loop_test.py
+++ b/hlink/tests/main_loop_test.py
@@ -5,12 +5,10 @@
 
 import os
 import pandas as pd
-import pytest
 from pyspark.ml.feature import VectorAssembler, OneHotEncoder
 from hlink.linking.link_run import link_task_choices
 
 
-@pytest.mark.quickcheck
 def test_do_get_steps(capsys, main, spark):
     for task in link_task_choices:
         task_inst = getattr(main.link_run, task)
@@ -22,7 +20,6 @@ def test_do_get_steps(capsys, main, spark):
             assert str(step) in output
 
 
-@pytest.mark.quickcheck
 def test_do_set_link_task(capsys, main):
     main.current_link_task = main.link_run.matching
     main.do_set_link_task("preprocessing")
diff --git a/hlink/tests/main_test.py b/hlink/tests/main_test.py
index 50e4684..2938458 100644
--- a/hlink/tests/main_test.py
+++ b/hlink/tests/main_test.py
@@ -59,7 +59,6 @@ def test_load_conf_does_not_exist_no_env(monkeypatch, tmp_path, conf_file, user)
         load_conf(filename, user)
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("conf_file", ("my_conf.json",))
 @pytest.mark.parametrize("user", users)
 def test_load_conf_json_exists_no_env(monkeypatch, tmp_path, conf_file, user):
@@ -90,7 +89,6 @@ def test_load_conf_json_exists_ext_added_no_env(monkeypatch, tmp_path, conf_name
     assert conf["conf_path"] == filename
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("conf_file", ("my_conf.toml",))
 @pytest.mark.parametrize("user", users)
 def test_load_conf_toml_exists_no_env(monkeypatch, tmp_path, conf_file, user):
@@ -189,7 +187,6 @@ def test_load_conf_does_not_exist_env(
         load_conf(conf_file, user)
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("conf_file", ("my_conf.json",))
 @pytest.mark.parametrize("user", users)
 def test_load_conf_json_exists_in_conf_dir_env(
@@ -209,7 +206,6 @@ def test_load_conf_json_exists_in_conf_dir_env(
     assert conf["conf_path"] == str(file)
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("conf_file", ("my_conf.toml",))
 @pytest.mark.parametrize("user", users)
 def test_load_conf_toml_exists_in_conf_dir_env(
diff --git a/hlink/tests/matching_blocking_explode_test.py b/hlink/tests/matching_blocking_explode_test.py
index a6333ee..3e6663a 100755
--- a/hlink/tests/matching_blocking_explode_test.py
+++ b/hlink/tests/matching_blocking_explode_test.py
@@ -4,13 +4,11 @@
 #   https://github.com/ipums/hlink
 
 from pyspark.sql import Row
-import pytest
 import pandas as pd
 from hlink.linking.matching.link_step_match import extract_or_groups_from_blocking
 from hlink.linking.matching.link_step_score import LinkStepScore
 
 
-@pytest.mark.quickcheck
 def test_steps_1_2_matching(
     spark, blocking_explode_conf, matching_test_input, matching, main
 ):
diff --git a/hlink/tests/preprocessing_test.py b/hlink/tests/preprocessing_test.py
index 86bf939..d38b886 100755
--- a/hlink/tests/preprocessing_test.py
+++ b/hlink/tests/preprocessing_test.py
@@ -10,7 +10,6 @@
 from hlink.errors import DataError
 
 
-@pytest.mark.quickcheck
 def test_step_0(preprocessing, spark, preprocessing_conf):
     """Test preprocessing step 0 to ensure that temporary raw_df_unpartitioned_(a/b) tables are created (exact copies of datasources from config). Also test that the presistent raw_df_(a/b) tables are created. Should be same as raw datasources with filters applied"""
 
diff --git a/hlink/tests/table_test.py b/hlink/tests/table_test.py
index 81f013d..89e2924 100644
--- a/hlink/tests/table_test.py
+++ b/hlink/tests/table_test.py
@@ -8,14 +8,12 @@ def simple_schema():
     return StructType([StructField("test", StringType())])
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("table_name", ["this_table_does_not_exist", "@@@", "LOL rofl"])
 def test_exists_table_does_not_exist(spark, table_name):
     t = Table(spark, table_name, "table used for testing")
     assert not t.exists()
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("table_name", ["table_for_testing_Table_class"])
 def test_exists_table_does_exist(spark, table_name, simple_schema):
     t = Table(spark, table_name, "table used for testing")
@@ -25,7 +23,6 @@ def test_exists_table_does_exist(spark, table_name, simple_schema):
     spark.sql(f"DROP TABLE {table_name}")
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("table_name", ["table_for_testing_Table_class"])
 def test_drop_table_does_exist(spark, table_name, simple_schema):
     t = Table(spark, table_name, "table used for testing")
@@ -45,7 +42,6 @@ def test_drop_table_does_not_exist(spark, table_name):
     assert not t.exists()
 
 
-@pytest.mark.quickcheck
 @pytest.mark.parametrize("table_name", ["table_for_testing_Table_class"])
 def test_df_table_does_exist(spark, table_name, simple_schema):
     t = Table(spark, table_name, "table used for testing")
diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py
index 0fbdb0a..5c07b67 100644
--- a/hlink/tests/training_test.py
+++ b/hlink/tests/training_test.py
@@ -8,7 +8,6 @@
 import hlink.linking.core.pipeline as pipeline_core
 
 
-@pytest.mark.quickcheck
 def test_all_steps(
     spark,
     training_conf,
diff --git a/pytest.ini b/pytest.ini
deleted file mode 100644
index 366d99f..0000000
--- a/pytest.ini
+++ /dev/null
@@ -1,3 +0,0 @@
-[pytest]
-markers =
-    quickcheck: add a test to a list of tests that run quickly and test important features
diff --git a/sphinx-docs/substitutions.md b/sphinx-docs/substitutions.md
index 93c9947..eb76c45 100644
--- a/sphinx-docs/substitutions.md
+++ b/sphinx-docs/substitutions.md
@@ -9,7 +9,11 @@ You must supply a substitution file and either specify `regex_word_replace=true`
 
 ## 1:1 substitution by data table
 
-Performs a 1:1 replacement on a filtered subset of the data table.  If the input column data equals a value in the first column of the substitution file, it is replaced with the data in the second column of the substitution file.  Used to replace variant name forms with standardized name forms, filtering on sex.
+Performs a 1:1 replacement on a filtered subset of the data table.  If the
+input column data equals a value in the second column of the substitution file,
+it is replaced with the data in the first column of the substitution file.
+Used to replace variant name forms with standardized name forms, filtering on
+a column like sex which may affect common names.
 
 * Attributes:
   * `join_column` -- Type: `string`.  Column to filter input data on.