From b7f821cbe4284309b75880bcf4040801f42c580b Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 10 Dec 2024 14:00:59 -0600
Subject: [PATCH 1/6] [#176] Remove output_suspicious_TD and "suspicious
 training data" support

---
 docs/_sources/config.md.txt                   |   5 -
 docs/_sources/use_examples.md.txt             |  20 +-
 docs/config.html                              |   5 -
 docs/index.html                               |   2 +-
 docs/searchindex.js                           |   2 +-
 docs/use_examples.html                        |  19 +-
 .../link_step_train_test_models.py            | 188 +-----------------
 sphinx-docs/config.md                         |   5 -
 sphinx-docs/use_examples.md                   |  20 +-
 9 files changed, 19 insertions(+), 247 deletions(-)

diff --git a/docs/_sources/config.md.txt b/docs/_sources/config.md.txt
index 0ed63a3..b5ec9f7 100644
--- a/docs/_sources/config.md.txt
+++ b/docs/_sources/config.md.txt
@@ -334,7 +334,6 @@ split_by_id_a = true
 decision = "drop_duplicate_with_threshold_ratio"
 
 n_training_iterations = 2
-output_suspicious_TD = true
 param_grid = true
 model_parameters = [ 
     { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
@@ -361,7 +360,6 @@ split_by_id_a = true
 decision = "drop_duplicate_with_threshold_ratio"
 
 n_training_iterations = 10
-output_suspicious_TD = true
 param_grid = false
 model_parameters = [
     { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
@@ -750,7 +748,6 @@ splits = [-1,0,6,11,9999]
   * `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of training iterations to use during the `model_exploration` task.
   * `scale_data` -- Type: `boolean`.  Optional. Whether to scale the data as part of the machine learning pipeline.
   * `use_training_data_features` -- Type: `boolean`. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to `true`, or training features will not be able to be generated, giving null column errors.  For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to `true` or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data.  If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to `false`, so you can be sure the training features are created from scratch to match your exact current configuration settings, although if you know the features haven't changed, you could set it to `true` to save a small amount of processing time.
-  * `output_suspicious_TD` -- Type: `boolean`.  Optional.  Used in the `model_exploration` link task.  Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data.  Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.
   * `split_by_id_a` -- Type: `boolean`.  Optional.  Used in the `model_exploration` link task.  When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a "A304BT" has three potential matches in the training data, one each to histid_b "B200", "C201", and "D425", all of those potential matches would either end up in the "train" split or the "test" split when evaluating the model performance.
   * `feature_importances` -- Type: `boolean`. Optional.  Whether to record
     feature importances or coefficients for the training features when training
@@ -764,7 +761,6 @@ scale_data = false
 dataset = "/path/to/1900_1910_training_data_20191023.csv"
 dependent_var = "match"
 use_training_data_features = false
-output_suspicious_TD = true
 split_by_id_a = true
 
 score_with_model = true
@@ -804,7 +800,6 @@ scale_data = false
 dataset = "/path/to/hh_training_data_1900_1910.csv"
 dependent_var = "match"
 use_training_data_features = false
-output_suspicious_TD = true
 split_by_id_a = true
 score_with_model = true
 feature_importances = true
diff --git a/docs/_sources/use_examples.md.txt b/docs/_sources/use_examples.md.txt
index e781202..4d41811 100644
--- a/docs/_sources/use_examples.md.txt
+++ b/docs/_sources/use_examples.md.txt
@@ -1,6 +1,5 @@
 # Advanced Workflow Examples 
 
-
 ## Export training data after generating features to reuse in different linking years
 
 It is common to have a single training data set that spans two linked years, which is then used to train a model that is applied to a different set of linked years.  For example, we have a training data set that spans linked individuals from the 1900 census to the 1910 census.  We use this training data to predict links in the full count 1900-1910 linking run, but we also use this training data to link year pairs 1910-1920, 1920-1930, and 1930-1940.  
@@ -66,12 +65,9 @@ However, when this training data set is used for other years, the program does n
 
 8) Launch the hlink program using your new config for the new year pair you want to link. Run your link tasks and export relevant data.
 
-## ML model exploration and export of lists of potential false positives/negatives in training data
-`hlink` accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models.  You can see example `training` and `hh_training` configuration sections that implement this in the [training](config.html#training-and-models) and [household training](config.html#household-training-and-models) sections of the configuration documentation.
-
-The model exploration link task also allows you to export lists of potential false positives (FPs) and false negatives (FNs) in your training data.  This is calculated when running the train/test splits in the regular model exploration tasks if the `output_suspicious_TD` flag is true.
+## An Example Model Exploration Workflow
 
-### Example model exploration and FP/FN export workflow
+`hlink` accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models.  You can see example `training` and `hh_training` configuration sections that implement this in the [training](config.html#training-and-models) and [household training](config.html#household-training-and-models) sections of the configuration documentation.
 
 1) Create a config file that has a `training` and/or `hh_training` section with model parameters to explore. For example:
 
@@ -88,9 +84,6 @@ The model exploration link task also allows you to export lists of potential fal
     # source data years weren't identical to the linked years of your training data.
     use_training_data_features = false
 
-    # VERY IMPORTANT if you want to output FPs/FNs
-    output_suspicious_TD = true
-
     split_by_id_a = true
     score_with_model = true
     feature_importances = false
@@ -127,11 +120,4 @@ The model exploration link task also allows you to export lists of potential fal
     hlink $ csv training_results /my/output/1900_1910_training_results.csv
     ```
 
-5) Export the potential FPs and FNs to csv.  For `training` params, the results will be in the `repeat_FPs` and `repeat_FNs` tables, and for `hh_training` in the `hh_repeat_FPs` and `hh_repeat_FNs` tables.
-
-    ```
-    hlink $ csv repeat_FPs /my/output/1900_1910_potential_FPs.csv
-    hlink $ csv repeat_FNs /my/output/1900_1910_potential_FNs.csv
-    ```
-
-6) Use your preferred methods to analyze the data you've just exported.  Update the `chosen_model` in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs.
+5) Use your preferred methods to analyze the data you've just exported.  Update the `chosen_model` in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs.
diff --git a/docs/config.html b/docs/config.html
index 48684bf..3bc9b5e 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -367,7 +367,6 @@ <h2>Advanced Config File<a class="headerlink" href="#advanced-config-file" title
 <span class="n">decision</span> <span class="o">=</span> <span class="s2">&quot;drop_duplicate_with_threshold_ratio&quot;</span>
 
 <span class="n">n_training_iterations</span> <span class="o">=</span> <span class="mi">2</span>
-<span class="n">output_suspicious_TD</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">param_grid</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">model_parameters</span> <span class="o">=</span> <span class="p">[</span> 
     <span class="p">{</span> <span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;random_forest&quot;</span><span class="p">,</span> <span class="n">maxDepth</span> <span class="o">=</span> <span class="p">[</span><span class="mi">7</span><span class="p">],</span> <span class="n">numTrees</span> <span class="o">=</span> <span class="p">[</span><span class="mi">100</span><span class="p">],</span> <span class="n">threshold</span> <span class="o">=</span> <span class="p">[</span><span class="mf">0.05</span><span class="p">,</span> <span class="mf">0.005</span><span class="p">],</span> <span class="n">threshold_ratio</span> <span class="o">=</span> <span class="p">[</span><span class="mf">1.2</span><span class="p">,</span> <span class="mf">1.3</span><span class="p">]</span> <span class="p">},</span>
@@ -394,7 +393,6 @@ <h2>Advanced Config File<a class="headerlink" href="#advanced-config-file" title
 <span class="n">decision</span> <span class="o">=</span> <span class="s2">&quot;drop_duplicate_with_threshold_ratio&quot;</span>
 
 <span class="n">n_training_iterations</span> <span class="o">=</span> <span class="mi">10</span>
-<span class="n">output_suspicious_TD</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">param_grid</span> <span class="o">=</span> <span class="n">false</span>
 <span class="n">model_parameters</span> <span class="o">=</span> <span class="p">[</span>
     <span class="p">{</span> <span class="nb">type</span> <span class="o">=</span> <span class="s2">&quot;random_forest&quot;</span><span class="p">,</span> <span class="n">maxDepth</span> <span class="o">=</span> <span class="mi">6</span><span class="p">,</span> <span class="n">numTrees</span> <span class="o">=</span> <span class="mi">50</span><span class="p">,</span> <span class="n">threshold</span> <span class="o">=</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">threshold_ratio</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="p">},</span>
@@ -820,7 +818,6 @@ <h2>Training and <a class="reference internal" href="models.html"><span class="d
 <li><p><code class="docutils literal notranslate"><span class="pre">n_training_iterations</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">integer</span></code>. Optional; default value is 10. The number of training iterations to use during the <code class="docutils literal notranslate"><span class="pre">model_exploration</span></code> task.</p></li>
 <li><p><code class="docutils literal notranslate"><span class="pre">scale_data</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">boolean</span></code>.  Optional. Whether to scale the data as part of the machine learning pipeline.</p></li>
 <li><p><code class="docutils literal notranslate"><span class="pre">use_training_data_features</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">boolean</span></code>. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to <code class="docutils literal notranslate"><span class="pre">true</span></code>, or training features will not be able to be generated, giving null column errors.  For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to <code class="docutils literal notranslate"><span class="pre">true</span></code> or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data.  If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to <code class="docutils literal notranslate"><span class="pre">false</span></code>, so you can be sure the training features are created from scratch to match your exact current configuration settings, although if you know the features haven’t changed, you could set it to <code class="docutils literal notranslate"><span class="pre">true</span></code> to save a small amount of processing time.</p></li>
-<li><p><code class="docutils literal notranslate"><span class="pre">output_suspicious_TD</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">boolean</span></code>.  Optional.  Used in the <code class="docutils literal notranslate"><span class="pre">model_exploration</span></code> link task.  Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data.  Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.</p></li>
 <li><p><code class="docutils literal notranslate"><span class="pre">split_by_id_a</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">boolean</span></code>.  Optional.  Used in the <code class="docutils literal notranslate"><span class="pre">model_exploration</span></code> link task.  When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a “A304BT” has three potential matches in the training data, one each to histid_b “B200”, “C201”, and “D425”, all of those potential matches would either end up in the “train” split or the “test” split when evaluating the model performance.</p></li>
 <li><p><code class="docutils literal notranslate"><span class="pre">feature_importances</span></code> – Type: <code class="docutils literal notranslate"><span class="pre">boolean</span></code>. Optional.  Whether to record
 feature importances or coefficients for the training features when training
@@ -834,7 +831,6 @@ <h2>Training and <a class="reference internal" href="models.html"><span class="d
 <span class="n">dataset</span> <span class="o">=</span> <span class="s2">&quot;/path/to/1900_1910_training_data_20191023.csv&quot;</span>
 <span class="n">dependent_var</span> <span class="o">=</span> <span class="s2">&quot;match&quot;</span>
 <span class="n">use_training_data_features</span> <span class="o">=</span> <span class="n">false</span>
-<span class="n">output_suspicious_TD</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">split_by_id_a</span> <span class="o">=</span> <span class="n">true</span>
 
 <span class="n">score_with_model</span> <span class="o">=</span> <span class="n">true</span>
@@ -878,7 +874,6 @@ <h2>Household training and models<a class="headerlink" href="#household-training
 <span class="n">dataset</span> <span class="o">=</span> <span class="s2">&quot;/path/to/hh_training_data_1900_1910.csv&quot;</span>
 <span class="n">dependent_var</span> <span class="o">=</span> <span class="s2">&quot;match&quot;</span>
 <span class="n">use_training_data_features</span> <span class="o">=</span> <span class="n">false</span>
-<span class="n">output_suspicious_TD</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">split_by_id_a</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">score_with_model</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">feature_importances</span> <span class="o">=</span> <span class="n">true</span>
diff --git a/docs/index.html b/docs/index.html
index 1b38716..8072c8d 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -62,7 +62,7 @@ <h1>Welcome to hlink’s documentation!<a class="headerlink" href="#welcome-to-h
 </li>
 <li class="toctree-l1"><a class="reference internal" href="use_examples.html">Advanced Workflows</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="use_examples.html#export-training-data-after-generating-features-to-reuse-in-different-linking-years">Export training data after generating features to reuse in different linking years</a></li>
-<li class="toctree-l2"><a class="reference internal" href="use_examples.html#ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data">ML model exploration and export of lists of potential false positives/negatives in training data</a></li>
+<li class="toctree-l2"><a class="reference internal" href="use_examples.html#an-example-model-exploration-workflow">An Example Model Exploration Workflow</a></li>
 </ul>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="config.html">Configuration</a><ul>
diff --git a/docs/searchindex.js b/docs/searchindex.js
index 8e79012..7c7bb5e 100644
--- a/docs/searchindex.js
+++ b/docs/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"1:1 substitution by data table": [[12, "substitution-by-data-table"]], "Advanced Config File": [[3, "advanced-config-file"]], "Advanced Usage": [[0, "advanced-usage"]], "Advanced Workflow Examples": [[13, null]], "Aggregate Features": [[1, "aggregate-features"]], "Basic Config File": [[3, "basic-config-file"]], "Basic Usage": [[0, "basic-usage"]], "Blocking": [[3, "blocking"]], "Column Mappings": [[0, null], [3, "column-mappings"]], "Comparison Features": [[1, null], [3, "comparison-features"]], "Comparison Types": [[1, "comparison-types"], [2, "comparison-types"]], "Comparisons": [[2, null], [3, "comparisons"]], "Configuration": [[3, null]], "Configuration API": [[5, "configuration-api"], [5, null]], "Data sources": [[3, "data-sources"]], "Defining Multiple Comparisons": [[2, "defining-multiple-comparisons"]], "Example interactive mode workflow": [[11, "example-interactive-mode-workflow"]], "Example model exploration and FP/FN export workflow": [[13, "example-model-exploration-and-fp-fn-export-workflow"]], "Example training data export with generated ML features": [[13, "example-training-data-export-with-generated-ml-features"]], "Export training data after generating features to reuse in different linking years": [[13, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Feature Selection Transforms": [[4, null]], "Feature Selections": [[3, "feature-selections"]], "Feature add-ons": [[1, "feature-add-ons"]], "Filter": [[3, "filter"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "Household Comparisons": [[3, "household-comparisons"]], "Household Matching": [[8, "household-matching"]], "Household training and models": [[3, "household-training-and-models"]], "Installation": [[6, null]], "Installing from PyPI": [[6, "installing-from-pypi"]], "Installing from source": [[6, "installing-from-source"]], "Interactive Mode": [[11, "interactive-mode"]], "Introduction": [[7, 
null]], "Link Tasks": [[8, null]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[13, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Matching": [[8, "matching"]], "Model Exploration and Household Model Exploration": [[8, "model-exploration-and-household-model-exploration"]], "Models": [[9, null]], "Multiple Comparisons": [[3, "multiple-comparisons"]], "Overview": [[2, "overview"], [7, "overview"], [8, "overview"], [8, "id1"], [8, "id4"], [8, "id7"], [8, "id10"], [8, "id13"]], "Pipeline generated features": [[10, null]], "Pipeline-generated Features": [[3, "pipeline-generated-features"]], "Potential Matches Universe": [[3, "potential-matches-universe"]], "Preprocessing": [[8, "preprocessing"]], "Related Configuration Sections": [[8, "related-configuration-sections"], [8, "id3"], [8, "id6"], [8, "id9"], [8, "id12"], [8, "id15"]], "Reporting": [[8, "reporting"]], "Requirements": [[6, "requirements"]], "Running Linking Tasks and Steps": [[11, "running-linking-tasks-and-steps"]], "Running hlink": [[11, null]], "Single Comparison": [[3, "single-comparison"]], "Starting the program": [[11, "starting-the-program"]], "Substitution Columns": [[3, "substitution-columns"]], "Substitution by regex word replace": [[12, "substitution-by-regex-word-replace"]], "Substitutions": [[12, null]], "Task steps": [[8, "task-steps"], [8, "id2"], [8, "id5"], [8, "id8"], [8, "id11"], [8, "id14"]], "Top level configs": [[3, "top-level-configs"]], "Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, 
"all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], "bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "lightgbm": [[9, "lightgbm"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": 
[[1, "neither-are-null"]], "not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, "soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]], "xgboost": [[9, "xgboost"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, 
"sphinx.domains.std": 2}, "filenames": ["column_mappings.md", "comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 9, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3, 9], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": 13, "1900_1910_potential_fp": 13, "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 9, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 9, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 9, 11, 13], "It": [0, 1, 2, 3, 7, 9, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8, 9], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 9, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 
11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], "accord": 1, "across": 1, "ad": [0, 1, 2, 3, 9], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 9, 11], "addl": 1, "addl_var": 1, "adjust": 11, "adopt": 0, "advanc": 5, "affect": [4, 12], "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8, 13], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13], "altern": [0, 3, 9], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8, 9], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 9, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": 3, "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 9, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 
9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, "bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": [1, 9], "c201": 3, "calcul": [1, 13], "call": [0, 9], "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": [9, 11], "classif": [8, 9], "classifi": 9, "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 12, 13], "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 9, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, 
"convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": [7, 8], "could": [0, 2, 3], "count": [1, 11, 13], "counterpart": 9, "counti": [0, 1], "county_1900_1910_distances_km": 1, "county_a": 1, "county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": 3, "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 9, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 9, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 11], "detail": [0, 3, 11], "detect": 9, "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": [2, 9], "directori": [6, 11, 13], "disabl": 9, "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 9, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 9, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 9, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 
11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, "either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 9, 11], "encod": [3, 4], "encount": 9, "end": [0, 1, 3, 4, 12], "enorm": 9, "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 9, 10], "especi": 3, "eta": 9, "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3, 9], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": [1, 9], "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 5, 7, 11], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": [4, 9], "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], "filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 
3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "force_row_wis": 9, "foreign": 1, "forest": [5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 10, 11, 13], "from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 9, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gamma": 9, "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 9, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [3, 8, 9, 11], "here": [2, 3, 8, 9, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": 13, "hh_repeat_fp": 13, "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": [9, 11], "highest": [1, 3, 9], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 9, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 
3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": 13, "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, "ignor": 7, "ii": [0, 3], "iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": [9, 13], "implicitli": 2, "import": [3, 8, 11, 13], "improv": 8, "includ": [1, 3, 8, 9, 10, 11], "incompar": 1, "increas": [3, 10], "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": [5, 9], "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": [6, 9], "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 9, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], "jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "larger": 9, "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "lead": 0, "learn": [1, 2, 3, 7, 8, 9, 11, 13], "learningr": 9, "least": [0, 1, 9], "leav": 0, "left": 9, "length": [1, 3, 10], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "libomp": 9, 
"librari": [5, 7, 9], "lightgbm": 5, "lightgbmclassifi": 9, "like": [0, 2, 3, 8, 11, 12], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 5, 9, 11, 12], "liter": 3, "ll": 11, "load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, "locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 9, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 9, 11, 12], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 9, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 9, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "max_depth": 9, "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 5, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 9, 11, 13], "model_paramet": [3, 8, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 
9, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 9, 11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, "my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 9, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], "namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], "namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 9, 11, 13], "neg": [3, 5, 7], "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": [8, 9], "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 9, 13], "ons": 5, "oper": [0, 1, 2, 3, 9], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 9, 13], "otherwis": [0, 1, 10, 13], "our": 11, 
"out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [3, 13], "output_table_nam": 11, "output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], "override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": [6, 9], "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": 13, "param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "passthrougharg": 9, "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 9, 12], "persist": 11, "person": [0, 1, 7], "pip": [6, 9], "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 9, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 3, 5, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8, 9], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 9, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 9, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 
13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4, 9], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], "raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8, 9], "recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, "regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": 13, "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": 13, "repeat_fp": 13, "repeatedli": 3, "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": [3, 8], "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": [1, 9], "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 9, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 9, 12], "secondari": 1, 
"secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 9, 13], "see": [1, 3, 6, 9, 11, 13], "seen": 1, "select": [0, 1, 5, 7, 11, 13], "send": 9, "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 9, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, "set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3, 9], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 9, 13], "some": [0, 1, 2, 3, 4, 7, 8, 9, 11], "someth": 11, "sometim": [3, 9], "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "sparkxgbclassifi": 9, "special": [1, 9], "specif": [1, 3, 9, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, "state_dist": 1, "state_distance_lookup": 
1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": [0, 9], "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 12], "strip": [0, 8], "structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "synaps": 9, "syntax": 2, "system": [6, 9], "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 9, 10, 11], "thu": 1, "time": [0, 3, 8, 9, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 9, 10, 12, 13], "try": [3, 9], "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "underli": 9, 
"understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": [9, 11], "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, "us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 9, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, "varieti": 8, "variou": 9, "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8, 13], "version": [0, 6, 9, 13], "vi": 3, "via": [6, 7, 9], "view": 9, "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 9, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 9, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": [1, 9], "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "without": 9, "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, "x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "xgboost": 5, "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 9, 11, 12, 13], "your": [2, 3, 4, 6, 8, 9, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", 
"Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "any_equ": 1, "api": 5, "arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], "data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": 13, "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": 13, "fp": 13, "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "lightgbm": 9, "link": [8, 11, 13], "list": 13, "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": 13, "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": 13, "potenti": [3, 13], "power": [1, 4], "preprocess": 8, 
"present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, "remove_prefix": 0, "remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": [8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "xgboost": 9, "year": 13}})
\ No newline at end of file
+Search.setIndex({"alltitles": {"1:1 substitution by data table": [[12, "substitution-by-data-table"]], "Advanced Config File": [[3, "advanced-config-file"]], "Advanced Usage": [[0, "advanced-usage"]], "Advanced Workflow Examples": [[13, null]], "Aggregate Features": [[1, "aggregate-features"]], "An Example Model Exploration Workflow": [[13, "an-example-model-exploration-workflow"]], "Basic Config File": [[3, "basic-config-file"]], "Basic Usage": [[0, "basic-usage"]], "Blocking": [[3, "blocking"]], "Column Mappings": [[0, null], [3, "column-mappings"]], "Comparison Features": [[1, null], [3, "comparison-features"]], "Comparison Types": [[1, "comparison-types"], [2, "comparison-types"]], "Comparisons": [[2, null], [3, "comparisons"]], "Configuration": [[3, null]], "Configuration API": [[5, "configuration-api"], [5, null]], "Data sources": [[3, "data-sources"]], "Defining Multiple Comparisons": [[2, "defining-multiple-comparisons"]], "Example interactive mode workflow": [[11, "example-interactive-mode-workflow"]], "Example training data export with generated ML features": [[13, "example-training-data-export-with-generated-ml-features"]], "Export training data after generating features to reuse in different linking years": [[13, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Feature Selection Transforms": [[4, null]], "Feature Selections": [[3, "feature-selections"]], "Feature add-ons": [[1, "feature-add-ons"]], "Filter": [[3, "filter"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "Household Comparisons": [[3, "household-comparisons"]], "Household Matching": [[8, "household-matching"]], "Household training and models": [[3, "household-training-and-models"]], "Installation": [[6, null]], "Installing from PyPI": [[6, "installing-from-pypi"]], "Installing from source": [[6, "installing-from-source"]], "Interactive Mode": [[11, "interactive-mode"]], "Introduction": [[7, null]], "Link Tasks": [[8, 
null]], "Matching": [[8, "matching"]], "Model Exploration and Household Model Exploration": [[8, "model-exploration-and-household-model-exploration"]], "Models": [[9, null]], "Multiple Comparisons": [[3, "multiple-comparisons"]], "Overview": [[2, "overview"], [7, "overview"], [8, "overview"], [8, "id1"], [8, "id4"], [8, "id7"], [8, "id10"], [8, "id13"]], "Pipeline generated features": [[10, null]], "Pipeline-generated Features": [[3, "pipeline-generated-features"]], "Potential Matches Universe": [[3, "potential-matches-universe"]], "Preprocessing": [[8, "preprocessing"]], "Related Configuration Sections": [[8, "related-configuration-sections"], [8, "id3"], [8, "id6"], [8, "id9"], [8, "id12"], [8, "id15"]], "Reporting": [[8, "reporting"]], "Requirements": [[6, "requirements"]], "Running Linking Tasks and Steps": [[11, "running-linking-tasks-and-steps"]], "Running hlink": [[11, null]], "Single Comparison": [[3, "single-comparison"]], "Starting the program": [[11, "starting-the-program"]], "Substitution Columns": [[3, "substitution-columns"]], "Substitution by regex word replace": [[12, "substitution-by-regex-word-replace"]], "Substitutions": [[12, null]], "Task steps": [[8, "task-steps"], [8, "id2"], [8, "id5"], [8, "id8"], [8, "id11"], [8, "id14"]], "Top level configs": [[3, "top-level-configs"]], "Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, "all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], 
"bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "lightgbm": [[9, "lightgbm"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": [[1, "neither-are-null"]], "not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, 
"present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, "soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]], "xgboost": [[9, "xgboost"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["column_mappings.md", "comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", 
"pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 9, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3, 9], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": [], "1900_1910_potential_fp": [], "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 9, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 9, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 9, 11, 13], "It": [0, 1, 2, 3, 7, 9, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8, 9], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 9, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], "accord": 1, "across": 1, "ad": [0, 1, 2, 3, 9], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 9, 11], "addl": 1, "addl_var": 1, "adjust": 
11, "adopt": 0, "advanc": 5, "affect": [4, 12], "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13], "altern": [0, 3, 9], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 5, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8, 9], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 9, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": [], "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 9, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, "bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 
1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": [1, 9], "c201": 3, "calcul": 1, "call": [0, 9], "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": [9, 11], "classif": [8, 9], "classifi": 9, "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 12, 13], "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 9, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, "convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": [7, 8], "could": [0, 2, 3], "count": [1, 11, 13], "counterpart": 9, "counti": [0, 1], "county_1900_1910_distances_km": 1, "county_a": 1, 
"county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": [], "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 9, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 9, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 11], "detail": [0, 3, 11], "detect": 9, "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": [2, 9], "directori": [6, 11, 13], "disabl": 9, "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 9, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 9, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 9, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, "either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 9, 11], 
"encod": [3, 4], "encount": 9, "end": [0, 1, 3, 4, 12], "enorm": 9, "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 9, 10], "especi": 3, "eta": 9, "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3, 5, 9], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": [1, 9], "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 7, 11, 13], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": [4, 9], "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], "filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "force_row_wis": 9, "foreign": 1, "forest": [5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 10, 11, 13], 
"from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 9, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gamma": 9, "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 9, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [8, 9, 11], "here": [2, 3, 8, 9, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": [], "hh_repeat_fp": [], "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": [9, 11], "highest": [1, 3, 9], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 9, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": 13, "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, 
"ignor": 7, "ii": [0, 3], "iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": [9, 13], "implicitli": 2, "import": [3, 8, 11], "improv": 8, "includ": [1, 3, 8, 9, 10, 11], "incompar": 1, "increas": 10, "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": [5, 9], "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": [6, 9], "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 9, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], "jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "larger": 9, "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "lead": 0, "learn": [1, 2, 3, 7, 8, 9, 11, 13], "learningr": 9, "least": [0, 1, 9], "leav": 0, "left": 9, "length": [1, 3, 10], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "libomp": 9, "librari": [5, 7, 9], "lightgbm": 5, "lightgbmclassifi": 9, "like": [0, 2, 3, 8, 11, 12], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 9, 11, 12], "liter": 3, "ll": 11, 
"load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, "locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 9, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 9, 11, 12], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 9, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 9, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "max_depth": 9, "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 9, 11, 13], "model_paramet": [3, 8, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 9, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 9, 11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, 
"my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 9, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], "namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], "namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 9, 11, 13], "neg": 7, "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": [8, 9], "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 9, 13], "ons": 5, "oper": [0, 1, 2, 3, 9], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 9, 13], "otherwis": [0, 1, 10, 13], "our": 11, "out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [], "output_table_nam": 11, "output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], 
"override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": [6, 9], "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": 13, "param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "passthrougharg": 9, "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 9, 12], "persist": 11, "person": [0, 1, 7], "pip": [6, 9], "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 9, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8, 9], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 9, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 9, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4, 9], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], 
"raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8, 9], "recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, "regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": [], "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": [], "repeat_fp": [], "repeatedli": [], "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": 8, "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": [1, 9], "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 9, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 9, 12], "secondari": 1, "secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 9, 13], "see": [1, 3, 6, 9, 11, 13], "seen": 1, "select": [0, 1, 5, 
7, 11, 13], "send": 9, "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 9, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, "set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3, 9], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 9, 13], "some": [0, 1, 2, 3, 4, 7, 8, 9, 11], "someth": 11, "sometim": [3, 9], "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "sparkxgbclassifi": 9, "special": [1, 9], "specif": [1, 3, 9, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, "state_dist": 1, "state_distance_lookup": 1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": [0, 9], "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 12], "strip": [0, 8], 
"structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "synaps": 9, "syntax": 2, "system": [6, 9], "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 9, 10, 11], "thu": 1, "time": [0, 3, 8, 9, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 9, 10, 12, 13], "try": [3, 9], "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "underli": 9, "understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": [9, 11], "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, 
"us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 9, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, "varieti": 8, "variou": 9, "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8], "version": [0, 6, 9, 13], "vi": 3, "via": [6, 7, 9], "view": 9, "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 9, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 9, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": [1, 9], "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "without": 9, "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, "x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "xgboost": 5, "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 9, 11, 12, 13], "your": [2, 3, 4, 6, 8, 9, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "an": 13, 
"any_equ": 1, "api": 5, "arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], "data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": [], "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": [], "fp": [], "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "lightgbm": 9, "link": [8, 11, 13], "list": [], "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": [], "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": [], "potenti": 3, "power": [1, 4], "preprocess": 8, "present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, 
"remove_prefix": 0, "remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": [8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "xgboost": 9, "year": 13}})
\ No newline at end of file
diff --git a/docs/use_examples.html b/docs/use_examples.html
index 1e31192..e2419ae 100644
--- a/docs/use_examples.html
+++ b/docs/use_examples.html
@@ -93,12 +93,9 @@ <h3>Example training data export with generated ML features<a class="headerlink"
 </ol>
 </section>
 </section>
-<section id="ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data">
-<h2>ML model exploration and export of lists of potential false positives/negatives in training data<a class="headerlink" href="#ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data" title="Link to this heading">¶</a></h2>
+<section id="an-example-model-exploration-workflow">
+<h2>An Example Model Exploration Workflow<a class="headerlink" href="#an-example-model-exploration-workflow" title="Link to this heading">¶</a></h2>
 <p><code class="docutils literal notranslate"><span class="pre">hlink</span></code> accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models.  You can see example <code class="docutils literal notranslate"><span class="pre">training</span></code> and <code class="docutils literal notranslate"><span class="pre">hh_training</span></code> configuration sections that implement this in the <a class="reference external" href="config.html#training-and-models">training</a> and <a class="reference external" href="config.html#household-training-and-models">household training</a> sections of the configuration documentation.</p>
-<p>The model exploration link task also allows you to export lists of potential false positives (FPs) and false negatives (FNs) in your training data.  This is calculated when running the train/test splits in the regular model exploration tasks if the <code class="docutils literal notranslate"><span class="pre">output_suspicious_TD</span></code> flag is true.</p>
-<section id="example-model-exploration-and-fp-fn-export-workflow">
-<h3>Example model exploration and FP/FN export workflow<a class="headerlink" href="#example-model-exploration-and-fp-fn-export-workflow" title="Link to this heading">¶</a></h3>
 <ol>
 <li><p>Create a config file that has a <code class="docutils literal notranslate"><span class="pre">training</span></code> and/or <code class="docutils literal notranslate"><span class="pre">hh_training</span></code> section with model parameters to explore. For example:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">training</span><span class="p">]</span>
@@ -113,9 +110,6 @@ <h3>Example model exploration and FP/FN export workflow<a class="headerlink" hre
 <span class="c1"># source data years weren&#39;t identical to the linked years of your training data.</span>
 <span class="n">use_training_data_features</span> <span class="o">=</span> <span class="n">false</span>
 
-<span class="c1"># VERY IMPORTANT if you want to output FPs/FNs</span>
-<span class="n">output_suspicious_TD</span> <span class="o">=</span> <span class="n">true</span>
-
 <span class="n">split_by_id_a</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">score_with_model</span> <span class="o">=</span> <span class="n">true</span>
 <span class="n">feature_importances</span> <span class="o">=</span> <span class="n">false</span>
@@ -150,16 +144,9 @@ <h3>Example model exploration and FP/FN export workflow<a class="headerlink" hre
 </pre></div>
 </div>
 </li>
-<li><p>Export the potential FPs and FNs to csv.  For <code class="docutils literal notranslate"><span class="pre">training</span></code> params, the results will be in the <code class="docutils literal notranslate"><span class="pre">repeat_FPs</span></code> and <code class="docutils literal notranslate"><span class="pre">repeat_FNs</span></code> tables, and for <code class="docutils literal notranslate"><span class="pre">hh_training</span></code> in the <code class="docutils literal notranslate"><span class="pre">hh_repeat_FPs</span></code> and <code class="docutils literal notranslate"><span class="pre">hh_repeat_FNs</span></code> tables.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>hlink $ csv repeat_FPs /my/output/1900_1910_potential_FPs.csv
-hlink $ csv repeat_FNs /my/output/1900_1910_potential_FNs.csv
-</pre></div>
-</div>
-</li>
 <li><p>Use your preferred methods to analyze the data you’ve just exported.  Update the <code class="docutils literal notranslate"><span class="pre">chosen_model</span></code> in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs.</p></li>
 </ol>
 </section>
-</section>
 </section>
 
 
@@ -195,7 +182,7 @@ <h1 class="logo"><a href="index.html">hlink</a></h1>
 <li class="toctree-l1"><a class="reference internal" href="running_the_program.html">Running hlink</a></li>
 <li class="toctree-l1 current"><a class="current reference internal" href="#">Advanced Workflows</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="#export-training-data-after-generating-features-to-reuse-in-different-linking-years">Export training data after generating features to reuse in different linking years</a></li>
-<li class="toctree-l2"><a class="reference internal" href="#ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data">ML model exploration and export of lists of potential false positives/negatives in training data</a></li>
+<li class="toctree-l2"><a class="reference internal" href="#an-example-model-exploration-workflow">An Example Model Exploration Workflow</a></li>
 </ul>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="config.html">Configuration</a></li>
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 070c1da..6025998 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -342,7 +342,6 @@ def _choose_best_training_results(self, evals: list[ModelEval]) -> ModelEval:
     def _evaluate_threshold_combinations(
         self,
         best_model: ModelEval,
-        suspicious_data: Any,
         split: dict[str : pyspark.sql.DataFrame],
         dep_var: str,
         id_a: str,
@@ -397,16 +396,6 @@ def _evaluate_threshold_combinations(
             id_b,
             dep_var,
         )
-        """
-        thresholding_predict_train = _get_probability_and_select_pred_columns(
-            cached_training_data,
-            thresholding_model,
-            thresholding_post_transformer,
-            id_a,
-            id_b,
-            dep_var,
-        )
-        """
 
         for threshold_index, (
             this_alpha_threshold,
@@ -428,15 +417,6 @@ def _evaluate_threshold_combinations(
                 id_column,
                 decision,
             )
-            """
-            predict_train = threshold_core.predict_using_thresholds(
-                thresholding_predict_train,
-                this_alpha_threshold,
-                this_threshold_ratio,
-                id_column,
-                decision,
-            )
-            """
 
             end_predict_time = perf_counter()
             info = f"Predictions for test-train data on threshold took {end_predict_time - start_predict_time:.2f}s"
@@ -446,27 +426,15 @@ def _evaluate_threshold_combinations(
                 predictions,
                 dep_var,
                 thresholding_model,
-                suspicious_data,
                 this_alpha_threshold,
                 this_threshold_ratio,
                 best_model.score,
             )
-            """
-            training_results[threshold_index] = self._capture_training_results(
-                predict_train,
-                dep_var,
-                thresholding_model,
-                suspicious_data,
-                this_alpha_threshold,
-                this_threshold_ratio,
-                best_model.score,
-            )
-            """
 
         thresholding_test_data.unpersist()
         thresholding_training_data.unpersist()
 
-        return prediction_results, suspicious_data
+        return prediction_results
 
     def _run(self) -> None:
         training_section_name = str(self.task.training_conf)
@@ -487,10 +455,6 @@ def _run(self) -> None:
             .cache()
         )
 
-        # Stores suspicious data
-        # suspicious_data = self._create_suspicious_data(id_a, id_b)
-        suspicious_data = None
-
         outer_fold_count = training_settings.get("n_training_iterations", 10)
         inner_fold_count = 3
 
@@ -500,7 +464,6 @@ def _run(self) -> None:
         # At the end we combine this information collected from every outer fold
         threshold_test_results: list[ThresholdTestResult] = []
         # threshold_training_results: list[ThresholdTestResult]
-        all_suspicious_data: list[Any] = []
         best_models: list[ModelEval] = []
 
         seed = training_settings.get("seed", 2133)
@@ -545,21 +508,17 @@ def _run(self) -> None:
                 hyperparam_evaluation_results
             )
 
-            prediction_results, suspicious_data_for_threshold = (
-                self._evaluate_threshold_combinations(
-                    best_model,
-                    suspicious_data,
-                    {"test": outer_test_data, "training": outer_training_data},
-                    dep_var,
-                    id_a,
-                    id_b,
-                )
+            prediction_results = self._evaluate_threshold_combinations(
+                best_model,
+                {"test": outer_test_data, "training": outer_training_data},
+                dep_var,
+                id_a,
+                id_b,
             )
 
             # Collect the outputs for each fold
             threshold_test_results.append(prediction_results)
             # threshold_training_results.append(training_results)
-            # all_suspicious_data.append(suspicious_data_for_threshold)
             best_models.append(best_model)
 
         combined_test = _combine_by_threshold_matrix_entry(threshold_test_results)
@@ -588,7 +547,6 @@ def _run(self) -> None:
         )
 
         self._save_training_results(thresholded_metrics_df, self.task.spark)
-        # self._save_suspicious_data(suspicious_data, self.task.spark)
         self.task.spark.sql("set spark.sql.shuffle.partitions=200")
 
     def _split_into_folds(
@@ -685,7 +643,6 @@ def _capture_prediction_results(
         predictions: pyspark.sql.DataFrame,
         dep_var: str,
         model: Model,
-        suspicious_data: dict[str, Any] | None,
         alpha_threshold: float,
         threshold_ratio: float | None,
         pr_auc: float,
@@ -699,7 +656,7 @@ def _capture_prediction_results(
             test_FP_count,
             test_FN_count,
             test_TN_count,
-        ) = _get_confusion_matrix(predictions, dep_var, suspicious_data)
+        ) = _get_confusion_matrix(predictions, dep_var)
         test_precision, test_recall, test_mcc = _get_aggregate_metrics(
             test_TP_count, test_FP_count, test_FN_count, test_TN_count
         )
@@ -732,101 +689,6 @@ def _save_training_results(
             #    f"Training results saved to Spark table '{table_prefix}training_results'."
             # )
 
-    def _prepare_suspicious_table(
-        self, spark: pyspark.sql.SparkSession, df: pd.DataFrame, id_a: str, id_b: str
-    ) -> pyspark.sql.DataFrame:
-        spark_df = spark.createDataFrame(df)
-        counted = (
-            spark_df.groupby(id_a, id_b)
-            .agg(
-                count("*").alias("count"),
-                mean("probability").alias("mean_probability"),
-            )
-            .filter("count > 1")
-            .orderBy(["count", id_a, id_b])
-        )
-        return counted
-
-    def _save_suspicious_data(
-        self, suspicious_data: dict[str, Any] | None, spark: pyspark.sql.SparkSession
-    ) -> None:
-        table_prefix = self.task.table_prefix
-
-        if suspicious_data is None:
-            print("OTD suspicious data is None, not saving.")
-            return
-        id_a = suspicious_data["id_a"]
-        id_b = suspicious_data["id_b"]
-
-        if not suspicious_data["FP_data"].empty:
-            table_name = f"{table_prefix}repeat_fps"
-            counted_FPs = self._prepare_suspicious_table(
-                spark, suspicious_data["FP_data"], id_a, id_b
-            )
-            counted_FPs.write.mode("overwrite").saveAsTable(table_name)
-            print(
-                f"A table of false positives of length {counted_FPs.count()} was saved as '{table_name}' for analysis."
-            )
-        else:
-            print("There were no false positives recorded.")
-
-        if not suspicious_data["FN_data"].empty:
-            table_name = f"{table_prefix}repeat_fns"
-            counted_FNs = self._prepare_suspicious_table(
-                spark, suspicious_data["FN_data"], id_a, id_b
-            )
-            counted_FNs.write.mode("overwrite").saveAsTable(table_name)
-            print(
-                f"A table of false negatives of length {counted_FNs.count()} was saved as '{table_name}' for analysis."
-            )
-        else:
-            print("There were no false negatives recorded.")
-
-        if not suspicious_data["TP_data"].empty:
-            table_name = f"{table_prefix}repeat_tps"
-            counted_TPs = self._prepare_suspicious_table(
-                spark, suspicious_data["TP_data"], id_a, id_b
-            )
-            counted_TPs.write.mode("overwrite").saveAsTable(table_name)
-            print(
-                f"A table of true positives of length {counted_TPs.count()} was saved as '{table_name}' for analysis."
-            )
-        else:
-            print("There were no true positives recorded.")
-
-        if not suspicious_data["TN_data"].empty:
-            table_name = f"{table_prefix}repeat_tns"
-            counted_TNs = self._prepare_suspicious_table(
-                spark, suspicious_data["TN_data"], id_a, id_b
-            )
-            counted_TNs.write.mode("overwrite").saveAsTable(table_name)
-            print(
-                f"A table of true negatives of length {counted_TNs.count()} was saved as '{table_name}' for analysis."
-            )
-        else:
-            print("There were no true negatives recorded.")
-
-    def _create_suspicious_data(self, id_a: str, id_b: str) -> dict[str, Any] | None:
-        """Output Suspicious Data (OTD): used to check config to see if you should find sketchy training data that the models routinely mis-classify"""
-        training_section_name = str(self.task.training_conf)
-        config = self.task.link_run.config
-        training_settings = config[training_section_name]
-
-        if (
-            "output_suspicious_TD" in training_settings
-            and training_settings["output_suspicious_TD"]
-        ):
-            return {
-                "FP_data": pd.DataFrame(),
-                "FN_data": pd.DataFrame(),
-                "TP_data": pd.DataFrame(),
-                "TN_data": pd.DataFrame(),
-                "id_a": id_a,
-                "id_b": id_b,
-            }
-        else:
-            return None
-
 
 def _calc_mcc(TP: int, TN: int, FP: int, FN: int) -> float:
     """
@@ -889,9 +751,7 @@ def _get_probability_and_select_pred_columns(
 def _get_confusion_matrix(
     predictions: pyspark.sql.DataFrame,
     dep_var: str,
-    suspicious_data: dict[str, Any] | None,
 ) -> tuple[int, int, int, int]:
-
     TP = predictions.filter((predictions[dep_var] == 1) & (predictions.prediction == 1))
     TP_count = TP.count()
 
@@ -912,38 +772,6 @@ def _get_confusion_matrix(
     #   f"Confusion matrix -- true negatives and false negatives: FN {FN_count}  TN {TN_count}"
     # )
 
-    if suspicious_data:
-        id_a = suspicious_data["id_a"]
-        id_b = suspicious_data["id_b"]
-
-        new_FP_data = FP.select(
-            id_a, id_b, dep_var, "prediction", "probability"
-        ).toPandas()
-        suspicious_data["FP_data"] = pd.concat(
-            [suspicious_data["FP_data"], new_FP_data]
-        )
-
-        new_FN_data = FN.select(
-            id_a, id_b, dep_var, "prediction", "probability"
-        ).toPandas()
-        suspicious_data["FN_data"] = pd.concat(
-            [suspicious_data["FN_data"], new_FN_data]
-        )
-
-        new_TP_data = TP.select(
-            id_a, id_b, dep_var, "prediction", "probability"
-        ).toPandas()
-        suspicious_data["TP_data"] = pd.concat(
-            [suspicious_data["TP_data"], new_TP_data]
-        )
-
-        new_TN_data = TN.select(
-            id_a, id_b, dep_var, "prediction", "probability"
-        ).toPandas()
-        suspicious_data["TN_data"] = pd.concat(
-            [suspicious_data["TN_data"], new_TN_data]
-        )
-
     return TP_count, FP_count, FN_count, TN_count
 
 
diff --git a/sphinx-docs/config.md b/sphinx-docs/config.md
index 0ed63a3..b5ec9f7 100644
--- a/sphinx-docs/config.md
+++ b/sphinx-docs/config.md
@@ -334,7 +334,6 @@ split_by_id_a = true
 decision = "drop_duplicate_with_threshold_ratio"
 
 n_training_iterations = 2
-output_suspicious_TD = true
 param_grid = true
 model_parameters = [ 
     { type = "random_forest", maxDepth = [7], numTrees = [100], threshold = [0.05, 0.005], threshold_ratio = [1.2, 1.3] },
@@ -361,7 +360,6 @@ split_by_id_a = true
 decision = "drop_duplicate_with_threshold_ratio"
 
 n_training_iterations = 10
-output_suspicious_TD = true
 param_grid = false
 model_parameters = [
     { type = "random_forest", maxDepth = 6, numTrees = 50, threshold = 0.5, threshold_ratio = 1.0 },
@@ -750,7 +748,6 @@ splits = [-1,0,6,11,9999]
   * `n_training_iterations` -- Type: `integer`. Optional; default value is 10. The number of training iterations to use during the `model_exploration` task.
   * `scale_data` -- Type: `boolean`.  Optional. Whether to scale the data as part of the machine learning pipeline.
   * `use_training_data_features` -- Type: `boolean`. Optional. If the identifiers in the training data set are not present in your raw input data, you will need to set this to `true`, or training features will not be able to be generated, giving null column errors.  For example, if the training data set you are using has individuals from 1900 and 1910, but you are about to train a model to score the 1930-1940 potential matches, you need this to be set to `true` or it will fail, since the individual IDs are not present in the 1930 and 1940 raw input data.  If you were about to train a model to score the 1900-1910 potential matches with this same training set, it would be best to set this to `false`, so you can be sure the training features are created from scratch to match your exact current configuration settings, although if you know the features haven't changed, you could set it to `true` to save a small amount of processing time.
-  * `output_suspicious_TD` -- Type: `boolean`.  Optional.  Used in the `model_exploration` link task.  Outputs tables of potential matches that the model repeatedly scores differently than the match value given by the training data.  Helps to identify false positives/false negatives in the training data, as well as areas that need additional training feature coverage in the model, or need increased representation in the training data set.
   * `split_by_id_a` -- Type: `boolean`.  Optional.  Used in the `model_exploration` link task.  When set to true, ensures that all potential matches for a given individual with ID_a are grouped together in the same train-test-split group. For example, if individual histid_a "A304BT" has three potential matches in the training data, one each to histid_b "B200", "C201", and "D425", all of those potential matches would either end up in the "train" split or the "test" split when evaluating the model performance.
   * `feature_importances` -- Type: `boolean`. Optional.  Whether to record
     feature importances or coefficients for the training features when training
@@ -764,7 +761,6 @@ scale_data = false
 dataset = "/path/to/1900_1910_training_data_20191023.csv"
 dependent_var = "match"
 use_training_data_features = false
-output_suspicious_TD = true
 split_by_id_a = true
 
 score_with_model = true
@@ -804,7 +800,6 @@ scale_data = false
 dataset = "/path/to/hh_training_data_1900_1910.csv"
 dependent_var = "match"
 use_training_data_features = false
-output_suspicious_TD = true
 split_by_id_a = true
 score_with_model = true
 feature_importances = true
diff --git a/sphinx-docs/use_examples.md b/sphinx-docs/use_examples.md
index e781202..4d41811 100644
--- a/sphinx-docs/use_examples.md
+++ b/sphinx-docs/use_examples.md
@@ -1,6 +1,5 @@
 # Advanced Workflow Examples 
 
-
 ## Export training data after generating features to reuse in different linking years
 
 It is common to have a single training data set that spans two linked years, which is then used to train a model that is applied to a different set of linked years.  For example, we have a training data set that spans linked individuals from the 1900 census to the 1910 census.  We use this training data to predict links in the full count 1900-1910 linking run, but we also use this training data to link year pairs 1910-1920, 1920-1930, and 1930-1940.  
@@ -66,12 +65,9 @@ However, when this training data set is used for other years, the program does n
 
 8) Launch the hlink program using your new config for the new year pair you want to link. Run your link tasks and export relevant data.
 
-## ML model exploration and export of lists of potential false positives/negatives in training data
-`hlink` accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models.  You can see example `training` and `hh_training` configuration sections that implement this in the [training](config.html#training-and-models) and [household training](config.html#household-training-and-models) sections of the configuration documentation.
-
-The model exploration link task also allows you to export lists of potential false positives (FPs) and false negatives (FNs) in your training data.  This is calculated when running the train/test splits in the regular model exploration tasks if the `output_suspicious_TD` flag is true.
+## An Example Model Exploration Workflow
 
-### Example model exploration and FP/FN export workflow
+`hlink` accepts a matrix of ML models and hyper-parameters to run train/test splits for you, and outputs data you can use to select and tune your models.  You can see example `training` and `hh_training` configuration sections that implement this in the [training](config.html#training-and-models) and [household training](config.html#household-training-and-models) sections of the configuration documentation.
 
 1) Create a config file that has a `training` and/or `hh_training` section with model parameters to explore. For example:
 
@@ -88,9 +84,6 @@ The model exploration link task also allows you to export lists of potential fal
     # source data years weren't identical to the linked years of your training data.
     use_training_data_features = false
 
-    # VERY IMPORTANT if you want to output FPs/FNs
-    output_suspicious_TD = true
-
     split_by_id_a = true
     score_with_model = true
     feature_importances = false
@@ -127,11 +120,4 @@ The model exploration link task also allows you to export lists of potential fal
     hlink $ csv training_results /my/output/1900_1910_training_results.csv
     ```
 
-5) Export the potential FPs and FNs to csv.  For `training` params, the results will be in the `repeat_FPs` and `repeat_FNs` tables, and for `hh_training` in the `hh_repeat_FPs` and `hh_repeat_FNs` tables.
-
-    ```
-    hlink $ csv repeat_FPs /my/output/1900_1910_potential_FPs.csv
-    hlink $ csv repeat_FNs /my/output/1900_1910_potential_FNs.csv
-    ```
-
-6) Use your preferred methods to analyze the data you've just exported.  Update the `chosen_model` in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs.
+5) Use your preferred methods to analyze the data you've just exported.  Update the `chosen_model` in your configuration, and/or create new versions of your training data following your findings and update the path to the new training data in your configs.

From 9755f73c3f95557a765e599ff6b2f6ae831dd81d Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 10 Dec 2024 14:08:12 -0600
Subject: [PATCH 2/6] [#176] Add a unit test for _get_confusion_matrix()

---
 hlink/tests/model_exploration_test.py | 32 ++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 46166c5..7414ef4 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -3,14 +3,16 @@
 #   https://github.com/ipums/hlink
 from collections import Counter
 
-import pytest
 import pandas as pd
+from pyspark.sql import SparkSession
+import pytest
 
 import hlink.linking.core.threshold as threshold_core
 from hlink.linking.model_exploration.link_step_train_test_models import (
     LinkStepTrainTestModels,
     _custom_param_grid_builder,
     _get_model_parameters,
+    _get_confusion_matrix,
 )
 
 
@@ -985,3 +987,31 @@ def test_step_2_split_by_id_a(
     assert splits[1][1].toPandas()["id_a"].unique().tolist() == ["30"]
 
     main.do_drop_all("")
+
+
+def test_get_confusion_matrix(spark: SparkSession) -> None:
+    # 1 true negative (0, 0)
+    # 2 false negatives (1, 0)
+    # 3 false positives (0, 1)
+    # 4 true positives (1, 1)
+    rows = [
+        (0, 0),
+        (1, 0),
+        (0, 1),
+        (1, 0),
+        (0, 1),
+        (1, 1),
+        (0, 1),
+        (1, 1),
+        (1, 1),
+        (1, 1),
+    ]
+    predictions = spark.createDataFrame(rows, schema=["match", "prediction"])
+    true_positives, false_positives, false_negatives, true_negatives = (
+        _get_confusion_matrix(predictions, "match")
+    )
+
+    assert true_positives == 4
+    assert false_positives == 3
+    assert false_negatives == 2
+    assert true_negatives == 1

From c43b57d787c74df2fbf74377330ca370151938eb Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 10 Dec 2024 14:19:58 -0600
Subject: [PATCH 3/6] [#176] Rewrite _get_confusion_matrix() to avoid using 4
 filters + counts

Using a single select() should let us take better advantage of Spark's
parallel/distributed computing. My initial results profiling this are
pretty promising.
---
 .../link_step_train_test_models.py            | 45 ++++++++++---------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 6025998..d779121 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -21,7 +21,7 @@
 from pyspark.ml import Model, Transformer
 import pyspark.sql
 from pyspark.sql import DataFrame
-from pyspark.sql.functions import count, mean
+from pyspark.sql.functions import col, count, count_if, mean
 from functools import reduce
 import hlink.linking.core.threshold as threshold_core
 import hlink.linking.core.classifier as classifier_core
@@ -752,27 +752,30 @@ def _get_confusion_matrix(
     predictions: pyspark.sql.DataFrame,
     dep_var: str,
 ) -> tuple[int, int, int, int]:
-    TP = predictions.filter((predictions[dep_var] == 1) & (predictions.prediction == 1))
-    TP_count = TP.count()
-
-    FP = predictions.filter((predictions[dep_var] == 0) & (predictions.prediction == 1))
-    FP_count = FP.count()
-
-    # print(
-    #   f"Confusion matrix -- true positives and false positivesTP {TP_count} FP {FP_count}"
-    # )
-
-    FN = predictions.filter((predictions[dep_var] == 1) & (predictions.prediction == 0))
-    FN_count = FN.count()
-
-    TN = predictions.filter((predictions[dep_var] == 0) & (predictions.prediction == 0))
-    TN_count = TN.count()
-
-    # print(
-    #   f"Confusion matrix -- true negatives and false negatives: FN {FN_count}  TN {TN_count}"
-    # )
+    """
+    Compute the confusion matrix for the given DataFrame of predictions. The
+    confusion matrix is the count of true positives, false positives, false
+    negatives, and true negatives for the predictions.
 
-    return TP_count, FP_count, FN_count, TN_count
+    Return a tuple (true_positives, false_positives, false_negatives,
+    true_negatives).
+    """
+    prediction_col = col("prediction")
+    label_col = col(dep_var)
+
+    confusion_matrix = predictions.select(
+        count_if((label_col == 1) & (prediction_col == 1)).alias("true_positives"),
+        count_if((label_col == 0) & (prediction_col == 1)).alias("false_positives"),
+        count_if((label_col == 1) & (prediction_col == 0)).alias("false_negatives"),
+        count_if((label_col == 0) & (prediction_col == 0)).alias("true_negatives"),
+    )
+    [confusion_row] = confusion_matrix.collect()
+    return (
+        confusion_row.true_positives,
+        confusion_row.false_positives,
+        confusion_row.false_negatives,
+        confusion_row.true_negatives,
+    )
 
 
 def _get_aggregate_metrics(

From 4aad62ef5680a6007c8636ba0184cfc337797f87 Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 10 Dec 2024 14:57:45 -0600
Subject: [PATCH 4/6] [#176] Add a unit test for _get_aggregate_metrics()

---
 hlink/tests/model_exploration_test.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 7414ef4..7222dbb 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -13,6 +13,7 @@
     _custom_param_grid_builder,
     _get_model_parameters,
     _get_confusion_matrix,
+    _get_aggregate_metrics,
 )
 
 
@@ -1015,3 +1016,20 @@ def test_get_confusion_matrix(spark: SparkSession) -> None:
     assert false_positives == 3
     assert false_negatives == 2
     assert true_negatives == 1
+
+
+def test_get_aggregate_metrics() -> None:
+    true_positives = 3112
+    false_positives = 205
+    false_negatives = 1134
+    true_negatives = 33259
+
+    precision, recall, mcc = _get_aggregate_metrics(
+        true_positives, false_positives, false_negatives, true_negatives
+    )
+
+    assert (
+        abs(precision - 0.9381972) < 0.0001
+    ), "expected precision to be near 0.9381972"
+    assert abs(recall - 0.7329251) < 0.0001, "expected recall to be near 0.7329251"
+    assert abs(mcc - 0.8111208) < 0.0001, "expected MCC to be near 0.8111208"

From 3efbb0c454bf07d7169dc4ab30cc00cae68623eb Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 10 Dec 2024 15:06:48 -0600
Subject: [PATCH 5/6] [#176] Lowercase tp/fp/fn/tn variable names

---
 .../link_step_train_test_models.py            | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index d779121..4693b9a 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -652,13 +652,13 @@ def _capture_prediction_results(
         predictions.createOrReplaceTempView(f"{table_prefix}predictions")
 
         (
-            test_TP_count,
-            test_FP_count,
-            test_FN_count,
-            test_TN_count,
+            tp_count,
+            fp_count,
+            fn_count,
+            tn_count,
         ) = _get_confusion_matrix(predictions, dep_var)
         test_precision, test_recall, test_mcc = _get_aggregate_metrics(
-            test_TP_count, test_FP_count, test_FN_count, test_TN_count
+            tp_count, fp_count, fn_count, tn_count
         )
 
         result = ThresholdTestResult(
@@ -690,15 +690,15 @@ def _save_training_results(
             # )
 
 
-def _calc_mcc(TP: int, TN: int, FP: int, FN: int) -> float:
+def _calc_mcc(tp: int, tn: int, fp: int, fn: int) -> float:
     """
-    Given the counts of true positives (TP), true negatives (TN), false
-    positives (FP), and false negatives (FN) for a model run, compute the
+    Given the counts of true positives (tp), true negatives (tn), false
+    positives (fp), and false negatives (fn) for a model run, compute the
     Matthews Correlation Coefficient (MCC).
     """
-    if (math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))) != 0:
-        mcc = ((TP * TN) - (FP * FN)) / (
-            math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
+    if (math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))) != 0:
+        mcc = ((tp * tn) - (fp * fn)) / (
+            math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
         )
     else:
         mcc = 0
@@ -779,7 +779,7 @@ def _get_confusion_matrix(
 
 
 def _get_aggregate_metrics(
-    TP_count: int, FP_count: int, FN_count: int, TN_count: int
+    true_positives: int, false_positives: int, false_negatives: int, true_negatives: int
 ) -> tuple[float, float, float]:
     """
     Given the counts of true positives, false positives, false negatives, and
@@ -788,15 +788,15 @@ def _get_aggregate_metrics(
 
     Return a tuple of (precision, recall, Matthews Correlation Coefficient).
     """
-    if (TP_count + FP_count) == 0:
+    if (true_positives + false_positives) == 0:
         precision = np.nan
     else:
-        precision = TP_count / (TP_count + FP_count)
-    if (TP_count + FN_count) == 0:
+        precision = true_positives / (true_positives + false_positives)
+    if (true_positives + false_negatives) == 0:
         recall = np.nan
     else:
-        recall = TP_count / (TP_count + FN_count)
-    mcc = _calc_mcc(TP_count, TN_count, FP_count, FN_count)
+        recall = true_positives / (true_positives + false_negatives)
+    mcc = _calc_mcc(true_positives, true_negatives, false_positives, false_negatives)
     return precision, recall, mcc
 
 

From 627eed88263dec922e47fd853ffe402061974220 Mon Sep 17 00:00:00 2001
From: rileyh <rileyh@umn.edu>
Date: Tue, 10 Dec 2024 16:04:54 -0600
Subject: [PATCH 6/6] Try requiring scikit-learn<1.6 when xgboost is installed

---
 pyproject.toml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 57485e4..2a4b001 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,13 @@ lightgbm = [
 xgboost = [
     "xgboost>=2.0",
     "pyarrow>=4.0",
+    # As of 2024-12-10, the latest scikit-learn version (1.6.0) is incompatible
+    # with the latest xgboost version (2.1.3). scikit-learn 1.6.0 came out
+    # yesterday, 2024-12-09, so I'm guessing that this is a temporary bug that
+    # will be resolved with an update to one of the two libraries sometime
+    # sooner rather than later. Until then, we can pin scikit-learn to < 1.6
+    # when using xgboost.
+    "scikit-learn<1.6.0",
 ]
 
 [project.scripts]