From fbb4b4485cd4f99e90bfd109a4d80d76ca3300c6 Mon Sep 17 00:00:00 2001 From: rileyh Date: Thu, 14 Nov 2024 15:11:56 -0600 Subject: [PATCH 01/19] [#161] Add xgboost as an optional dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index efa43f0..5f150de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ dev = [ "sphinx==8.1.3", "recommonmark==0.7.1", ] +xgboost = ["xgboost>=2.0"] [project.scripts] hlink = "hlink.scripts.main:cli" From a51f20f1e08792c0fe318296bf05c8a9eebcfdba Mon Sep 17 00:00:00 2001 From: rileyh Date: Thu, 14 Nov 2024 15:27:06 -0600 Subject: [PATCH 02/19] [#161] Add a test for xgboost classifier support This test is currently failing if you have xgboost installed. If you don't have xgboost installed, it skips itself to prevent failures due to missing packages and dependencies. --- hlink/tests/core/classifier_test.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 hlink/tests/core/classifier_test.py diff --git a/hlink/tests/core/classifier_test.py b/hlink/tests/core/classifier_test.py new file mode 100644 index 0000000..1010262 --- /dev/null +++ b/hlink/tests/core/classifier_test.py @@ -0,0 +1,23 @@ +import pytest + +from hlink.linking.core.classifier import choose_classifier + +try: + import xgboost +except ModuleNotFoundError: + xgboost_available = False +else: + xgboost_available = True + +@pytest.mark.skipif(not xgboost_available, reason="requires the xgboost library") +def test_choose_classifier_supports_xgboost(): + """ + If the xgboost module is installed, then choose_classifier() supports a model + type of "xgboost". + """ + params = { + "max_depth": 2, + "eta": 0.5, + } + classifier = choose_classifier("xgboost", params, "match") + assert classifier.getLabelCol() == "match" From 010f3f54f01035801fd94a9755e49631840340e0 Mon Sep 17 00:00:00 2001 From: rileyh Date: Thu, 14 Nov 2024 15:32:04 -0600 Subject: [PATCH 03/19] [#161] Run black --- hlink/tests/core/classifier_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hlink/tests/core/classifier_test.py b/hlink/tests/core/classifier_test.py index 1010262..4ffa37a 100644 --- a/hlink/tests/core/classifier_test.py +++ b/hlink/tests/core/classifier_test.py @@ -9,6 +9,7 @@ else: xgboost_available = True + @pytest.mark.skipif(not xgboost_available, reason="requires the xgboost library") def test_choose_classifier_supports_xgboost(): """ From a865825237060862269fb1635c68c79a897726db Mon Sep 17 00:00:00 2001 From: rileyh Date: Thu, 14 Nov 2024 15:38:00 -0600 Subject: [PATCH 04/19] [#161] Ignore flake8 unused import error --- hlink/tests/core/classifier_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hlink/tests/core/classifier_test.py b/hlink/tests/core/classifier_test.py index 4ffa37a..7616acb 100644 --- a/hlink/tests/core/classifier_test.py +++ b/hlink/tests/core/classifier_test.py @@ -3,7 +3,7 @@ from hlink.linking.core.classifier import choose_classifier try: - import xgboost + import xgboost # noqa: F401 except ModuleNotFoundError: xgboost_available = False else: From 287912ec8e74099db172e652b26cf33c4c800e11 Mon Sep 17 00:00:00 2001 From: rileyh Date: Thu, 14 Nov 2024 16:06:38 -0600 Subject: [PATCH 05/19] [#161] Create a SparkXGBClassifier in choose_classifier() for model_type xgboost This is only possible when we have the xgboost module, so raise an error if that is not present. --- hlink/linking/core/classifier.py | 23 ++++++++++++++++++++++- hlink/tests/core/classifier_test.py | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/hlink/linking/core/classifier.py b/hlink/linking/core/classifier.py index 0efaf38..bf9f898 100644 --- a/hlink/linking/core/classifier.py +++ b/hlink/linking/core/classifier.py @@ -13,6 +13,13 @@ ) import hlink.linking.transformers.rename_prob_column +try: + import xgboost.spark +except ModuleNotFoundError: + _xgboost_available = False +else: + _xgboost_available = True + def choose_classifier(model_type, params, dep_var): """Returns a classifier and a post_classification transformer given model type and params. @@ -96,7 +103,21 @@ def choose_classifier(model_type, params, dep_var): post_transformer = ( hlink.linking.transformers.rename_prob_column.RenameProbColumn() ) - + elif model_type == "xgboost": + if not _xgboost_available: + raise ModuleNotFoundError( + "model_type 'xgboost' requires the xgboost library" + ) + params_without_threshold = { + key: val + for key, val in params.items() + if key not in {"threshold", "threshold_ratio"} + } + classifier = xgboost.spark.SparkXGBClassifier( + **params_without_threshold, + features_col=features_vector, + label_col=dep_var, + ) else: raise ValueError( "Model type not recognized! Please check your config, reload, and try again." diff --git a/hlink/tests/core/classifier_test.py b/hlink/tests/core/classifier_test.py index 7616acb..cf7d2bd 100644 --- a/hlink/tests/core/classifier_test.py +++ b/hlink/tests/core/classifier_test.py @@ -20,5 +20,5 @@ def test_choose_classifier_supports_xgboost(): "max_depth": 2, "eta": 0.5, } - classifier = choose_classifier("xgboost", params, "match") + classifier, _post_transformer = choose_classifier("xgboost", params, "match") assert classifier.getLabelCol() == "match" From a7b0c37f164ea1af76ebf1a8881c3738982cc2ce Mon Sep 17 00:00:00 2001 From: rileyh Date: Fri, 15 Nov 2024 09:11:29 -0600 Subject: [PATCH 06/19] [#161] Add a test that runs the whole training task with an xgboost model This test is failing right now because we also need pyarrow>=4 when using xgboost. We should add this as a dependency in the xgboost extra. If xgboost isn't installed, this test skips itself. --- hlink/tests/training_test.py | 51 ++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py index 0fbdb0a..94be4f4 100644 --- a/hlink/tests/training_test.py +++ b/hlink/tests/training_test.py @@ -7,6 +7,13 @@ from pyspark.ml import Pipeline import hlink.linking.core.pipeline as pipeline_core +try: + import xgboost # noqa: F401 +except ModuleNotFoundError: + xgboost_available = False +else: + xgboost_available = True + @pytest.mark.quickcheck def test_all_steps( @@ -432,6 +439,50 @@ def test_step_3_with_probit_model( ) +@pytest.mark.skipif(not xgboost_available, reason="requires the xgboost library") +def test_step_3_with_xgboost_model( + spark, training, training_conf, datasource_training_input +): + training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + ] + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = ["namelast_jw", "regionf"] + training_conf["training"]["chosen_model"] = { + "type": "xgboost", + "max_depth": 2, + "eta": 0.5, + "threshold": 0.7, + "threshold_ratio": 1.3, + } + training_conf["training"]["score_with_model"] = True + training_conf["training"]["feature_importances"] = True + + spark.read.csv(prepped_df_a_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_a") + spark.read.csv(prepped_df_b_path, header=True, inferSchema=True).write.mode( + "overwrite" + ).saveAsTable("prepped_df_b") + + training.run_step(0) + training.run_step(1) + training.run_step(2) + training.run_step(3) + + def test_step_3_requires_table(training_conf, training): training_conf["training"]["feature_importances"] = True with pytest.raises(RuntimeError, match="Missing input tables"): From 5c6fdc9ad3686d38dc6be6a5f79d198e6c8d3b6f Mon Sep 17 00:00:00 2001 From: rileyh Date: Fri, 15 Nov 2024 15:56:52 +0000 Subject: [PATCH 07/19] [#161] Update the Dockerfile to support build with different hlink extras This should let us have two different test setups for each Python version. One with xgboost, one without. --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 80d5c6e..0f2e036 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ ARG PYTHON_VERSION=3.10 FROM python:${PYTHON_VERSION} +ARG HLINK_EXTRAS=dev RUN apt-get update && apt-get install default-jre-headless -y @@ -8,4 +9,4 @@ WORKDIR /hlink COPY . . RUN python -m pip install --upgrade pip -RUN pip install -e .[dev] +RUN pip install -e .[${HLINK_EXTRAS}] From a2598112fdcb15012a7729a2be4a4289dad5021b Mon Sep 17 00:00:00 2001 From: Riley Harper <52982949+riley-harper@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:09:02 -0600 Subject: [PATCH 08/19] [#161] Update docker-build.yml to run tests with and without xgboost I've also updated pytest to be more verbose for clarity. --- .github/workflows/docker-build.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 4e42786..fe2b229 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -12,12 +12,13 @@ jobs: fail-fast: false matrix: python_version: ["3.10", "3.11", "3.12"] + hlink_extras: ["dev", "dev,xgboost"] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Build the Docker image - run: docker build . --file Dockerfile --tag $HLINK_TAG-${{ matrix.python_version}} --build-arg PYTHON_VERSION=${{ matrix.python_version }} + run: docker build . --file Dockerfile --tag $HLINK_TAG-${{ matrix.python_version}} --build-arg PYTHON_VERSION=${{ matrix.python_version }} --build-arg HLINK_EXTRAS=${{ matrix.hlink_extras }} - name: Check dependency versions run: | @@ -32,7 +33,7 @@ jobs: run: docker run $HLINK_TAG-${{ matrix.python_version}} flake8 --count . - name: Test - run: docker run $HLINK_TAG-${{ matrix.python_version}} pytest + run: docker run $HLINK_TAG-${{ matrix.python_version}} pytest -v - name: Build sdist and wheel run: docker run $HLINK_TAG-${{ matrix.python_version}} python -m build From a95992cf472c8c41749e52679da5a156c7299b06 Mon Sep 17 00:00:00 2001 From: rileyh Date: Fri, 15 Nov 2024 13:17:01 -0600 Subject: [PATCH 09/19] [#161] Add pyarrow as a dependency for the xgboost extra --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5f150de..9397ee2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,10 @@ dev = [ "sphinx==8.1.3", "recommonmark==0.7.1", ] -xgboost = ["xgboost>=2.0"] +xgboost = [ + "xgboost>=2.0", + "pyarrow>=4.0", +] [project.scripts] hlink = "hlink.scripts.main:cli" From c64cf43c8cfaacca1a44d104d8ae2a2860229ea3 Mon Sep 17 00:00:00 2001 From: rileyh Date: Fri, 15 Nov 2024 13:44:22 -0600 Subject: [PATCH 10/19] [#161] Factor conditional xgboost test logic into a single marker --- hlink/tests/core/classifier_test.py | 12 ++---------- hlink/tests/markers.py | 14 ++++++++++++++ hlink/tests/training_test.py | 10 ++-------- 3 files changed, 18 insertions(+), 18 deletions(-) create mode 100644 hlink/tests/markers.py diff --git a/hlink/tests/core/classifier_test.py b/hlink/tests/core/classifier_test.py index cf7d2bd..473dae1 100644 --- a/hlink/tests/core/classifier_test.py +++ b/hlink/tests/core/classifier_test.py @@ -1,16 +1,8 @@ -import pytest - from hlink.linking.core.classifier import choose_classifier - -try: - import xgboost # noqa: F401 -except ModuleNotFoundError: - xgboost_available = False -else: - xgboost_available = True +from hlink.tests.markers import requires_xgboost -@pytest.mark.skipif(not xgboost_available, reason="requires the xgboost library") +@requires_xgboost def test_choose_classifier_supports_xgboost(): """ If the xgboost module is installed, then choose_classifier() supports a model diff --git a/hlink/tests/markers.py b/hlink/tests/markers.py new file mode 100644 index 0000000..cc264b1 --- /dev/null +++ b/hlink/tests/markers.py @@ -0,0 +1,14 @@ +import pytest + +try: + import xgboost # noqa: F401 +except ModuleNotFoundError: + xgboost_available = False +else: + xgboost_available = True + +requires_xgboost = pytest.mark.skipif( + not xgboost_available, reason="requires the xgboost library" +) +"""For tests which require the xgboost library. This checks whether xgboost is +installed and skips the test if it is not.""" diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py index 94be4f4..7ca5908 100644 --- a/hlink/tests/training_test.py +++ b/hlink/tests/training_test.py @@ -6,13 +6,7 @@ import pytest from pyspark.ml import Pipeline import hlink.linking.core.pipeline as pipeline_core - -try: - import xgboost # noqa: F401 -except ModuleNotFoundError: - xgboost_available = False -else: - xgboost_available = True +from hlink.tests.markers import requires_xgboost @pytest.mark.quickcheck @@ -439,7 +433,7 @@ def test_step_3_with_probit_model( ) -@pytest.mark.skipif(not xgboost_available, reason="requires the xgboost library") +@requires_xgboost def test_step_3_with_xgboost_model( spark, training, training_conf, datasource_training_input ): From 88d719964b10e1656ed00703d4ce1e318b00dfcc Mon Sep 17 00:00:00 2001 From: rileyh Date: Fri, 15 Nov 2024 15:21:06 -0600 Subject: [PATCH 11/19] [#161] Add an integration test for xgboost, set the post-transformer Like some of the other models, xgboost returns an array of probabilities like [probability_no, probability_yes]. So we extract just probability_yes as our probability for hlink purposes. --- hlink/linking/core/classifier.py | 4 + ...egration_score_with_trained_models_test.py | 96 +++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/hlink/linking/core/classifier.py b/hlink/linking/core/classifier.py index bf9f898..65295fe 100644 --- a/hlink/linking/core/classifier.py +++ b/hlink/linking/core/classifier.py @@ -117,6 +117,10 @@ def choose_classifier(model_type, params, dep_var): **params_without_threshold, features_col=features_vector, label_col=dep_var, + probability_col="probability_array", + ) + post_transformer = SQLTransformer( + statement="SELECT *, parseProbVector(probability_array, 1) as probability FROM __THIS__" ) else: raise ValueError( diff --git a/hlink/tests/integration_score_with_trained_models_test.py b/hlink/tests/integration_score_with_trained_models_test.py index 993a497..c04819c 100644 --- a/hlink/tests/integration_score_with_trained_models_test.py +++ b/hlink/tests/integration_score_with_trained_models_test.py @@ -3,6 +3,8 @@ # in this project's top-level directory, and also on-line at: # https://github.com/ipums/hlink +from hlink.tests.markers import requires_xgboost + def test_apply_chosen_model_RF( spark, @@ -859,6 +861,100 @@ def test_step_3_apply_chosen_model_boosted_trees( ) +@requires_xgboost +def test_apply_chosen_model_xgboost( + spark, + training, + matching, + training_conf, + datasource_training_input, + potential_matches_path, + state_dist_path, + spark_test_tmp_dir_path, +): + training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input + training_conf["comparison_features"] = [ + { + "alias": "regionf", + "column_name": "region", + "comparison_type": "fetch_a", + "categorical": True, + }, + { + "alias": "namelast_jw", + "column_name": "namelast", + "comparison_type": "jaro_winkler", + }, + { + "alias": "state_distance", + "key_count": 1, + "column_name": "bpl", + "comparison_type": "geo_distance", + "loc_a": "statecode1", + "loc_b": "statecode2", + "distance_col": "dist", + "table_name": "state_distances_lookup", + "distances_file": state_dist_path, + }, + ] + + training_conf["training"]["dataset"] = training_data_path + training_conf["training"]["dependent_var"] = "match" + training_conf["training"]["independent_vars"] = [ + "namelast_jw", + "regionf", + "state_distance", + ] + training_conf["training"]["chosen_model"] = { + "type": "xgboost", + "max_depth": 5, + "eta": 0.5, + "threshold": 0.5, + "threshold_ratio": 1.3, + } + training_conf["training"]["score_with_model"] = True + training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path + training_conf["drop_data_from_scored_matches"] = True + + prepped_df_a = spark.read.csv(prepped_df_a_path, header=True, inferSchema=True) + prepped_df_b = spark.read.csv(prepped_df_b_path, header=True, inferSchema=True) + potential_matches = spark.read.csv( + potential_matches_path, header=True, inferSchema=True + ) + prepped_df_a.write.mode("overwrite").saveAsTable("prepped_df_a") + prepped_df_b.write.mode("overwrite").saveAsTable("prepped_df_b") + potential_matches.write.mode("overwrite").saveAsTable("potential_matches") + + training.run_all_steps() + matching.run_step(2) + + potential_matches_df = spark.table("scored_potential_matches").toPandas() + + # Check one case that we expect to be a match and one case that we expect not + # to be a match. + should_be_match = potential_matches_df.query( + "id_a == '0202928A-AC3E-48BB-8568-3372067F35C7'" + ) + assert ( + should_be_match.shape[0] == 1 + ), "expected exactly one potential match for 0202928A" + assert should_be_match["probability"].iloc[0] >= 0.5 + assert should_be_match["prediction"].iloc[0] == 1 + + # In the real world, this would probably be a match, depending on how the + # additional features looked. But we've included so few training features + # for our test model that small differences in names can really hurt a + # potential match's chances of being classified as a match. + should_not_be_match = potential_matches_df.query( + "id_b == '033FD0FA-C523-42B5-976A-751E830F7021'" + ) + assert ( + should_not_be_match.shape[0] == 1 + ), "expected exactly one potential match for 033FD0FA" + assert should_not_be_match["probability"].iloc[0] <= 0.5 + assert should_not_be_match["prediction"].iloc[0] == 0 + + def test_step_3_apply_chosen_model_RF_threshold( spark, training_conf, From 97aa7e2f65800b2cb8ae69081f8a7880bebf0d9f Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 10:54:19 -0600 Subject: [PATCH 12/19] [#161] Update test to check xgboost training_feature_importances xgboost has a different setup for feature importances, so the current logic ignores it. We'll need to update the save model metadata step to include logic specifically for xgboost. --- hlink/tests/training_test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py index 7ca5908..ccc62d6 100644 --- a/hlink/tests/training_test.py +++ b/hlink/tests/training_test.py @@ -476,6 +476,14 @@ def test_step_3_with_xgboost_model( training.run_step(2) training.run_step(3) + importances_df = spark.table("training_feature_importances") + assert importances_df.columns == [ + "feature_name", + "category", + "weight", + "average_gain_per_split", + ] + def test_step_3_requires_table(training_conf, training): training_conf["training"]["feature_importances"] = True From 74231695c58f62c0e3f353bce73c0d9631b78731 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 12:16:50 -0600 Subject: [PATCH 13/19] [#161] Pull column and category logic before feature importances logic --- .../training/link_step_save_model_metadata.py | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/hlink/linking/training/link_step_save_model_metadata.py b/hlink/linking/training/link_step_save_model_metadata.py index 00e3922..fef5ff1 100644 --- a/hlink/linking/training/link_step_save_model_metadata.py +++ b/hlink/linking/training/link_step_save_model_metadata.py @@ -63,29 +63,7 @@ def _run(self): vector_assembler = pipeline_model.stages[0] classifier = pipeline_model.stages[1] - print("Retrieving model feature importances or coefficients...") - try: - feature_imp = classifier.coefficients - except: - try: - feature_imp = classifier.featureImportances - except: - print( - "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type." - ) - return - else: - label = "Feature importances" - else: - label = "Coefficients" - column_names = vector_assembler.getInputCols() - # We need to convert from numpy float64s to Python floats to avoid type - # issues when creating the DataFrame below. - feature_importances = [ - float(importance) for importance in feature_imp.toArray() - ] - tf_prepped = self.task.spark.table(f"{table_prefix}training_features_prepped") tf_prepped_schema = dict(tf_prepped.dtypes) tf_prepped_row = tf_prepped.head() @@ -108,6 +86,28 @@ def _run(self): base_col = col.removesuffix("_imp") true_cols.append((base_col, None)) + print("Retrieving model feature importances or coefficients...") + try: + feature_imp = classifier.coefficients + except: + try: + feature_imp = classifier.featureImportances + except: + print( + "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type." + ) + return + else: + label = "Feature importances" + else: + label = "Coefficients" + + # We need to convert from numpy float64s to Python floats to avoid type + # issues when creating the DataFrame below. + feature_importances = [ + float(importance) for importance in feature_imp.toArray() + ] + true_column_names = [column_name for (column_name, _) in true_cols] true_categories = [category for (_, category) in true_cols] From ffba81a0ff829bf7316c82e6ad9bb4763ae222cd Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 12:49:04 -0600 Subject: [PATCH 14/19] [#161] Support saving model metadata for xgboost This is really different from the Spark models, so I've made it a special case instead of trying to integrate it with the previous logic closely. This section might be due for some refactoring now. --- .../training/link_step_save_model_metadata.py | 65 ++++++++++++------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/hlink/linking/training/link_step_save_model_metadata.py b/hlink/linking/training/link_step_save_model_metadata.py index fef5ff1..ed98274 100644 --- a/hlink/linking/training/link_step_save_model_metadata.py +++ b/hlink/linking/training/link_step_save_model_metadata.py @@ -86,35 +86,52 @@ def _run(self): base_col = col.removesuffix("_imp") true_cols.append((base_col, None)) + true_column_names = [column_name for (column_name, _) in true_cols] + true_categories = [category for (_, category) in true_cols] + model_type = config[training_conf]["chosen_model"]["type"] + print("Retrieving model feature importances or coefficients...") - try: - feature_imp = classifier.coefficients - except: + + if model_type == "xgboost": + raw_weights = classifier.get_feature_importances("weight") + raw_gains = classifier.get_feature_importances("gain") + keys = [f"f{index}" for index in range(len(true_cols))] + + weights = [raw_weights.get(key, 0.0) for key in keys] + gains = [raw_gains.get(key, 0.0) for key in keys] + label = "Feature importances (weights and gain)" + + features_df = self.task.spark.createDataFrame( + zip(true_column_names, true_categories, weights, gains), + "feature_name: string, category: int, weight: double, average_gain_per_split: double", + ).sort("feature_name", "category") + else: try: - feature_imp = classifier.featureImportances + feature_imp = classifier.coefficients except: - print( - "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type." - ) - return + try: + feature_imp = classifier.featureImportances + except: + print( + "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type." + ) + return + else: + label = "Feature importances" else: - label = "Feature importances" - else: - label = "Coefficients" + label = "Coefficients" - # We need to convert from numpy float64s to Python floats to avoid type - # issues when creating the DataFrame below. - feature_importances = [ - float(importance) for importance in feature_imp.toArray() - ] - - true_column_names = [column_name for (column_name, _) in true_cols] - true_categories = [category for (_, category) in true_cols] - - features_df = self.task.spark.createDataFrame( - zip(true_column_names, true_categories, feature_importances, strict=True), - "feature_name: string, category: int, coefficient_or_importance: double", - ).sort("feature_name", "category") + # We need to convert from numpy float64s to Python floats to avoid type + # issues when creating the DataFrame below. + feature_importances = [ + float(importance) for importance in feature_imp.toArray() + ] + features_df = self.task.spark.createDataFrame( + zip( + true_column_names, true_categories, feature_importances, strict=True + ), + "feature_name: string, category: int, coefficient_or_importance: double", + ).sort("feature_name", "category") feature_importances_table = ( f"{self.task.table_prefix}training_feature_importances" From 0277d7d8a4d06e3c4fd2ab9fad495e3e0dd35f0d Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 12:53:03 -0600 Subject: [PATCH 15/19] [#161] Rename a variable in training step 3 --- .../training/link_step_save_model_metadata.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/hlink/linking/training/link_step_save_model_metadata.py b/hlink/linking/training/link_step_save_model_metadata.py index ed98274..88f82a4 100644 --- a/hlink/linking/training/link_step_save_model_metadata.py +++ b/hlink/linking/training/link_step_save_model_metadata.py @@ -58,10 +58,9 @@ def _run(self): raise new_error from e - # The pipeline model has three stages: vector assembler, classifier, post - # transformer. + # The pipeline model has three stages: vector assembler, model, and post transformer. vector_assembler = pipeline_model.stages[0] - classifier = pipeline_model.stages[1] + model = pipeline_model.stages[1] column_names = vector_assembler.getInputCols() tf_prepped = self.task.spark.table(f"{table_prefix}training_features_prepped") @@ -93,13 +92,13 @@ def _run(self): print("Retrieving model feature importances or coefficients...") if model_type == "xgboost": - raw_weights = classifier.get_feature_importances("weight") - raw_gains = classifier.get_feature_importances("gain") + raw_weights = model.get_feature_importances("weight") + raw_gains = model.get_feature_importances("gain") keys = [f"f{index}" for index in range(len(true_cols))] weights = [raw_weights.get(key, 0.0) for key in keys] gains = [raw_gains.get(key, 0.0) for key in keys] - label = "Feature importances (weights and gain)" + label = "Feature importances (weights and gains)" features_df = self.task.spark.createDataFrame( zip(true_column_names, true_categories, weights, gains), @@ -107,10 +106,10 @@ def _run(self): ).sort("feature_name", "category") else: try: - feature_imp = classifier.coefficients + feature_imp = model.coefficients except: try: - feature_imp = classifier.featureImportances + feature_imp = model.featureImportances except: print( "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type." From 3ca39529d918a97bf089f342ec698856156a0b7d Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 15:13:50 -0600 Subject: [PATCH 16/19] [#161] Make the "xgboost is missing" error message more helpful --- hlink/linking/core/classifier.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hlink/linking/core/classifier.py b/hlink/linking/core/classifier.py index 65295fe..8ad400f 100644 --- a/hlink/linking/core/classifier.py +++ b/hlink/linking/core/classifier.py @@ -106,7 +106,9 @@ def choose_classifier(model_type, params, dep_var): elif model_type == "xgboost": if not _xgboost_available: raise ModuleNotFoundError( - "model_type 'xgboost' requires the xgboost library" + "To use the experimental 'xgboost' model type, you need to install " + "the xgboost library and its dependencies. Try installing hlink with " + "the xgboost extra: 'pip install hlink[xgboost]'." ) params_without_threshold = { key: val From b992ba50eb14f9289259dcb4c407d88b6863648a Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 15:57:36 -0600 Subject: [PATCH 17/19] [#161] Update the README with information on XGBoost --- README.md | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c020e7a..d2ee155 100755 --- a/README.md +++ b/README.md @@ -26,19 +26,52 @@ We do our best to make hlink compatible with Python 3.10-3.12. If you have a problem using hlink on one of these versions of Python, please open an issue through GitHub. Versions of Python older than 3.10 are not supported. -Note that pyspark 3.5 does not yet officially support Python 3.12. If you -encounter pyspark-related import errors while running hlink on Python 3.12, try +Note that PySpark 3.5 does not yet officially support Python 3.12. If you +encounter PySpark-related import errors while running hlink on Python 3.12, try - Installing the setuptools package. The distutils package was deleted from the - standard library in Python 3.12, but some versions of pyspark still import + standard library in Python 3.12, but some versions of PySpark still import it. The setuptools package provides a hacky stand-in distutils library which - should fix some import errors in pyspark. We install setuptools in our + should fix some import errors in PySpark. We install setuptools in our development and test dependencies so that our tests work on Python 3.12. -- Downgrading Python to 3.10 or 3.11. Pyspark officially supports these - versions of Python. So you should have better chances getting pyspark to work +- Downgrading Python to 3.10 or 3.11. PySpark officially supports these + versions of Python. So you should have better chances getting PySpark to work well on Python 3.10 or 3.11. +### XGBoost Support + +[XGBoost](https://xgboost.readthedocs.io/en/stable/index.html) is a highly +performant gradient boosting machine learning library. hlink includes optional +support for XGBoost through the xgboost Python package. This support is +experimental and may change since the XGBoost-PySpark integration provided by +the xgboost package is currently unstable. + +To install the xgboost package and its Python dependencies, run `pip install +hlink[xgboost]`. This may be enough to get xgboost running on some machines. If +you run into further errors, you might need to install the libomp package, +which xgboost requires. + +After installing xgboost, you can use it as a model type in training and model +exploration. xgboost has a large list of available parameters, which you can +check out [here](https://xgboost.readthedocs.io/en/latest/parameter.html). +hlink passes parameters defined in your config file through to the xgboost +library. + +```toml +# max_depth, eta, and gamma are parameters for xgboost. threshold and +# threshold_ratio are hlink-specific configurations universal to all model types. +chosen_model = { + type = "xgboost", + max_depth = 5, + eta = 0.5, + gamma = 0.05, + threshold = 0.5, + threshold_ratio = 2.0 +} +``` + + ## Docs The documentation site can be found at [hlink.docs.ipums.org](https://hlink.docs.ipums.org). From 3065310e5a78fe8a41a0b37b76d0961576e26828 Mon Sep 17 00:00:00 2001 From: rileyh Date: Mon, 18 Nov 2024 16:22:01 -0600 Subject: [PATCH 18/19] [#161] Add information about xgboost to models.md --- sphinx-docs/models.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sphinx-docs/models.md b/sphinx-docs/models.md index a1c9996..357056d 100644 --- a/sphinx-docs/models.md +++ b/sphinx-docs/models.md @@ -87,3 +87,38 @@ chosen_model = { threshold_ratio = 1.3 } ``` + +## xgboost + +*Added in version 3.8.0.* + +This is an alternate, high-performance implementation of gradient boosting. +It uses [xgboost.spark.SparkXGBClassifier](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.spark.SparkXGBClassifier). +Since the XGBoost-PySpark integration which the xgboost Python package provides +is currently unstable, support for the xgboost model type is disabled in hlink +by default. hlink will stop with an error if you try to use this model type +without enabling support for it. To enable support for xgboost, install hlink +with the `xgboost` extra. + +``` +pip install hlink[xgboost] +``` + +This installs the xgboost package and its Python dependencies. Depending on +your machine and operating system, you may also need to install the libomp +library, which is another dependency of xgboost. xgboost should raise a helpful +error if it detects that you need to install libomp. + +You can view a list of xgboost's parameters +[here](https://xgboost.readthedocs.io/en/latest/parameter.html). + +``` +chosen_model = { + type = "xgboost", + max_depth = 5, + eta = 0.5, + gamma = 0.05, + threshold = 0.8, + threshold_ratio = 1.5 +} +``` From ab1d83a228cdea0995072b8fde305420a203e85b Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 19 Nov 2024 11:16:37 -0600 Subject: [PATCH 19/19] [#161] Regenerate Sphinx docs This also updates Alabaster to 1.0.0. --- docs/_sources/models.md.txt | 35 ++++++++ docs/_static/alabaster.css | 115 ++++++++----------------- docs/_static/github-banner.svg | 5 ++ docs/column_mappings.html | 26 +++--- docs/comparison_features.html | 26 +++--- docs/comparisons.html | 26 +++--- docs/config.html | 26 +++--- docs/feature_selection_transforms.html | 26 +++--- docs/genindex.html | 26 +++--- docs/index.html | 27 +++--- docs/installation.html | 26 +++--- docs/introduction.html | 26 +++--- docs/link_tasks.html | 26 +++--- docs/models.html | 57 +++++++++--- docs/pipeline_features.html | 26 +++--- docs/running_the_program.html | 26 +++--- docs/search.html | 5 +- docs/searchindex.js | 2 +- docs/substitutions.html | 26 +++--- docs/use_examples.html | 26 +++--- 20 files changed, 306 insertions(+), 278 deletions(-) create mode 100644 docs/_static/github-banner.svg diff --git a/docs/_sources/models.md.txt b/docs/_sources/models.md.txt index a1c9996..357056d 100644 --- a/docs/_sources/models.md.txt +++ b/docs/_sources/models.md.txt @@ -87,3 +87,38 @@ chosen_model = { threshold_ratio = 1.3 } ``` + +## xgboost + +*Added in version 3.8.0.* + +This is an alternate, high-performance implementation of gradient boosting. +It uses [xgboost.spark.SparkXGBClassifier](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.spark.SparkXGBClassifier). +Since the XGBoost-PySpark integration which the xgboost Python package provides +is currently unstable, support for the xgboost model type is disabled in hlink +by default. hlink will stop with an error if you try to use this model type +without enabling support for it. To enable support for xgboost, install hlink +with the `xgboost` extra. + +``` +pip install hlink[xgboost] +``` + +This installs the xgboost package and its Python dependencies. Depending on +your machine and operating system, you may also need to install the libomp +library, which is another dependency of xgboost. xgboost should raise a helpful +error if it detects that you need to install libomp. + +You can view a list of xgboost's parameters +[here](https://xgboost.readthedocs.io/en/latest/parameter.html). + +``` +chosen_model = { + type = "xgboost", + max_depth = 5, + eta = 0.5, + gamma = 0.05, + threshold = 0.8, + threshold_ratio = 1.5 +} +``` diff --git a/docs/_static/alabaster.css b/docs/_static/alabaster.css index e3174bf..7e75bf8 100644 --- a/docs/_static/alabaster.css +++ b/docs/_static/alabaster.css @@ -1,5 +1,3 @@ -@import url("basic.css"); - /* -- page layout ----------------------------------------------------------- */ body { @@ -160,8 +158,8 @@ div.sphinxsidebar input { font-size: 1em; } -div.sphinxsidebar #searchbox input[type="text"] { - width: 160px; +div.sphinxsidebar #searchbox { + margin: 1em 0; } div.sphinxsidebar .search > div { @@ -263,10 +261,6 @@ div.admonition p.last { margin-bottom: 0; } -div.highlight { - background-color: #fff; -} - dt:target, .highlight { background: #FAF3E8; } @@ -454,7 +448,7 @@ ul, ol { } pre { - background: #EEE; + background: unset; padding: 7px 30px; margin: 15px 0px; line-height: 1.3em; @@ -485,15 +479,15 @@ a.reference { border-bottom: 1px dotted #004B6B; } +a.reference:hover { + border-bottom: 1px solid #6D4100; +} + /* Don't put an underline on images */ a.image-reference, a.image-reference:hover { border-bottom: none; } -a.reference:hover { - border-bottom: 1px solid #6D4100; -} - a.footnote-reference { text-decoration: none; font-size: 0.7em; @@ -509,68 +503,7 @@ a:hover tt, a:hover code { background: #EEE; } - -@media screen and (max-width: 870px) { - - div.sphinxsidebar { - display: none; - } - - div.document { - width: 100%; - - } - - div.documentwrapper { - margin-left: 0; - margin-top: 0; - margin-right: 0; - margin-bottom: 0; - } - - div.bodywrapper { - margin-top: 0; - margin-right: 0; - margin-bottom: 0; - margin-left: 0; - } - - ul { - margin-left: 0; - } - - li > ul { - /* Matches the 30px from the "ul, ol" selector above */ - margin-left: 30px; - } - - .document { - width: auto; - } - - .footer { - width: auto; - } - - .bodywrapper { - margin: 0; - } - - .footer { - width: auto; - } - - .github { - display: none; - } - - - -} - - - -@media screen and (max-width: 875px) { +@media screen and (max-width: 940px) { body { margin: 0; @@ -580,12 +513,16 @@ a:hover tt, a:hover code { div.documentwrapper { float: none; background: #fff; + margin-left: 0; + margin-top: 0; + margin-right: 0; + margin-bottom: 0; } div.sphinxsidebar { display: block; float: none; - width: 102.5%; + width: unset; margin: 50px -30px -20px -30px; padding: 10px 20px; background: #333; @@ -620,8 +557,14 @@ a:hover tt, a:hover code { div.body { min-height: 0; + min-width: auto; /* fixes width on small screens, breaks .hll */ padding: 0; } + + .hll { + /* "fixes" the breakage */ + width: max-content; + } .rtd_doc_footer { display: none; @@ -635,13 +578,18 @@ a:hover tt, a:hover code { width: auto; } - .footer { - width: auto; - } - .github { display: none; } + + ul { + margin-left: 0; + } + + li > ul { + /* Matches the 30px from the "ul, ol" selector above */ + margin-left: 30px; + } } @@ -705,4 +653,11 @@ nav#breadcrumbs li+li:before { div.related { display: none; } +} + +img.github { + position: absolute; + top: 0; + border: 0; + right: 0; } \ No newline at end of file diff --git a/docs/_static/github-banner.svg b/docs/_static/github-banner.svg new file mode 100644 index 0000000..c47d9dc --- /dev/null +++ b/docs/_static/github-banner.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/docs/column_mappings.html b/docs/column_mappings.html index 623d185..c191199 100644 --- a/docs/column_mappings.html +++ b/docs/column_mappings.html @@ -7,7 +7,8 @@ Column Mappings — hlink 3.7.0 documentation - + + @@ -369,7 +370,16 @@

hlink

-

Navigation

+ + +

Navigation

- - @@ -430,7 +430,7 @@

Quick search

| Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Comparison Features — hlink 3.7.0 documentation - + + @@ -1267,7 +1268,16 @@

hlink

-

Navigation

+ + +

Navigation

- - @@ -1329,7 +1329,7 @@

Quick search

| Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Comparisons — hlink 3.7.0 documentation - + + @@ -164,7 +165,16 @@

hlink

-

Navigation

+ + +

Navigation

- - @@ -225,7 +225,7 @@

Quick search

| Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Configuration — hlink 3.7.0 documentation - + + @@ -912,7 +913,16 @@

hlink

-

Navigation

+ + +

Navigation

- - @@ -986,7 +986,7 @@

Quick search

| Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Feature Selection Transforms — hlink 3.7.0 documentation - + + @@ -184,7 +185,16 @@

hlink

-

Navigation

+ + +

Navigation

- - @@ -248,7 +248,7 @@

Quick search

| Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Index — hlink 3.7.0 documentation - + + @@ -52,7 +53,16 @@

hlink

-

Navigation

+ + +

Navigation

- - @@ -106,7 +106,7 @@

Quick search

| Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 diff --git a/docs/index.html b/docs/index.html index fa2a68c..d33d6db 100644 --- a/docs/index.html +++ b/docs/index.html @@ -7,7 +7,8 @@ Welcome to hlink’s documentation! — hlink 3.7.0 documentation - + + @@ -134,6 +135,7 @@

Configuration APIlogistic_regression
  • decision_tree
  • gradient_boosted_trees
  • +
  • xgboost
  • @@ -156,7 +158,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -211,7 +212,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Installation — hlink 3.7.0 documentation - + + @@ -75,7 +76,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -136,7 +136,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Introduction — hlink 3.7.0 documentation - + + @@ -89,7 +90,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -148,7 +148,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Link Tasks — hlink 3.7.0 documentation - + + @@ -237,7 +238,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -301,7 +301,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Models — hlink 3.7.0 documentation - + + @@ -132,6 +133,36 @@

    gradient_boosted_trees +
    +

    xgboost

    +

    Added in version 3.8.0.

    +

    This is an alternate, high-performance implementation of gradient boosting. +It uses xgboost.spark.SparkXGBClassifier. +Since the XGBoost-PySpark integration which the xgboost Python package provides +is currently unstable, support for the xgboost model type is disabled in hlink +by default. hlink will stop with an error if you try to use this model type +without enabling support for it. To enable support for xgboost, install hlink +with the xgboost extra.

    +
    pip install hlink[xgboost]
    +
    +
    +

    This installs the xgboost package and its Python dependencies. Depending on +your machine and operating system, you may also need to install the libomp +library, which is another dependency of xgboost. xgboost should raise a helpful +error if it detects that you need to install libomp.

    +

    You can view a list of xgboost’s parameters +here.

    +
    chosen_model = {
    +    type = "xgboost",
    +    max_depth = 5,
    +    eta = 0.5,
    +    gamma = 0.05,
    +    threshold = 0.8,
    +    threshold_ratio = 1.5
    +}
    +
    +
    +
    @@ -150,7 +181,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    @@ -185,16 +226,6 @@

    Related Topics

    - - @@ -212,7 +243,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Pipeline generated features — hlink 3.7.0 documentation - + + @@ -99,7 +100,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -158,7 +158,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Running hlink — hlink 3.7.0 documentation - + + @@ -285,7 +286,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -345,7 +345,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Search — hlink 3.7.0 documentation - + + @@ -123,7 +124,7 @@

    Related Topics

    | Powered by
    Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 diff --git a/docs/searchindex.js b/docs/searchindex.js index 149159e..85c193f 100644 --- a/docs/searchindex.js +++ b/docs/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1:1 substitution by data table": [[12, "substitution-by-data-table"]], "Advanced Config File": [[3, "advanced-config-file"]], "Advanced Usage": [[0, "advanced-usage"]], "Advanced Workflow Examples": [[13, null]], "Aggregate Features": [[1, "aggregate-features"]], "Basic Config File": [[3, "basic-config-file"]], "Basic Usage": [[0, "basic-usage"]], "Blocking": [[3, "blocking"]], "Column Mappings": [[0, null], [3, "column-mappings"]], "Comparison Features": [[1, null], [3, "comparison-features"]], "Comparison Types": [[1, "comparison-types"], [2, "comparison-types"]], "Comparisons": [[2, null], [3, "comparisons"]], "Configuration": [[3, null]], "Configuration API": [[5, "configuration-api"], [5, null]], "Data sources": [[3, "data-sources"]], "Defining Multiple Comparisons": [[2, "defining-multiple-comparisons"]], "Example interactive mode workflow": [[11, "example-interactive-mode-workflow"]], "Example model exploration and FP/FN export workflow": [[13, "example-model-exploration-and-fp-fn-export-workflow"]], "Example training data export with generated ML features": [[13, "example-training-data-export-with-generated-ml-features"]], "Export training data after generating features to reuse in different linking years": [[13, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Feature Selection Transforms": [[4, null]], "Feature Selections": [[3, "feature-selections"]], "Feature add-ons": [[1, "feature-add-ons"]], "Filter": [[3, "filter"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "Household Comparisons": [[3, "household-comparisons"]], "Household Matching": [[8, "household-matching"]], "Household training and models": [[3, "household-training-and-models"]], "Installation": [[6, null]], "Installing from PyPI": [[6, "installing-from-pypi"]], "Installing from source": [[6, "installing-from-source"]], "Interactive Mode": [[11, "interactive-mode"]], "Introduction": [[7, null]], "Link Tasks": [[8, null]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[13, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Matching": [[8, "matching"]], "Model Exploration and Household Model Exploration": [[8, "model-exploration-and-household-model-exploration"]], "Models": [[9, null]], "Multiple Comparisons": [[3, "multiple-comparisons"]], "Overview": [[2, "overview"], [7, "overview"], [8, "overview"], [8, "id1"], [8, "id4"], [8, "id7"], [8, "id10"], [8, "id13"]], "Pipeline generated features": [[10, null]], "Pipeline-generated Features": [[3, "pipeline-generated-features"]], "Potential Matches Universe": [[3, "potential-matches-universe"]], "Preprocessing": [[8, "preprocessing"]], "Related Configuration Sections": [[8, "related-configuration-sections"], [8, "id3"], [8, "id6"], [8, "id9"], [8, "id12"], [8, "id15"]], "Reporting": [[8, "reporting"]], "Requirements": [[6, "requirements"]], "Running Linking Tasks and Steps": [[11, "running-linking-tasks-and-steps"]], "Running hlink": [[11, null]], "Single Comparison": [[3, "single-comparison"]], "Starting the program": [[11, "starting-the-program"]], "Substitution Columns": [[3, "substitution-columns"]], "Substitution by regex word replace": [[12, "substitution-by-regex-word-replace"]], "Substitutions": [[12, null]], "Task steps": [[8, "task-steps"], [8, "id2"], [8, "id5"], [8, "id8"], [8, "id11"], [8, "id14"]], "Top level configs": [[3, "top-level-configs"]], "Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, "all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], "bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": [[1, "neither-are-null"]], "not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, "soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["column_mappings.md", "comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": 13, "1900_1910_potential_fp": 13, "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 11, 13], "It": [0, 1, 2, 3, 7, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], "accord": 1, "across": 1, "ad": [0, 1, 2, 3], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 11], "addl": 1, "addl_var": 1, "adjust": 11, "adopt": 0, "advanc": 5, "affect": 4, "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8, 13], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 10, 11, 13], "altern": [0, 3], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": 3, "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 9, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, "bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": 1, "c201": 3, "calcul": [1, 13], "call": 0, "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": 11, "classif": [8, 9], "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 13], "commonli": 9, "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, "convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": [7, 8], "could": [0, 2, 3], "count": [1, 11, 13], "counti": [0, 1], "county_1900_1910_distances_km": 1, "county_a": 1, "county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": 3, "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 9, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 9, 11], "detail": [0, 3, 11], "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": 2, "directori": [6, 11, 13], "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, "either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 11], "encod": [3, 4], "end": [0, 1, 3, 4, 12], "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 10], "especi": 3, "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explan": 9, "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": 1, "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 5, 7, 11], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": 4, "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], "filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "foreign": 1, "forest": [5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 9, 10, 11, 13], "from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 9, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [3, 8, 11], "here": [2, 3, 8, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": 13, "hh_repeat_fp": 13, "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": 11, "highest": [1, 3], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": [9, 13], "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, "ignor": 7, "ii": [0, 3], "iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": 13, "implicitli": 2, "import": [3, 8, 11, 13], "improv": 8, "includ": [1, 3, 8, 10, 11], "incompar": 1, "increas": [3, 10], "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": 5, "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": 6, "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], "jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "lead": 0, "learn": [1, 2, 3, 7, 8, 11, 13], "least": [0, 1], "leav": 0, "left": 9, "length": [1, 3, 10], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "librari": [5, 7], "like": [0, 2, 3, 8, 11], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 5, 9, 11, 12], "liter": 3, "ll": 11, "load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, "locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 11], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 5, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 11, 13], "model_paramet": [3, 8, 9, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, "my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], "namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], "namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 11, 13], "neg": [3, 5, 7], "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": 8, "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 13], "ons": 5, "oper": [0, 1, 2, 3], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 13], "otherwis": [0, 1, 10, 13], "our": 11, "out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [3, 13], "output_table_nam": 11, "output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], "override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": 6, "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": [9, 13], "param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 12], "persist": 11, "person": [0, 1, 7], "pip": 6, "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 3, 5, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], "raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8], "recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, "regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": 13, "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": 13, "repeat_fp": 13, "repeatedli": 3, "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": [3, 8], "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 9, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": 1, "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 12], "secondari": 1, "secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 13], "see": [1, 3, 6, 11, 13], "seen": 1, "select": [0, 1, 5, 7, 11, 13], "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, "set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 13], "some": [0, 1, 2, 3, 4, 7, 8, 11], "someth": 11, "sometim": 3, "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "special": 1, "specif": [1, 3, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, "state_dist": 1, "state_distance_lookup": 1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": 0, "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 12], "strip": [0, 8], "structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "syntax": 2, "system": 6, "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 10, 11], "thu": 1, "time": [0, 3, 8, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 10, 12, 13], "try": 3, "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": 11, "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, "us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, "varieti": 8, "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8, 13], "version": [0, 6, 13], "vi": 3, "via": [6, 7], "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": 1, "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, "x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 11, 12, 13], "your": [2, 3, 4, 6, 8, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "any_equ": 1, "api": 5, "arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], "data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": 13, "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": 13, "fp": 13, "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "link": [8, 11, 13], "list": 13, "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": 13, "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": 13, "potenti": [3, 13], "power": [1, 4], "preprocess": 8, "present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, "remove_prefix": 0, "remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": [8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "year": 13}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1:1 substitution by data table": [[12, "substitution-by-data-table"]], "Advanced Config File": [[3, "advanced-config-file"]], "Advanced Usage": [[0, "advanced-usage"]], "Advanced Workflow Examples": [[13, null]], "Aggregate Features": [[1, "aggregate-features"]], "Basic Config File": [[3, "basic-config-file"]], "Basic Usage": [[0, "basic-usage"]], "Blocking": [[3, "blocking"]], "Column Mappings": [[0, null], [3, "column-mappings"]], "Comparison Features": [[1, null], [3, "comparison-features"]], "Comparison Types": [[1, "comparison-types"], [2, "comparison-types"]], "Comparisons": [[2, null], [3, "comparisons"]], "Configuration": [[3, null]], "Configuration API": [[5, "configuration-api"], [5, null]], "Data sources": [[3, "data-sources"]], "Defining Multiple Comparisons": [[2, "defining-multiple-comparisons"]], "Example interactive mode workflow": [[11, "example-interactive-mode-workflow"]], "Example model exploration and FP/FN export workflow": [[13, "example-model-exploration-and-fp-fn-export-workflow"]], "Example training data export with generated ML features": [[13, "example-training-data-export-with-generated-ml-features"]], "Export training data after generating features to reuse in different linking years": [[13, "export-training-data-after-generating-features-to-reuse-in-different-linking-years"]], "Feature Selection Transforms": [[4, null]], "Feature Selections": [[3, "feature-selections"]], "Feature add-ons": [[1, "feature-add-ons"]], "Filter": [[3, "filter"]], "Household Aggregate Features": [[1, "household-aggregate-features"]], "Household Comparisons": [[3, "household-comparisons"]], "Household Matching": [[8, "household-matching"]], "Household training and models": [[3, "household-training-and-models"]], "Installation": [[6, null]], "Installing from PyPI": [[6, "installing-from-pypi"]], "Installing from source": [[6, "installing-from-source"]], "Interactive Mode": [[11, "interactive-mode"]], "Introduction": [[7, null]], "Link Tasks": [[8, null]], "ML model exploration and export of lists of potential false positives/negatives in training data": [[13, "ml-model-exploration-and-export-of-lists-of-potential-false-positives-negatives-in-training-data"]], "Matching": [[8, "matching"]], "Model Exploration and Household Model Exploration": [[8, "model-exploration-and-household-model-exploration"]], "Models": [[9, null]], "Multiple Comparisons": [[3, "multiple-comparisons"]], "Overview": [[2, "overview"], [7, "overview"], [8, "overview"], [8, "id1"], [8, "id4"], [8, "id7"], [8, "id10"], [8, "id13"]], "Pipeline generated features": [[10, null]], "Pipeline-generated Features": [[3, "pipeline-generated-features"]], "Potential Matches Universe": [[3, "potential-matches-universe"]], "Preprocessing": [[8, "preprocessing"]], "Related Configuration Sections": [[8, "related-configuration-sections"], [8, "id3"], [8, "id6"], [8, "id9"], [8, "id12"], [8, "id15"]], "Reporting": [[8, "reporting"]], "Requirements": [[6, "requirements"]], "Running Linking Tasks and Steps": [[11, "running-linking-tasks-and-steps"]], "Running hlink": [[11, null]], "Single Comparison": [[3, "single-comparison"]], "Starting the program": [[11, "starting-the-program"]], "Substitution Columns": [[3, "substitution-columns"]], "Substitution by regex word replace": [[12, "substitution-by-regex-word-replace"]], "Substitutions": [[12, null]], "Task steps": [[8, "task-steps"], [8, "id2"], [8, "id5"], [8, "id8"], [8, "id11"], [8, "id14"]], "Top level configs": [[3, "top-level-configs"]], "Training and Household Training": [[8, "training-and-household-training"]], "Training and models": [[3, "training-and-models"]], "Transformer types": [[10, "transformer-types"]], "Transforms": [[0, "transforms"]], "Using hlink as a Library": [[11, "using-hlink-as-a-library"]], "Welcome to hlink\u2019s documentation!": [[5, null]], "abs_diff": [[1, "abs-diff"]], "add_to_a": [[0, "add-to-a"]], "alias": [[1, "alias"]], "all_equals": [[1, "all-equals"]], "and": [[1, "and"]], "any_equals": [[1, "any-equals"]], "array": [[4, "array"]], "array_index": [[0, "array-index"]], "b_minus_a": [[1, "b-minus-a"]], "bigrams": [[4, "bigrams"]], "btwn_threshold": [[1, "btwn-threshold"]], "bucketizer": [[10, "bucketizer"]], "caution_comp_3": [[1, "caution-comp-3"]], "caution_comp_3_012": [[1, "caution-comp-3-012"]], "caution_comp_4": [[1, "caution-comp-4"]], "caution_comp_4_012": [[1, "caution-comp-4-012"]], "concat_to_a": [[0, "concat-to-a"]], "concat_to_b": [[0, "concat-to-b"]], "concat_two_cols": [[0, "concat-two-cols"]], "condense_strip_whitespace": [[0, "condense-strip-whitespace"]], "decision_tree": [[9, "decision-tree"]], "divide_by_int": [[0, "divide-by-int"]], "either_are_0": [[1, "either-are-0"]], "either_are_1": [[1, "either-are-1"]], "equals": [[1, "equals"]], "equals_as_int": [[1, "equals-as-int"]], "exact_mult": [[1, "exact-mult"]], "extra_children": [[1, "extra-children"]], "f1_match": [[1, "f1-match"]], "f2_match": [[1, "f2-match"]], "fetch_a": [[1, "fetch-a"]], "fetch_b": [[1, "fetch-b"]], "geo_distance": [[1, "geo-distance"]], "get_floor": [[0, "get-floor"]], "gradient_boosted_trees": [[9, "gradient-boosted-trees"]], "gt_threshold": [[1, "gt-threshold"]], "hits": [[1, "hits"]], "hits2": [[1, "hits2"]], "interaction": [[10, "interaction"]], "jaro_winkler": [[1, "jaro-winkler"]], "jaro_winkler_rate": [[1, "jaro-winkler-rate"]], "jaro_winkler_street": [[1, "jaro-winkler-street"]], "jw_max_a": [[1, "jw-max-a"]], "jw_max_b": [[1, "jw-max-b"]], "length_b": [[1, "length-b"]], "logistic_regression": [[9, "logistic-regression"]], "look_at_addl_var": [[1, "look-at-addl-var"]], "lower_threshold": [[1, "lower-threshold"]], "lowercase_strip": [[0, "lowercase-strip"]], "mapping": [[0, "mapping"]], "max_jaro_winkler": [[1, "max-jaro-winkler"]], "maximum_jaro_winkler": [[1, "maximum-jaro-winkler"]], "multi_jaro_winkler_search": [[1, "multi-jaro-winkler-search"]], "neither_are_null": [[1, "neither-are-null"]], "not_equals": [[1, "not-equals"]], "not_zero_and_not_equals": [[1, "not-zero-and-not-equals"]], "or": [[1, "or"]], "power": [[1, "power"], [4, "power"]], "present_and_equal_categorical_in_universe": [[1, "present-and-equal-categorical-in-universe"]], "present_and_matching_categorical": [[1, "present-and-matching-categorical"]], "present_and_not_equal": [[1, "present-and-not-equal"]], "present_both_years": [[1, "present-both-years"]], "probit": [[9, "probit"]], "random_forest": [[9, "random-forest"]], "rationalize_name_words": [[0, "rationalize-name-words"]], "rel_jaro_winkler": [[1, "rel-jaro-winkler"]], "remove_alternate_names": [[0, "remove-alternate-names"]], "remove_one_letter_names": [[0, "remove-one-letter-names"]], "remove_prefixes": [[0, "remove-prefixes"]], "remove_punctuation": [[0, "remove-punctuation"]], "remove_qmark_hyphen": [[0, "remove-qmark-hyphen"]], "remove_stop_words": [[0, "remove-stop-words"]], "remove_suffixes": [[0, "remove-suffixes"]], "replace_apostrophe": [[0, "replace-apostrophe"]], "second_gen_imm": [[1, "second-gen-imm"]], "soundex": [[4, "soundex"]], "split": [[0, "split"]], "sql_condition": [[1, "sql-condition"], [4, "sql-condition"]], "substring": [[0, "substring"]], "sum": [[1, "sum"]], "threshold": [[1, "threshold"]], "times": [[1, "times"]], "union": [[4, "union"]], "upper_threshold": [[1, "upper-threshold"]], "when_value": [[0, "when-value"]], "xgboost": [[9, "xgboost"]]}, "docnames": ["column_mappings", "comparison_features", "comparisons", "config", "feature_selection_transforms", "index", "installation", "introduction", "link_tasks", "models", "pipeline_features", "running_the_program", "substitutions", "use_examples"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["column_mappings.md", "comparison_features.md", "comparisons.md", "config.md", "feature_selection_transforms.md", "index.rst", "installation.md", "introduction.md", "link_tasks.md", "models.md", "pipeline_features.md", "running_the_program.md", "substitutions.md", "use_examples.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [0, 1, 3, 7, 8, 9, 11, 12], "0": [0, 1, 2, 3, 8, 9, 10, 11, 13], "005": 3, "012": 1, "05": [1, 3, 9], "1": [0, 1, 2, 3, 5, 8, 9, 10, 11, 13], "10": [0, 3, 6, 13], "100": [0, 3, 13], "11": [0, 3, 6, 10], "12": [0, 6], "14": 1, "15": 9, "1867": 3, "1868": 3, "1869": 3, "1870": 3, "1871": 3, "1872": 3, "1873": 3, "1900": [3, 13], "1900_1910_potential_fn": 13, "1900_1910_potential_fp": 13, "1900_1910_training_data_20191023": 3, "1900_1910_training_result": 13, "1910": [1, 3, 13], "1920": 13, "1930": [3, 13], "1940": [3, 13], "1999": [0, 3], "2": [0, 1, 2, 3, 4, 8, 9, 12, 13], "20": 9, "25": 1, "3": [0, 1, 2, 3, 6, 8, 9, 10, 13], "300": 0, "301": 0, "302": 0, "303": 0, "4": [0, 1, 9], "5": [0, 1, 2, 3, 9, 10, 11, 13], "50": [3, 13], "50g": 13, "53": 3, "5g": 11, "6": [0, 3, 9, 10, 13], "65": 3, "7": [0, 1, 3, 9, 13], "75": [3, 9, 13], "79": [2, 3], "7th": 12, "8": [1, 3, 6, 9, 11], "80": 3, "84": [2, 3], "85": [9, 11], "9": 1, "95": 1, "99": [1, 3], "9998": 0, "9999": [0, 3, 10], "A": [0, 1, 2, 3, 4, 10, 11], "AND": [1, 2, 3], "As": [0, 2], "At": [7, 8], "But": [3, 6], "By": [0, 3, 11], "For": [0, 1, 2, 3, 8, 11, 13], "If": [0, 1, 3, 4, 8, 9, 11, 12], "In": [0, 1, 2, 7, 11, 13], "It": [0, 1, 2, 3, 7, 9, 11, 13], "NOT": 1, "OR": [1, 2, 3], "THEN": 1, "The": [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 13], "Then": [0, 6], "There": [1, 3, 8], "These": [0, 1, 2, 3, 7, 8, 9, 10], "To": [0, 1, 6, 8, 9, 11], "Will": 3, "With": [0, 10], "_": [0, 1, 2, 3, 4, 5, 9, 10, 11], "_a": 1, "_bpl": 1, "_namefrst": 1, "_sex": 1, "a304bt": 3, "ab": 1, "abbrevi": [8, 12], "abl": 3, "about": [3, 11, 13], "abov": [1, 2, 6], "absolut": 1, "accept": [1, 3, 13], "access": [11, 13], "accord": 1, "across": 1, "ad": [0, 1, 2, 3, 9], "add": [0, 2, 5], "add_to_a": 3, "addit": [0, 1, 3, 6, 7, 11], "addl": 1, "addl_var": 1, "adjust": 11, "adopt": 0, "advanc": 5, "affect": 4, "after": [1, 2, 3, 5, 9, 11], "ag": [0, 1, 3, 4], "against": [1, 3, 12], "age_2": 3, "age_at_dataset_b": 0, "age_threshold": 1, "aggreg": 5, "ah": 0, "ahead": 8, "akin": 1, "algorithm": [1, 2, 7, 8], "alia": [0, 3, 8], "all": [0, 1, 3, 4, 8, 9, 10, 11], "allow": [1, 3, 8, 13], "along": 1, "alpha": 9, "alphabet": 0, "alphanumer": 3, "also": [0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 13], "altern": [0, 3, 9], "although": 3, "alwai": 2, "among": 1, "amount": 3, "an": [0, 1, 2, 3, 7, 9, 11], "analysi": 13, "analyz": [11, 13], "ani": [1, 3, 4, 6, 9], "anoth": [0, 1, 3, 4, 8, 9], "anyon": 8, "anywher": 12, "apach": 6, "apart": 0, "api": [3, 7, 10], "apostroph": 0, "appear": [0, 1], "appli": [0, 2, 3, 4, 8, 13], "apply_model": 3, "appropri": 3, "ar": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "arbitrari": 0, "area": 3, "aren": 1, "arg": 13, "argument": [1, 11, 13], "around": 3, "arrai": [0, 3, 5, 9, 10], "array_index": 3, "ask": 13, "aspect": [8, 11], "assert": [1, 11], "assum": 3, "attach_vari": 3, "attempt": 3, "attribut": [0, 1, 2, 3, 4, 8, 9, 10, 11, 12], "auto": 9, "automat": [0, 3, 6, 8], "av": 12, "avail": [0, 1, 3, 4, 6, 8, 9, 10, 13], "avenu": [0, 12], "b": [0, 1, 2, 3, 4, 11], "b200": 3, "back": 1, "backup": 1, "base": [1, 3, 8], "basic": 5, "becaus": 2, "been": 7, "befor": [0, 1, 3, 4, 6, 8], "begin": 11, "behind": 0, "being": [1, 8], "belong": 3, "below": [0, 1, 3, 4, 9, 10, 11], "best": [3, 8], "beta": [3, 9], "better": [0, 7], "between": [0, 1, 2, 3, 7, 8, 11, 13], "beyond": 1, "bigint": 3, "bigram": [3, 5], "bin": 9, "binomi": 9, "birth": 1, "birthplac": [0, 3], "birthyr": [0, 3], "birthyr_3": 3, "birthyr_col": 1, "block": [2, 5, 8], "blvd": 0, "boolean": [1, 3, 4, 12, 13], "boost": [5, 9], "born": 1, "borrow_t": 11, "both": [0, 1, 2, 3, 8, 13], "boundari": 1, "bpl": [0, 1, 3], "bpl1": 3, "bpl2": 3, "bpl2_str": 3, "bpl3": 3, "bpl_clean": 3, "bpl_orig": 3, "bpl_root": 0, "bpl_str": 3, "bplmatch": 3, "broken": 7, "btwn": 1, "bucket": [3, 8], "built": 6, "builtin": 1, "byrdifcat": 3, "byrdiff": [1, 3, 13], "c": 1, "c201": 3, "calcul": [1, 13], "call": 0, "can": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "cannot": 3, "care": 2, "cartesian": 1, "case": [0, 1, 2, 3, 4, 7], "cast": 1, "categor": [1, 3, 9, 10], "categori": [0, 9], "caus": 9, "caution": [1, 10], "censu": [0, 8, 13], "census": 7, "chang": [0, 1, 3, 6, 11, 13], "charact": 0, "characterist": [7, 8], "check": [1, 11], "check_val_expr": 1, "checkpoint": 4, "child": [0, 9], "children": 1, "chines": 0, "choic": 11, "chosen": 8, "chosen_model": [3, 9, 13], "circl": 0, "circumst": 1, "class": 11, "classif": [8, 9], "claus": [0, 1], "clean": [0, 7], "clean_birthyr": [0, 3, 4], "clone": 6, "code": [0, 1, 3, 6], "coeffici": [3, 8], "col": [0, 1], "col_to_add": 3, "col_to_join_on": 3, "col_typ": 3, "column": [1, 4, 5, 8, 10, 11, 12, 13], "column_map": [0, 3, 8], "column_nam": [0, 1, 3, 12], "column_to_append": 0, "combin": [1, 2, 3, 4, 8], "come": 1, "command": [3, 7, 11, 13], "comment": 4, "common": [7, 8, 13], "commonli": 9, "comp": 1, "comp_a": [1, 2, 3], "comp_b": [1, 2, 3], "comp_c": 1, "comp_d": 1, "compar": [0, 1, 3, 7, 8], "comparis": 3, "comparison": [5, 8], "comparison_featur": [1, 2, 3, 8], "comparison_typ": [1, 2, 3], "complet": 11, "complex": [3, 4], "comput": [1, 4, 8], "concat": 0, "concaten": [0, 1], "condens": 0, "condense_strip_whitespac": 3, "condit": [0, 1, 2, 3, 4, 5, 8], "conf": [11, 13], "config": [1, 4, 5, 8, 11, 13], "configur": [0, 1, 2, 7, 11, 13], "conjuct": 3, "conjunct": 3, "connect": [2, 3], "consid": [1, 2, 9], "consider": 1, "consol": 11, "constraint": [1, 2], "construct": 8, "contain": [0, 1, 3, 12], "context": 10, "continu": [9, 10, 13], "conveni": 11, "convert": [0, 1, 3], "convert_ints_to_long": 3, "copi": [4, 13], "core": [1, 8, 11, 13], "correspond": [7, 8], "could": [0, 2, 3], "count": [1, 11, 13], "counti": [0, 1], "county_1900_1910_distances_km": 1, "county_a": 1, "county_b": 1, "county_dist": [1, 3, 13], "county_distance_lookup": 1, "county_distance_squar": [1, 3, 13], "county_state_dist": 1, "court": 0, "cover": 10, "coverag": 3, "cpu": 11, "creat": [0, 3, 7, 8, 10, 11, 12, 13], "creation": 3, "crosswalk": 8, "csv": [1, 3, 8, 11, 12, 13], "current": [1, 2, 3, 9, 11], "d": 1, "d425": 3, "data": [0, 1, 5, 7, 8, 11], "databas": 11, "datafram": [8, 11, 13], "dataset": [0, 1, 3, 4, 7, 8, 11, 13], "datasourc": [1, 3, 11], "datasource_a": [3, 8], "datasource_b": [3, 8], "de": 9, "decis": [3, 5, 9, 13], "decisiontreeclassifi": 9, "default": [0, 1, 2, 3, 8, 9, 11], "defin": [1, 3, 5, 8, 9, 10, 11], "definit": [3, 8], "demograph": 8, "depend": [1, 2, 3, 6, 9, 13], "dependent_var": [3, 13], "depth": 9, "deriv": 13, "derived_from": 3, "desc": 11, "describ": [0, 1, 2, 3, 11], "descript": [3, 9, 11], "detail": [0, 3, 11], "detect": 9, "determin": [1, 8], "determinist": [7, 8], "dev": 6, "develop": [6, 7], "df": [3, 11], "dictionari": 11, "diff": 1, "differ": [0, 1, 3, 5, 7, 8], "digit": 0, "dir": 12, "directli": 2, "directori": [6, 11, 13], "disabl": 9, "discard": 9, "discret": 9, "discuss": 3, "dist": 1, "dist_tabl": 1, "distanc": [1, 9], "distance_col": 1, "distance_km": 1, "distances_fil": 1, "distinct": 1, "divid": 0, "divide_by_int": 3, "do": [0, 1, 2, 4, 11, 13], "doc": [9, 10], "document": [1, 3, 9, 11, 13], "doe": [1, 4, 8, 13], "don": [3, 4], "doubl": 10, "down": [0, 7, 13], "drastic": 8, "drop": [0, 3, 11], "drop_al": 11, "drop_all_prc": 11, "drop_all_temp": 11, "drop_data_from_scored_match": 3, "drop_duplicate_a": 3, "drop_duplicate_with_threshold_ratio": [3, 13], "duplic": [3, 9], "durat": 1, "dure": [1, 2, 3, 8], "durmarr": [1, 3], "e": 6, "each": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11], "easiest": [6, 11], "easili": 11, "edit": 6, "effect": [2, 4], "either": [0, 1, 2, 3, 7, 12], "either_0": 1, "either_1": 1, "element": 0, "elig": 8, "els": [1, 3, 4], "else_v": 1, "else_valu": 0, "empti": 1, "enabl": [3, 8, 9, 11], "encod": [3, 4], "end": [0, 1, 3, 4, 12], "ensur": 3, "enter": 11, "entir": [0, 3], "enum_dist": 1, "enumer": 11, "equal": [2, 3, 12], "equal_and_not_null_templ": 1, "equival": 3, "error": [3, 9, 10], "especi": 3, "eta": 9, "etc": 13, "eval": 3, "evalu": [1, 3, 7, 8, 9], "even": 1, "everi": [1, 4], "ex": 1, "exact": [1, 3], "exact_mult": [3, 13], "exampl": [0, 1, 2, 3], "except": [3, 10], "exclud": 1, "excute_command": 11, "execut": 11, "execute_command": 11, "execute_task": 11, "executor": 11, "executor_memori": [11, 13], "exist": [1, 3, 11], "exit": 11, "expand": 3, "expand_length": 3, "expect": 1, "experi": 7, "experiment": [8, 11], "explan": 9, "explicitli": [1, 10], "explod": [2, 3, 8], "exploded_df_a": 11, "exploded_df_b": 11, "explor": [3, 5, 7, 11], "expon": 4, "exponenti": 1, "export": [5, 8, 11], "express": [0, 1, 3], "extend": 1, "extens": 8, "extra": [1, 9], "extract": 3, "f": [1, 11], "f1": 1, "f1_match": 3, "f2": 1, "f2_match": 3, "f_caution": [3, 13], "f_interacted_jw_f": [3, 13], "f_pre": [3, 13], "factori": 11, "fail": 3, "fallback": 1, "fals": [1, 3, 4, 5, 7, 11], "famili": 9, "father_namefrst": 1, "favor": 1, "fbpl": 1, "fbpl_nomatch": 1, "fbplmatch": [3, 13], "featur": [2, 5, 7, 8, 9, 11], "feature_import": [3, 8, 13], "feature_nam": [2, 3], "feature_select": [2, 3, 4, 8], "featuresubsetstrategi": 9, "fed": [3, 8], "femal": [3, 12], "fetch": 1, "fetch_a": 3, "few": 4, "fewer": [1, 9], "fi": 1, "file": [1, 4, 5, 7, 8, 11, 12, 13], "filepath": 11, "fill": 1, "filter": [1, 2, 5, 8, 12], "final": [1, 3, 13], "find": [1, 8, 13], "finish": 11, "first": [0, 1, 3, 6, 8, 11, 12], "first_init_col": 1, "first_nam": 0, "five": 11, "fix": 8, "flag": [1, 2, 3, 10, 11, 13], "flexibl": [1, 2], "float": [1, 3, 9], "floor": 0, "focus": [2, 8], "follow": [0, 1, 2, 7, 11, 12, 13], "foreign": 1, "forest": [5, 9], "form": [1, 3, 8, 12], "format": 0, "four": 1, "framework": 13, "from": [0, 1, 3, 5, 7, 8, 9, 10, 11, 13], "from_icpsrctyi": 1, "from_statefip": 1, "fsoundex": [3, 13], "full": [3, 8, 13], "full_count_1870_1880": 11, "full_count_1900_1910": 13, "fullcount_1870_1880": 11, "function": [0, 1, 3, 7, 11], "further": [7, 13], "gamma": 9, "gbtclassifi": 9, "gen": 1, "gener": [0, 1, 5, 7, 8, 11], "generalizedlinearregress": 9, "geo": 1, "geograph": 1, "get": [0, 1, 2, 3, 11], "get_floor": 3, "get_set": 11, "get_step": 11, "get_tabl": 11, "get_task": 11, "github": 6, "give": [0, 3], "given": [0, 1, 2, 3, 4, 9, 13], "go": [3, 11], "good": 1, "gradient": [5, 9], "greater": [1, 2, 6], "greatest": 1, "group": [3, 8], "gt": 1, "h": 11, "ha": [0, 1, 2, 3, 4, 7, 11, 13], "handl": 11, "harmon": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 13], "haven": 3, "head": 0, "header": [3, 10, 12], "help": [3, 8, 9, 11], "here": [2, 3, 8, 9, 11, 13], "hh": [1, 2], "hh_blocked_match": 11, "hh_col": 3, "hh_comparison": [2, 3, 8], "hh_match": [3, 11], "hh_model_eval_repeat_fn": 11, "hh_model_eval_repeat_fp": 11, "hh_model_eval_training_data": 11, "hh_model_eval_training_featur": 11, "hh_model_eval_training_result": 11, "hh_model_eval_training_vector": 11, "hh_model_explor": 11, "hh_potential_match": [2, 11], "hh_potential_matchs_prep": 11, "hh_predicted_match": 11, "hh_repeat_fn": 13, "hh_repeat_fp": 13, "hh_scored_potential_match": 11, "hh_train": [1, 3, 8, 11, 13], "hh_training_data": 11, "hh_training_data_1900_1910": 3, "hh_training_featur": [11, 13], "hh_training_result": 13, "hidden": 11, "hierarch": [7, 11], "hierarchi": 11, "high": [9, 11], "highest": [1, 3], "highli": [7, 8], "histid": [1, 3, 13], "histid_col": 1, "hit": [3, 11, 13], "hits2": [3, 13], "hlink": [0, 1, 2, 3, 4, 6, 7, 8, 9, 13], "hold": 10, "hot": 3, "household": [0, 2, 5, 7, 9, 11, 13], "how": [1, 3, 8], "howev": [4, 7, 13], "hundr": 0, "hyper": [3, 7, 13], "hyperparamet": [9, 13], "hyphen": 0, "i": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13], "id": [1, 3], "id_column": [1, 3], "ident": 13, "identifi": [3, 7, 13], "if_valu": 0, "ignor": 7, "ii": [0, 3], "iii": [0, 3], "imm": [1, 3, 13], "imm_interacted_bplmatch": 3, "imm_interacted_immyear_caut": [3, 13], "immigr": 1, "immyear_caut": [3, 10], "immyear_diff": [1, 3, 10, 13], "implement": [9, 13], "implicitli": 2, "import": [3, 8, 11, 13], "improv": 8, "includ": [1, 3, 8, 10, 11], "incompar": 1, "increas": [3, 10], "independ": [0, 3], "independent_var": [1, 3, 13], "index": [0, 6], "indic": [1, 13], "individu": [1, 3, 8, 13], "inf": 10, "inform": [0, 1, 3, 11], "ingest": 8, "initi": [0, 1, 11], "input": [0, 1, 3, 4, 7, 8, 11, 12], "input_col": 4, "input_column": [3, 4, 10], "input_table_nam": 11, "inspect": 8, "instal": [5, 9], "instanc": [0, 9], "instead": [1, 3, 4, 6, 8], "instruct": [6, 11], "int": [0, 1, 3, 4, 9], "integ": [0, 1, 3, 10], "integr": [6, 9], "interact": [3, 5, 8, 13], "interfac": 11, "intermedi": 11, "introduct": 5, "introspect": 8, "invalid": [1, 9], "ipum": [0, 7], "ipython": 11, "isn": 13, "istemporari": 11, "item": 0, "iter": 3, "its": [0, 1, 7, 9, 11], "itself": [1, 2], "iv": 3, "jaro": [1, 10], "jaro_winkl": 3, "java": 6, "job": 11, "join": [1, 3, 12], "join_column": [3, 12], "join_valu": [3, 12], "jr": [0, 3], "json": [3, 11], "just": [1, 3, 11, 13], "jw": 1, "jw_col_templ": 1, "jw_f": [1, 3, 13], "jw_m": [3, 10, 13], "jw_max_a": 3, "jw_max_b": 3, "jw_sp": [3, 13], "jw_street": 1, "jw_threshold": 1, "kei": [1, 8, 11], "key_count": 1, "kind": 1, "know": 3, "known": 0, "label": 9, "languag": 7, "last": [0, 1, 8, 10], "latest": 6, "launch": [11, 13], "law": 0, "lead": 0, "learn": [1, 2, 3, 7, 8, 11, 13], "least": [0, 1], "leav": 0, "left": 9, "length": [1, 3, 10], "less": [1, 3], "let": 11, "letter": 0, "level": [1, 5, 11], "leverag": 3, "libomp": 9, "librari": [5, 7, 9], "like": [0, 2, 3, 8, 11], "limit": 3, "line": [7, 11], "link": [0, 1, 3, 5, 7, 9], "link_run": 11, "linkrun": [7, 11], "list": [0, 1, 3, 4, 5, 9, 11, 12], "liter": 3, "ll": 11, "load": 11, "load_conf_fil": 11, "load_config": 11, "loc_a": 1, "loc_a_0": 1, "loc_a_1": 1, "loc_b": 1, "loc_b_0": 1, "loc_b_1": 1, "locat": [1, 3, 11], "log2": 9, "logic": [0, 2, 3], "logist": [5, 9], "logistic_regress": [3, 13], "logisticregress": 9, "long": [3, 12], "longest": 8, "look": [1, 2, 11, 12], "lookup": 1, "lower": [0, 1], "lowercas": 0, "lowercase_strip": 3, "lowest": 0, "lr": 11, "lsoundex": [3, 13], "m": [0, 1], "m_caution": [1, 3, 10, 13], "m_interacted_jw_m": [3, 10, 13], "m_namefrst": 1, "m_pre": [3, 10, 13], "machin": [1, 2, 3, 7, 8, 9, 11, 13], "made": 6, "mai": [0, 2, 3, 7, 8, 9, 11], "main": 11, "mainli": 1, "major": [1, 11], "make": [0, 1, 2, 3, 6, 13], "male": [3, 12], "mani": [2, 8, 11], "manual": 13, "map": [5, 8, 10], "mardurmatch": [1, 3], "mark": 2, "marriag": 1, "match": [0, 1, 2, 5, 7, 11, 12, 13], "matches_df": 11, "matrix": 13, "max": [1, 9, 11], "max_depth": 9, "maxbin": 9, "maxdepth": [3, 9, 13], "maximum": [1, 9], "maximum_jw": 1, "mbpl": 1, "mbplmatch": [3, 13], "mean": [0, 2, 3], "meant": 3, "meet": 1, "member": [1, 8], "memori": 11, "men": 3, "messag": 11, "metadata": 8, "method": [11, 13], "mfbplmatch": 1, "mi": [3, 13], "mi0": 1, "mi1": 1, "mid_init_col": 1, "mid_init_match": 1, "middl": [0, 1], "might": 13, "minimum": [1, 9], "mininstancespernod": 9, "minu": [1, 3], "mismatch": 1, "miss": [1, 11], "ml": [3, 5, 9, 10], "mode": [5, 6, 13], "model": [5, 7, 11], "model_eval_repeat_fn": 11, "model_eval_repeat_fp": 11, "model_eval_training_data": 11, "model_eval_training_featur": 11, "model_eval_training_result": 11, "model_eval_training_vector": 11, "model_explor": [3, 11, 13], "model_paramet": [3, 8, 9, 13], "modifi": 3, "modul": 11, "moment": 8, "momloc": 1, "more": [0, 1, 2, 3, 10, 11, 13], "most": [0, 1, 8, 11], "mother": 10, "mother_birthyr": 1, "mpre": 1, "mr": 0, "much": [1, 8], "mult": 1, "multi": 1, "multipl": [0, 1, 5, 11], "multipli": 1, "must": [0, 1, 2, 3, 4, 9, 10, 12], "my": [11, 13], "my_conf": 11, "my_fil": 3, "myriad": 3, "n": [1, 9, 10], "n_training_iter": [3, 8, 13], "name": [0, 1, 3, 4, 11, 12], "name_col": 1, "name_std": [3, 12], "namefrst": [0, 1, 3], "namefrst_clean": [0, 3], "namefrst_init": 1, "namefrst_jw": [1, 2, 3, 13], "namefrst_mid_init": [0, 1], "namefrst_mid_init_2": 1, "namefrst_rel": 1, "namefrst_related_row": 1, "namefrst_split": [0, 3], "namefrst_std": [3, 12], "namefrst_std_jw": [3, 13], "namefrst_unstd": [1, 3], "namefrst_unstd_bigram": 4, "namelast": [1, 3], "namelast1": 1, "namelast2": 1, "namelast3": 1, "namelast_bigram": 3, "namelast_clean": [1, 3, 4], "namelast_clean_bigram": [3, 4], "namelast_clean_soundex": 4, "namelast_equal_as_int": 1, "namelast_frst_bigram": 4, "namelast_jw": [2, 3, 13], "namelast_jw_max": 1, "namelast_neighbor": 1, "namelast_popular": 1, "namelast_popularity_sum": 1, "nativ": 1, "nbor": [1, 3, 13], "ncount": [3, 4, 13], "ncount2": [3, 4, 13], "nearest": 0, "necessari": [4, 13], "need": [0, 1, 2, 3, 8, 9, 11, 13], "neg": [3, 5, 7], "neighbor": 1, "neighborhood": 1, "neither": 1, "nest": [2, 3, 7], "new": [0, 3, 13], "new_marr": [1, 3], "newli": 3, "niu": 1, "no_first_pad": 4, "node": 9, "non": 0, "nonzero": 1, "nor": 1, "normal": 8, "note": [2, 3], "now": [2, 11], "null": [1, 3, 4], "null_fil": 3, "num": 11, "num_col": 1, "number": [0, 1, 3, 4, 8, 9, 11], "numer": [0, 1], "numtre": [3, 9, 13], "nvl": 1, "object": [1, 3, 7, 11], "often": [2, 4, 11], "onc": [1, 11], "one": [0, 1, 3, 8], "onethird": 9, "onli": [0, 1, 2, 3, 8, 13], "ons": 5, "oper": [0, 1, 2, 3, 9], "opt": [], "option": [0, 1, 3, 4, 7, 8, 9, 11, 13], "or_group": 3, "order": [0, 3, 8], "org": 6, "organ": 11, "original_valu": 3, "oth": [3, 13], "other": [1, 3, 13], "otherwis": [0, 1, 10, 13], "our": 11, "out": [3, 4, 8, 13], "output": [0, 1, 2, 3, 4, 7, 8, 11, 13], "output_col": 4, "output_column": [3, 4, 10], "output_suspicious_td": [3, 13], "output_table_nam": 11, "output_typ": 3, "outsid": 10, "overrid": [0, 3], "override_column_a": [0, 3, 4], "override_column_b": [0, 3, 4], "override_transform": [0, 3], "overview": 5, "own": [1, 3], "p": 3, "packag": [6, 9], "page": [1, 2, 3, 11], "pair": [1, 2, 3, 13], "param": [9, 13], "param_grid": [3, 13], "paramet": [3, 7, 8, 9, 11, 13], "parent": [0, 1, 12], "parent_step_chang": 1, "parenthes": 3, "parquet": [3, 8], "part": [3, 8], "particular": [1, 2, 3], "particularli": 1, "partit": 13, "pass": [1, 8, 9], "path": [1, 3, 11, 12, 13], "pattern": 11, "peopl": [0, 1, 7, 11], "per": [1, 3, 9, 10, 11], "percent": 8, "percentag": 1, "perform": [1, 3, 7, 8, 9, 12], "persist": 11, "person": [0, 1, 7], "pip": [6, 9], "pipelin": 5, "pipeline_featur": [3, 8, 10], "piplin": 3, "placehold": 1, "pleas": [3, 11], "plu": 3, "point": [10, 13], "popul": 8, "posit": [0, 1, 3, 5, 7], "possibl": 3, "post": [3, 8], "potenti": [1, 2, 5, 8], "potential_match": [1, 2, 8, 11], "potential_matches_prep": 11, "potential_matches_univers": [3, 8], "power": 5, "predefin": 1, "predict": [3, 13], "predicted_match": 11, "prediction_col": 3, "preexist": 11, "prefer": 13, "prefix": 0, "prep": 3, "prep_step": 11, "prepar": [7, 8, 11], "prepend": 4, "prepped_df_a": 11, "prepped_df_b": 11, "preprocess": [5, 7, 11, 13], "present": [1, 3, 10], "pretti": 2, "primari": [1, 7], "primarili": [1, 8], "print": 11, "probabilist": [3, 7], "probabl": [0, 3, 9], "probit": [3, 5], "proceed": 12, "process": [2, 3, 7, 11], "produc": [0, 11], "product": [1, 7, 13], "program": [1, 3, 8, 13], "project": 6, "prompt": 11, "provid": [0, 3, 4, 7, 8, 9, 10, 11], "pull": 8, "punctuat": 0, "put": [11, 13], "py": [1, 3], "pypi": 5, "pyspark": [6, 9, 10, 11], "python": [6, 7, 9, 11], "q": [11, 13], "qmark": 0, "qualifi": 3, "queri": [1, 3], "race": [0, 1, 3, 10, 13], "race_interacted_srac": [3, 10, 13], "racematch": 3, "rais": [1, 4, 9], "random": [5, 9], "random_forest": [3, 13], "randomforestclassifi": 9, "rang": [3, 10], "rate": 1, "ratio": [3, 9], "ration": 0, "rationalize_name_word": 3, "raw": [0, 3, 8, 11], "raw_df_a": 11, "raw_df_b": 11, "read": [0, 1, 3, 8, 11], "readm": 3, "real": 7, "reason": 1, "recod": 0, "record": [0, 1, 2, 3, 7, 8], "recurs": [2, 3], "reduc": [4, 8], "refer": 3, "regex": 5, "regex_word_replac": 12, "region": [3, 13], "region_dict": 3, "regionf": 3, "regist": [8, 11], "regress": [5, 9], "regular": 13, "rel": [1, 3, 13], "relat": [0, 1, 3], "relate_a": [3, 10], "relate_col": 1, "relate_div_100": [0, 1, 3], "related_individual_max_jw": 1, "related_individual_row": 1, "relatematch": [1, 3], "relatetyp": [3, 10], "relatetype_interacted_relatematch": 3, "relationship": 7, "relev": 13, "reload": 11, "remain": 8, "remov": 0, "remove_alternate_nam": 3, "remove_qmark_hyphen": 3, "remove_suffix": 3, "repeat_fn": 13, "repeat_fp": 13, "repeatedli": 3, "replac": [0, 1, 2, 5], "replace_apostroph": 3, "replaced_birthyr": [1, 3, 4], "report": [1, 5, 7, 11], "repositori": 6, "repres": [0, 1, 3, 10, 11], "represent": [3, 8], "reproduc": 11, "request": 8, "requir": [0, 1, 2, 3, 4, 5, 8, 10, 11, 12], "research": 7, "reshap": 8, "resourc": 4, "respect": [1, 8], "restrict": 2, "result": [0, 1, 7, 10, 11, 13], "return": [1, 4, 9, 11], "reus": 5, "right": [9, 11], "risk": 1, "road": 0, "robust": 3, "root": 6, "round": [0, 3], "row": [0, 3, 4], "rule": [2, 7, 8], "run": [5, 6, 7, 8, 13], "run_all_step": [3, 11, 13], "run_step": 11, "runtim": 8, "sai": 11, "same": [0, 1, 2, 3, 4, 7, 8, 11], "sampl": 1, "satisfi": [2, 3, 8], "save": [3, 8, 13], "scala": 1, "scale": 3, "scale_data": [3, 13], "scenario": 13, "schema": 3, "score": [1, 3, 8, 10], "score_with_model": [3, 13], "scored_potential_match": 11, "scratch": 3, "script": [3, 7, 11], "search": 1, "second": [0, 1, 3, 12], "secondari": 1, "secondary_distance_col": 1, "secondary_distances_fil": 1, "secondary_key_count": 1, "secondary_loc_a": 1, "secondary_loc_b": 1, "secondary_source_column": 1, "secondary_table_nam": 1, "section": [0, 1, 2, 3, 13], "see": [1, 3, 6, 11, 13], "seen": 1, "select": [0, 1, 5, 7, 11, 13], "separ": 8, "sequenc": 7, "seri": 11, "serialp": 3, "serv": 1, "set": [0, 1, 2, 3, 4, 7, 8, 11, 13], "set_executor_memori": 11, "set_link_task": 11, "set_loc": 11, "set_num_cor": 11, "set_preexisting_t": 11, "set_print_sql": 11, "set_value_column_a": [3, 4], "set_value_column_b": [3, 4], "sever": [1, 7], "sex": [1, 3, 12], "sex_equ": 3, "sex_region_interact": 3, "sex_regionf_interact": 3, "sexmatch": 3, "sgen": [1, 3, 13], "share": 7, "short": 3, "should": [1, 3, 4, 9, 10, 11], "show": 11, "showf": 11, "shut": 13, "sibl": 0, "sign": 1, "signific": 0, "similar": [1, 3], "simpli": [0, 3], "simplifi": 1, "sinc": [0, 3, 9], "singl": [0, 4, 11, 13], "size": 1, "skip": [3, 4, 8], "sm_bpl": 1, "sm_namefrst": 1, "sm_sex": 1, "small": 3, "smaller": 7, "smallest": 3, "sn_bpl": 1, "sn_namefrst": 1, "sn_sex": 1, "so": [0, 1, 2, 3, 6, 13], "some": [0, 1, 2, 3, 4, 7, 8, 11], "someth": 11, "sometim": 3, "somewhat": 2, "soundex": 5, "sourc": [1, 5, 8, 11, 13], "source_column_a": 1, "source_column_b": 1, "sp": 1, "sp_caution": [1, 3, 13], "sp_interacted_jw_sp": [3, 13], "sp_pre": [3, 13], "space": [0, 3, 4, 12], "span": 13, "spark": [1, 3, 4, 6, 9, 10, 11, 13], "sparkfactori": 11, "sparksess": 11, "sparkxgbclassifi": 9, "special": 1, "specif": [1, 3, 11], "specifi": [0, 1, 3, 7, 8, 10, 11, 12], "split": [3, 4, 8, 9, 10, 13], "split_by_id_a": [3, 13], "sploc": 1, "spous": 0, "spouse_birthyr": 1, "spouse_bpl": 1, "sql": [0, 1, 2, 3, 4, 5, 8, 11], "sql_condit": 3, "sqrt": 9, "squar": 1, "sr": [0, 3], "srace": [3, 10, 13], "stage": 3, "standard": [0, 1, 12], "start": [0, 12], "state": [1, 7], "state_1900_1910_distances_km": 1, "state_dist": 1, "state_distance_lookup": 1, "statecode1": 1, "statecode2": 1, "statefip": [0, 1, 3], "statefip_h": 3, "step": [0, 1, 3, 7], "stepmom": 1, "still": 12, "stop": [0, 9], "street": [0, 1], "street_jw": [1, 3, 13], "street_unstd": 12, "strictli": 10, "string": [0, 1, 3, 4, 8, 9, 11, 12], "strip": [0, 8], "structur": [2, 3, 7], "sub": [1, 2, 3], "subhead": 12, "subset": [3, 12], "substitut": [5, 8], "substitution_column": [3, 8, 12], "substitution_fil": [3, 12], "substitutions_street_abbrev": 12, "subtract": 1, "suffix": 0, "suppli": 12, "support": [0, 2, 3, 8, 9], "suppos": [0, 2, 3], "sure": [2, 3, 6, 11], "surnam": 1, "surround": 0, "suspect": [1, 7], "swap": 12, "syntax": 2, "system": [6, 9], "t": [0, 1, 3, 4, 13], "tabl": [1, 2, 3, 5, 8, 11, 13], "table_nam": 1, "tablenam": 11, "tailor": 7, "take": [0, 1, 2, 3, 4, 8, 11], "taken": [0, 1], "task": [2, 3, 5, 7, 9, 13], "task_nam": 11, "tell": [1, 3, 4], "templat": 1, "ten": 1, "term": 3, "test": [3, 8, 13], "text": 11, "than": [0, 1, 2, 3, 9], "thei": [0, 1, 2, 3, 8, 11], "them": [0, 1, 3, 8], "thi": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13], "thing": 3, "those": [1, 3], "though": 1, "thought": 8, "three": [2, 3, 8], "threshold": [2, 3, 9, 13], "threshold_expr": [2, 3], "threshold_ratio": [3, 9, 13], "through": [6, 8, 10, 11], "thu": 1, "time": [0, 3, 8, 11], "to_icpsrctyi": 1, "to_statefip": 1, "togeth": [0, 1, 3], "toml": [3, 7, 11], "tool": [6, 7], "top": [1, 5], "topic": 11, "total": 8, "train": [1, 5, 7, 9, 11], "training_data": [3, 11], "training_data_1900_1910": 13, "training_data_1900_1910_hlink_featur": 13, "training_data_subset": 3, "training_featur": [11, 13], "training_result": 13, "transform": [1, 3, 5, 7, 8], "transformer_typ": [3, 10], "treat": [3, 10], "tree": [5, 9], "true": [1, 3, 4, 8, 10, 12, 13], "try": [3, 9], "tune": [7, 13], "tutori": [3, 11], "two": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13], "type": [0, 3, 4, 5, 8, 9, 11, 12, 13], "typic": [3, 8], "u": 0, "ugli": 2, "under": [1, 3], "understand": 7, "union": 5, "uniqu": 3, "unit": 7, "univers": [1, 5, 8], "unknown": 1, "unlik": 2, "unrel": 1, "unstabl": [9, 11], "up": [1, 3, 11, 12], "updat": 13, "upper": 1, "upper_threshold": 3, "uppercas": 0, "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13], "us1900": 3, "us1900m_usa": 3, "us1910": 3, "us1910m_usa": 3, "usag": [3, 4, 5, 11], "use_potential_matches_featur": 13, "use_training_data_featur": [3, 8, 13], "user": [1, 11], "usual": [0, 8, 13], "util": 4, "v": 3, "valu": [0, 1, 2, 3, 4, 9, 10, 11, 12], "var": [1, 3], "vari": [0, 3, 4], "variabl": [0, 1, 3, 13], "variant": 12, "varieti": 8, "ve": 13, "vector": [3, 10], "verbos": 2, "veri": [4, 8, 13], "version": [0, 6, 9, 13], "vi": 3, "via": [6, 7], "view": 9, "vii": 3, "viii": 3, "volumn": 1, "wa": [1, 13], "wai": [1, 2, 3, 6, 11], "want": [0, 1, 3, 11, 13], "washington": 3, "we": [0, 1, 11, 13], "well": 3, "were": 3, "weren": 13, "what": [1, 3, 11, 13], "when": [0, 1, 2, 3, 4, 8, 13], "where": [1, 8, 11, 13], "whether": [1, 3, 12], "which": [0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 13], "white": 0, "whitespac": [0, 8], "who": 1, "whole": [0, 7], "whose": 1, "width": 8, "winkler": [1, 10], "within": [1, 3, 7, 11, 12], "without": 9, "word": [0, 5], "work": [0, 1, 3, 6, 8, 11, 13], "workflow": [4, 5], "world": 7, "would": [1, 2, 3, 13], "write": [1, 11, 13], "written": [3, 7], "x": [3, 10, 11], "x_crosswalk": 11, "x_hh_tfam": 11, "x_hh_tfam_2a": 11, "x_hh_tfam_2b": 11, "x_load": 11, "x_parquet_from_csv": 11, "x_persist": 11, "x_sql": 11, "x_sqlf": 11, "x_summari": 11, "x_tab": 11, "x_tfam": 11, "x_tfam_raw": 11, "x_union": 11, "xgboost": 5, "y": 10, "year": [0, 1, 3, 4, 5], "year_b": 1, "yet": 11, "you": [0, 1, 2, 3, 6, 9, 11, 12, 13], "your": [2, 3, 4, 6, 8, 9, 11, 13], "yrimmig": 1, "zero": 1}, "titles": ["Column Mappings", "Comparison Features", "Comparisons", "Configuration", "Feature Selection Transforms", "Welcome to hlink\u2019s documentation!", "Installation", "Introduction", "Link Tasks", "Models", "Pipeline generated features", "Running hlink", "Substitutions", "Advanced Workflow Examples"], "titleterms": {"": 5, "1": 12, "abs_diff": 1, "add": 1, "add_to_a": 0, "advanc": [0, 3, 13], "after": 13, "aggreg": 1, "alia": 1, "all_equ": 1, "any_equ": 1, "api": 5, "arrai": 4, "array_index": 0, "b_minus_a": 1, "basic": [0, 3], "bigram": 4, "block": 3, "btwn_threshold": 1, "bucket": 10, "caution_comp_3": 1, "caution_comp_3_012": 1, "caution_comp_4": 1, "caution_comp_4_012": 1, "column": [0, 3], "comparison": [1, 2, 3], "concat_to_a": 0, "concat_to_b": 0, "concat_two_col": 0, "condense_strip_whitespac": 0, "config": 3, "configur": [3, 5, 8], "data": [3, 12, 13], "decision_tre": 9, "defin": 2, "differ": 13, "divide_by_int": 0, "document": 5, "either_are_0": 1, "either_are_1": 1, "equal": 1, "equals_as_int": 1, "exact_mult": 1, "exampl": [11, 13], "explor": [8, 13], "export": 13, "extra_children": 1, "f1_match": 1, "f2_match": 1, "fals": 13, "featur": [1, 3, 4, 10, 13], "fetch_a": 1, "fetch_b": 1, "file": 3, "filter": 3, "fn": 13, "fp": 13, "from": 6, "gener": [3, 10, 13], "geo_dist": 1, "get_floor": 0, "gradient_boosted_tre": 9, "gt_threshold": 1, "hit": 1, "hits2": 1, "hlink": [5, 11], "household": [1, 3, 8], "instal": 6, "interact": [10, 11], "introduct": 7, "jaro_winkl": 1, "jaro_winkler_r": 1, "jaro_winkler_street": 1, "jw_max_a": 1, "jw_max_b": 1, "length_b": 1, "level": 3, "librari": 11, "link": [8, 11, 13], "list": 13, "logistic_regress": 9, "look_at_addl_var": 1, "lower_threshold": 1, "lowercase_strip": 0, "map": [0, 3], "match": [3, 8], "max_jaro_winkl": 1, "maximum_jaro_winkl": 1, "ml": 13, "mode": 11, "model": [3, 8, 9, 13], "multi_jaro_winkler_search": 1, "multipl": [2, 3], "neg": 13, "neither_are_nul": 1, "not_equ": 1, "not_zero_and_not_equ": 1, "ons": 1, "overview": [2, 7, 8], "pipelin": [3, 10], "posit": 13, "potenti": [3, 13], "power": [1, 4], "preprocess": 8, "present_and_equal_categorical_in_univers": 1, "present_and_matching_categor": 1, "present_and_not_equ": 1, "present_both_year": 1, "probit": 9, "program": 11, "pypi": 6, "random_forest": 9, "rationalize_name_word": 0, "regex": 12, "rel_jaro_winkl": 1, "relat": 8, "remove_alternate_nam": 0, "remove_one_letter_nam": 0, "remove_prefix": 0, "remove_punctu": 0, "remove_qmark_hyphen": 0, "remove_stop_word": 0, "remove_suffix": 0, "replac": 12, "replace_apostroph": 0, "report": 8, "requir": 6, "reus": 13, "run": 11, "second_gen_imm": 1, "section": 8, "select": [3, 4], "singl": 3, "soundex": 4, "sourc": [3, 6], "split": 0, "sql_condit": [1, 4], "start": 11, "step": [8, 11], "substitut": [3, 12], "substr": 0, "sum": 1, "tabl": 12, "task": [8, 11], "threshold": 1, "time": 1, "top": 3, "train": [3, 8, 13], "transform": [0, 4, 10], "type": [1, 2, 10], "union": 4, "univers": 3, "upper_threshold": 1, "us": 11, "usag": 0, "welcom": 5, "when_valu": 0, "word": 12, "workflow": [11, 13], "xgboost": 9, "year": 13}}) \ No newline at end of file diff --git a/docs/substitutions.html b/docs/substitutions.html index 2d12001..b3739cc 100644 --- a/docs/substitutions.html +++ b/docs/substitutions.html @@ -7,7 +7,8 @@ Substitutions — hlink 3.7.0 documentation - + + @@ -113,7 +114,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -173,7 +173,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 | Advanced Workflow Examples — hlink 3.7.0 documentation - + + @@ -177,7 +178,16 @@

    hlink

    -

    Navigation

    + + +

    Navigation

    - - @@ -237,7 +237,7 @@

    Quick search

    | Powered by Sphinx 8.1.3 - & Alabaster 0.7.16 + & Alabaster 1.0.0 |